From 0807e31144e7cccaf17b2557b0156c03b200bdf9 Mon Sep 17 00:00:00 2001
From: Cedric Gatay <c.gatay@code-troopers.com>
Date: Thu, 8 Dec 2022 08:33:47 +0100
Subject: [PATCH] feat(tnt): highlight tracks / levels / duration

Add highlighting to PDFs for the TNT CFP deliberations.

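All steps are run from `handling-pdf-files/highlight-redact-text/`:

* create a venv and activate it: `python -m venv . && source bin/activate`
* install mupdf: `brew install swig mupdf`
* install the requirements: `pip3 install -r requirements.txt`
* save the PDF to annotate as `input.pdf` in that directory
* run the bash script to highlight: `bash highlight.sh` (the result is written to `output.pdf`)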
---
 .gitignore                                    |  4 +++
 .../highlight-redact-text/highlight.sh        | 19 +++++++++++
 .../highlight-redact-text/pdf_highlighter.py  | 33 ++++++++++---------
 3 files changed, 41 insertions(+), 15 deletions(-)
 create mode 100644 handling-pdf-files/highlight-redact-text/highlight.sh
 mode change 100644 => 100755 handling-pdf-files/highlight-redact-text/pdf_highlighter.py

diff --git a/.gitignore b/.gitignore
index e52f2952..cfd43aef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,6 @@
 
 machine-learning/stock-prediction/.ipynb_checkpoints/stock_prediction-checkpoint.ipynb
+handling-pdf-files/highlight-redact-text/bin/*
+handling-pdf-files/highlight-redact-text/lib/*
+handling-pdf-files/highlight-redact-text/*.pdf
+.DS_Store
diff --git a/handling-pdf-files/highlight-redact-text/highlight.sh b/handling-pdf-files/highlight-redact-text/highlight.sh
new file mode 100644
index 00000000..0bcadb3b
--- /dev/null
+++ b/handling-pdf-files/highlight-redact-text/highlight.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+set -e  # every pass re-reads output.pdf: stop at the first failure instead of annotating a stale file
+python pdf_highlighter.py -i input.pdf -s 'Alien' -c CYAN -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Backend, Cloud, Big Data' -c GREEN -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Design, UI, UX' -c PINK -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Front web' -c RED -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Mobile, Internet des objets' -c RED -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Conception, architecture' -c ORANGE -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'age, pratiques de développ' -c GRAY -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Humain & Tech' -c PURPLE -o output.pdf
+# Session formats (with their duration) and the 'Speaker débutant' tag
+python pdf_highlighter.py -i output.pdf -s 'Conférence \(50min\)' -c POWDERBLUE -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Hands-on \(120min\)' -c CORAL -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Speaker débutant' -c AQUAMARINE -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Quickie \(15min\)' -c SALMON -o output.pdf
+# Levels
+python pdf_highlighter.py -i output.pdf -s 'Beginner' -c GREEN -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Intermediate' -c ORANGE -o output.pdf
+python pdf_highlighter.py -i output.pdf -s 'Advanced' -c MAGENTA -o output.pdf
diff --git a/handling-pdf-files/highlight-redact-text/pdf_highlighter.py b/handling-pdf-files/highlight-redact-text/pdf_highlighter.py
old mode 100644
new mode 100755
index cf46246f..625088c9
--- a/handling-pdf-files/highlight-redact-text/pdf_highlighter.py
+++ b/handling-pdf-files/highlight-redact-text/pdf_highlighter.py
@@ -49,7 +49,7 @@ def redact_matching_data(page, matched_values):
     # Loop throughout matching values
     for val in matched_values:
         matches_found += 1
-        matching_val_area = page.searchFor(val)
+        matching_val_area = page.search_for(val)
         # Redact matching values
         [page.addRedactAnnot(area, text=" ", fill=(0, 0, 0))
          for area in matching_val_area]
@@ -66,7 +66,7 @@ def frame_matching_data(page, matched_values):
     # Loop throughout matching values
     for val in matched_values:
         matches_found += 1
-        matching_val_area = page.searchFor(val)
+        matching_val_area = page.search_for(val)
         for area in matching_val_area:
             if isinstance(area, fitz.fitz.Rect):
                 # Draw a rectangle around matched values
@@ -79,7 +79,7 @@ def frame_matching_data(page, matched_values):
     return matches_found
 
 
-def highlight_matching_data(page, matched_values, type):
+def highlight_matching_data(page, matched_values, type, colors):
     """
     Highlight matching values
     """
@@ -87,11 +87,11 @@ def highlight_matching_data(page, matched_values, type):
     # Loop throughout matching values
     for val in matched_values:
         matches_found += 1
-        matching_val_area = page.searchFor(val)
+        matching_val_area = page.search_for(val)
         # print("matching_val_area",matching_val_area)
         highlight = None
         if type == 'Highlight':
-            highlight = page.addHighlightAnnot(matching_val_area)
+            highlight = page.add_highlight_annot(matching_val_area)
         elif type == 'Squiggly':
             highlight = page.addSquigglyAnnot(matching_val_area)
         elif type == 'Underline':
@@ -101,14 +101,14 @@ def highlight_matching_data(page, matched_values, type):
         else:
             highlight = page.addHighlightAnnot(matching_val_area)
         # To change the highlight colar
-        # highlight.setColors({"stroke":(0,0,1),"fill":(0.75,0.8,0.95) })
-        # highlight.setColors(stroke = fitz.utils.getColor('white'), fill = fitz.utils.getColor('red'))
-        # highlight.setColors(colors= fitz.utils.getColor('red'))
+        # highlight.set_colors({"stroke": fitz.utils.getColor(colors),"fill": fitz.utils.getColor(colors) })
+        # highlight.set_colors({"stroke":fitz.utils.getColor('red'), "fill": fitz.utils.getColor('black')}) #, fill = fitz.utils.getColor('red'))
+        highlight.set_colors(stroke= fitz.utils.getColor(colors))
         highlight.update()
     return matches_found
 
 
-def process_data(input_file: str, output_file: str, search_str: str, pages: Tuple = None, action: str = 'Highlight'):
+def process_data(input_file: str, output_file: str, search_str: str, pages: Tuple = None, action: str = 'Highlight', colors: str = "yellow"):
     """
     Process the pages of the PDF File
     """
@@ -118,7 +118,7 @@ def process_data(input_file: str, output_file: str, search_str: str, pages: Tupl
     output_buffer = BytesIO()
     total_matches = 0
     # Iterate through pages
-    for pg in range(pdfDoc.pageCount):
+    for pg in range(pdfDoc.page_count):
         # If required for specific pages
         if pages:
             if str(pg) not in pages:
@@ -127,7 +127,7 @@ def process_data(input_file: str, output_file: str, search_str: str, pages: Tupl
         page = pdfDoc[pg]
         # Get Matching Data
         # Split page by lines
-        page_lines = page.getText("text").split('\n')
+        page_lines = page.get_text("text").split('\n')
         matched_values = search_for_text(page_lines, search_str)
         if matched_values:
             if action == 'Redact':
@@ -136,10 +136,10 @@ def process_data(input_file: str, output_file: str, search_str: str, pages: Tupl
                 matches_found = frame_matching_data(page, matched_values)
             elif action in ('Highlight', 'Squiggly', 'Underline', 'Strikeout'):
                 matches_found = highlight_matching_data(
-                    page, matched_values, action)
+                    page, matched_values, action, colors)
             else:
                 matches_found = highlight_matching_data(
-                    page, matched_values, 'Highlight')
+                    page, matched_values, 'Highlight', colors)
             total_matches += matches_found
     print(f"{total_matches} Match(es) Found of Search String {search_str} In Input File: {input_file}")
     # Save to output
@@ -195,13 +195,14 @@ def process_file(**kwargs):
     pages = kwargs.get('pages')
     # Redact, Frame, Highlight, Squiggly, Underline, Strikeout, Remove
     action = kwargs.get('action')
+    colors = kwargs.get('colors')
     if action == "Remove":
         # Remove the Highlights except Redactions
         remove_highlght(input_file=input_file,
                         output_file=output_file, pages=pages)
     else:
         process_data(input_file=input_file, output_file=output_file,
-                     search_str=search_str, pages=pages, action=action)
+                     search_str=search_str, pages=pages, action=action, colors=colors)
 
 
 def process_folder(**kwargs):
@@ -256,6 +257,7 @@ def parse_args():
                         default='Highlight', help="Choose whether to Redact or to Frame or to Highlight or to Squiggly or to Underline or to Strikeout or to Remove")
     parser.add_argument('-p', '--pages', dest='pages', type=tuple,
                         help="Enter the pages to consider e.g.: [2,4]")
+    parser.add_argument('-c', '--colors', dest='colors',                         help="Enter the colors to use")
     action = parser.parse_known_args()[0].action
     if action != 'Remove':
         parser.add_argument('-s', '--search_str', dest='search_str'                            # lambda x: os.path.has_valid_dir_syntax(x)
@@ -286,7 +288,8 @@ def parse_args():
         process_file(
             input_file=args['input_path'], output_file=args['output_file'], 
             search_str=args['search_str'] if 'search_str' in (args.keys()) else None, 
-            pages=args['pages'], action=args['action']
+            pages=args['pages'], action=args['action'],
+            colors=args['colors']
         )
     # If Folder Path
     elif os.path.isdir(args['input_path']):