Add target folder creation, copy of the original, extracted text file and PDF output

1 year ago · a430258750
parent 33787cbe4e
commit a430258750
1 changed file with 22 additions and 5 deletions
--- a/ocr_scan.py
+++ b/ocr_scan.py
@@ -2,27 +2,44 @@ from PIL import Image
 from pytesseract import pytesseract
 import os
 from configparser import ConfigParser
+import shutil


 def scan_document_folder():
+    # Read Config, Setup
    ocr_config = ConfigParser()
    ocr_config.read("ocr_config.ini")
-    
    ocr_settings = ocr_config['ocr']
    path_to_tesseract = ocr_settings["path_to_tesseract"]
+    # TODO: Check why umlauts are not recognized here -> switch the language model / make it configurable
    pytesseract.tesseract_cmd = path_to_tesseract
-    
+
    #Get the file names in the directory
    for root, dirs, file_names in os.walk(ocr_settings["path_to_input"]):
        #Iterate over each file name in the folder
        for file_name in file_names:
            #Open image with PIL
-            img = Image.open(ocr_settings["path_to_input"] + file_name)
-
+            source_path = os.path.join(ocr_settings["path_to_input"], file_name)
+            img = Image.open(source_path)
            #Extract text from image
            text = pytesseract.image_to_string(img)

-            print(text)
+            # Prep target folder + files
+            file,file_ext = os.path.splitext(file_name)
+            target_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file,"")            
+            os.makedirs(os.path.dirname(target_path), exist_ok=True)            
+            print(f"Creating files for : {file}")
+            
+            original_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file,"")
+            shutil.copy2(source_path,original_file)
+            
+            extracted_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file, file + ".txt")
+            #print(text)
+            with open(extracted_file, "w") as text_file:
+                text_file.write(text)
+            
+            pdf_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file, file + ".pdf")
+            img.save(pdf_file, "PDF" ,resolution=100.0, save_all=True)


 if __name__ == "__main__":