From e6c7549bff687383478ee9b2f3fe5d76c564fe49 Mon Sep 17 00:00:00 2001
From: dev_alex
Date: Sun, 30 Jul 2023 22:31:24 +0200
Subject: [PATCH] Finalize Script + Add export to mount fn

---
 ocr_scan.py | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/ocr_scan.py b/ocr_scan.py
index 62b51e5..ec45b6a 100644
--- a/ocr_scan.py
+++ b/ocr_scan.py
@@ -5,13 +5,14 @@ from configparser import ConfigParser
 import shutil
 
 
-def scan_document_folder():
+def scan_document_folder() -> None:
+    """Scan the input folder for OCR analysis and processing of the images.
+    """
     # Read Config, Setup
     ocr_config = ConfigParser()
     ocr_config.read("ocr_config.ini")
     ocr_settings = ocr_config['ocr']
-    path_to_tesseract = ocr_settings["path_to_tesseract"]
-    # TODO Checken warum hier keine Umlaute erkannt werden -> Sprachmodell wechseln /config
+    path_to_tesseract = ocr_settings["path_to_tesseract"]
     pytesseract.tesseract_cmd = path_to_tesseract
 
     #Get the file names in the directory
@@ -32,18 +33,42 @@ def scan_document_folder():
         target_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file,"")
         os.makedirs(os.path.dirname(target_path), exist_ok=True)
         print(f"Creating files for : {file}")
-        
+
         original_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file,"")
         shutil.copy2(source_path,original_file)
-        
+        os.remove(source_path)
+
         extracted_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file, file + ".txt")
-        #print(text)
         with open(extracted_file, "w") as text_file:
             text_file.write(text)
-        
+
         pdf_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file, file + ".pdf")
         img.save(pdf_file, "PDF" ,resolution=100.0, save_all=True)
 
 
+def export_to_mount(symlinks=False, ignore=None) -> None:
+    """Export every entry of the OCR output folder to an external folder (e.g. a NAS mount).
+
+    Args:
+        symlinks (bool, optional): Forwarded to shutil.copytree; whether symlinks are copied as symlinks. Defaults to False.
+        ignore (callable, optional): Forwarded to shutil.copytree; callable selecting entries to skip. Defaults to None.
+    """
+    # Read Config, Setup
+    ocr_config = ConfigParser()
+    ocr_config.read("ocr_config.ini")
+    ocr_settings = ocr_config['ocr']
+
+    src = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"])
+    mount_path = ocr_settings["path_to_mount"]
+    for item in os.listdir(src):
+        s = os.path.join(src, item)
+        d = os.path.join(mount_path, item)
+        if os.path.isdir(s):
+            shutil.copytree(s, d, symlinks=symlinks, ignore=ignore, dirs_exist_ok=True)  # merge into folders left by an earlier export instead of raising FileExistsError
+        else:
+            shutil.copy2(s, d)
+
+
 if __name__ == "__main__":
-    scan_document_folder()
\ No newline at end of file
+    scan_document_folder()
+    export_to_mount()
\ No newline at end of file