import os
import shutil
from configparser import ConfigParser

from PIL import Image
from pytesseract import pytesseract


def _load_ocr_settings():
    """Return the ``[ocr]`` section of ``ocr_config.ini``.

    The file name is relative, so it is looked up in the current working
    directory. ``ConfigParser.read`` silently skips a missing file, in which
    case the section lookup raises ``KeyError``.
    """
    ocr_config = ConfigParser()
    ocr_config.read("ocr_config.ini")
    return ocr_config["ocr"]


def _output_root(ocr_settings) -> str:
    """Return ``path_to_output`` joined onto the directory of this script."""
    script_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(script_dir, ocr_settings["path_to_output"])


def scan_document_folder() -> None:
    """Run OCR on every file in the input folder and file the results.

    For each file found while walking ``path_to_input`` a folder named after
    the file (without its extension) is created below the output folder and
    filled with:

    * ``<name>.txt`` - the text recognised by Tesseract,
    * ``<name>.pdf`` - the image saved as PDF,
    * the original file, which is then removed from the input folder.

    The original is only moved once the text and PDF files were written, so a
    failure in between leaves the source in the input folder.
    """
    ocr_settings = _load_ocr_settings()
    pytesseract.tesseract_cmd = ocr_settings["path_to_tesseract"]
    output_root = _output_root(ocr_settings)

    # Tesseract options are the same for every file, so build them once.
    language = "deu"
    pageseg_mode = "3"  # automatic page segmentation
    options = f"-l {language} --psm {pageseg_mode}"

    for root, _, file_names in os.walk(ocr_settings["path_to_input"]):
        for file_name in file_names:
            # Join onto ``root`` (not the top-level input path) so that files
            # inside subfolders resolve to their real location.
            source_path = os.path.join(root, file_name)
            file, _ = os.path.splitext(file_name)
            target_dir = os.path.join(output_root, file)

            # Keep the image open only while it is needed and write the PDF
            # before the source file is deleted: Pillow reads image data
            # lazily from the underlying file.
            with Image.open(source_path) as img:
                text = pytesseract.image_to_string(img, config=options)

                os.makedirs(target_dir, exist_ok=True)
                print(f"Creating files for : {file}")

                extracted_file = os.path.join(target_dir, file + ".txt")
                # Explicit UTF-8 so umlauts do not depend on the platform's
                # default encoding.
                with open(extracted_file, "w", encoding="utf-8") as text_file:
                    text_file.write(text)

                pdf_file = os.path.join(target_dir, file + ".pdf")
                img.save(pdf_file, "PDF", resolution=100.0, save_all=True)

            # Move the original next to its derived files.
            shutil.copy2(source_path, target_dir)
            os.remove(source_path)


def export_to_mount(symlinks=False, ignore=None) -> None:
    """Move the content of the output folder to an external folder (e.g. a NAS mount).

    Every entry of the output folder is copied to ``path_to_mount`` and then
    deleted from the output folder.

    Args:
        symlinks (bool, optional): Passed through to ``shutil.copytree`` as
            its ``symlinks`` argument. Defaults to False.
        ignore (optional): Passed through to ``shutil.copytree`` as its
            ``ignore`` argument. Defaults to None.
    """
    ocr_settings = _load_ocr_settings()
    src = _output_root(ocr_settings)
    mount_path = ocr_settings["path_to_mount"]

    for item in os.listdir(src):
        source_entry = os.path.join(src, item)
        target_entry = os.path.join(mount_path, item)
        if os.path.isdir(source_entry):
            shutil.copytree(source_entry, target_entry, symlinks, ignore)
            shutil.rmtree(source_entry)
        else:
            shutil.copy2(source_entry, target_entry)
            # shutil.rmtree only accepts directories; plain files need
            # os.remove.
            os.remove(source_entry)


def match_tags():
    """Placeholder: rename documents based on tags from the config.

    Intended behaviour: depending on which words occur in a document, the
    document is renamed.
    """
    # TODO: implement when needed.
    pass


if __name__ == "__main__":
    scan_document_folder()
    export_to_mount()