from PIL import Image
from pytesseract import pytesseract
import os
from configparser import ConfigParser
import shutil


def scan_document_folder():
    """Run OCR over every file below the configured input folder.

    Settings are read from the ``[ocr]`` section of ``ocr_config.ini`` in the
    current working directory:

    * ``path_to_tesseract`` -- path of the Tesseract executable.
    * ``path_to_input``     -- folder that is walked (recursively) for files.
    * ``path_to_output``    -- output folder, resolved relative to the
      directory containing this script.
    * ``language``          -- optional; passed to Tesseract as ``lang``.
      When the key is absent, ``None`` is passed and pytesseract falls back
      to its default language.

    For each input file ``<name>.<ext>`` a folder ``<path_to_output>/<name>/``
    is created containing a copy of the original file, ``<name>.txt`` with the
    extracted text and ``<name>.pdf`` with the image saved as PDF.

    Raises:
        KeyError: if the ``[ocr]`` section or a required key is missing.
        OSError: if a file cannot be opened as an image or an output file
            cannot be written.
    """
    # Read config, set up Tesseract.
    ocr_config = ConfigParser()
    ocr_config.read("ocr_config.ini")
    ocr_settings = ocr_config['ocr']
    pytesseract.tesseract_cmd = ocr_settings["path_to_tesseract"]

    # Addresses the former TODO ("umlauts are not recognised -> switch the
    # language model / make it configurable"): an optional ``language`` key
    # (e.g. ``deu``) selects the Tesseract language model.
    # NOTE(review): the key name ``language`` is new -- confirm it fits the
    # existing ocr_config.ini conventions.
    ocr_language = ocr_settings.get("language")

    # The output root does not change per file, so compute it once.
    output_root = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        ocr_settings["path_to_output"],
    )

    # os.walk descends into subfolders; ``root`` is the folder that actually
    # contains the current batch of file names.
    for root, _dirs, file_names in os.walk(ocr_settings["path_to_input"]):
        for file_name in file_names:
            # Join with ``root`` (not the top-level input path) so that files
            # located in subfolders can be opened as well.
            source_path = os.path.join(root, file_name)
            stem, _ext = os.path.splitext(file_name)
            target_dir = os.path.join(output_root, stem)

            # The context manager closes the image file handle once the text
            # has been extracted and the PDF has been written.
            with Image.open(source_path) as img:
                # Extract text from image.
                text = pytesseract.image_to_string(img, lang=ocr_language)

                # Prepare target folder and files.
                os.makedirs(target_dir, exist_ok=True)
                print(f"Creating files for : {stem}")

                # Keep a copy of the original next to the derived files.
                shutil.copy2(source_path, target_dir)

                # Explicit UTF-8 so non-ASCII OCR output (e.g. umlauts) is
                # written independently of the platform's default encoding.
                extracted_file = os.path.join(target_dir, stem + ".txt")
                with open(extracted_file, "w", encoding="utf-8") as text_file:
                    text_file.write(text)

                pdf_file = os.path.join(target_dir, stem + ".pdf")
                img.save(pdf_file, "PDF", resolution=100.0, save_all=True)


if __name__ == "__main__":
    # NOTE(review): the visible source ends at the guard line above; calling
    # scan_document_folder() is the presumed body -- confirm against the full
    # file.
    scan_document_folder()