# ocr_document_scanner/ocr_scan.py

from PIL import Image
from pytesseract import pytesseract
import os
from configparser import ConfigParser
import shutil


def scan_document_folder():
    """Run OCR on every file found under the configured input folder.

    Settings come from the ``[ocr]`` section of ``ocr_config.ini`` (looked up
    relative to the current working directory):

    * ``path_to_tesseract`` -- path of the Tesseract executable.
    * ``path_to_input`` -- folder that is walked recursively for input files.
    * ``path_to_output`` -- output folder, joined onto this module's directory.
    * ``language`` (optional) -- Tesseract language code(s), e.g. ``deu`` or
      ``deu+eng``. When missing or empty, Tesseract's default language is used.

    For each input file ``<name>.<ext>`` a folder ``<path_to_output>/<name>/``
    is created and filled with a copy of the original file, the recognized
    text as ``<name>.txt`` (UTF-8) and a PDF rendering as ``<name>.pdf``.

    Exceptions raised while opening, recognizing or saving a file are not
    caught, so the first failing file aborts the run.

    Raises:
        KeyError: If the ``[ocr]`` section or one of the required options is
            missing from the configuration.
    """
    # Read config, set up Tesseract.
    ocr_config = ConfigParser()
    ocr_config.read("ocr_config.ini")
    ocr_settings = ocr_config["ocr"]
    pytesseract.tesseract_cmd = ocr_settings["path_to_tesseract"]

    # Umlauts are not recognized with Tesseract's default language model, so
    # the model can be selected through the optional ``language`` setting.
    # An empty value is treated like a missing one.
    language = ocr_settings.get("language", fallback=None) or None

    # Loop-invariant locations.
    input_root = ocr_settings["path_to_input"]
    output_root = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        ocr_settings["path_to_output"],
    )

    for root, _dirs, file_names in os.walk(input_root):
        for file_name in file_names:
            # Join with ``root`` (not the input root): os.walk also yields
            # files that live in subdirectories.
            source_path = os.path.join(root, file_name)
            stem, _ext = os.path.splitext(file_name)
            target_dir = os.path.join(output_root, stem)

            # The context manager closes the underlying file handle once the
            # image has been recognized and re-saved.
            with Image.open(source_path) as img:
                # Extract text from the image.
                text = pytesseract.image_to_string(img, lang=language)

                # Prepare the per-document target folder.
                os.makedirs(target_dir, exist_ok=True)
                print(f"Creating files for : {stem}")

                # Keep a copy of the original next to the derived files.
                shutil.copy2(source_path, target_dir)

                # Write as UTF-8 so non-ASCII characters survive regardless
                # of the platform's default encoding.
                extracted_file = os.path.join(target_dir, stem + ".txt")
                with open(extracted_file, "w", encoding="utf-8") as text_file:
                    text_file.write(text)

                pdf_file = os.path.join(target_dir, stem + ".pdf")
                img.save(pdf_file, "PDF", resolution=100.0, save_all=True)


# Run the folder scan only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    scan_document_folder()