You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

46 lines
2.0 KiB
Python

from PIL import Image
from pytesseract import pytesseract
import os
from configparser import ConfigParser
import shutil
def scan_document_folder():
    """Run OCR over every file found below the configured input folder.

    Settings come from the ``[ocr]`` section of ``ocr_config.ini`` (resolved
    against the current working directory):

    * ``path_to_tesseract`` -- assigned to ``pytesseract.tesseract_cmd``.
    * ``path_to_input`` -- folder that is walked recursively for input files.
    * ``path_to_output`` -- output folder, joined onto this script's directory.

    For each input file ``<stem><ext>`` a folder ``<output>/<stem>/`` is
    created and filled with a copy of the original file, ``<stem>.txt``
    holding the extracted text, and ``<stem>.pdf`` saved from the image.

    Errors from PIL, pytesseract or the file system are not caught here and
    abort the run.

    Raises:
        KeyError: if the ``[ocr]`` section or one of the keys above is
            missing (``ConfigParser.read`` ignores a missing file, so an
            absent config surfaces here as well).
    """
    # Read config, set up.
    ocr_config = ConfigParser()
    ocr_config.read("ocr_config.ini")
    ocr_settings = ocr_config["ocr"]
    # TODO: find out why umlauts are not recognised -> switch the language
    # model / make it configurable.
    pytesseract.tesseract_cmd = ocr_settings["path_to_tesseract"]

    input_root = ocr_settings["path_to_input"]
    # Loop-invariant: all results live below <script dir>/<path_to_output>.
    output_root = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        ocr_settings["path_to_output"],
    )

    for root, _dirs, file_names in os.walk(input_root):
        for file_name in file_names:
            # os.walk descends into sub-folders, so the file lives in `root`,
            # which is not necessarily the configured top-level input folder.
            source_path = os.path.join(root, file_name)
            stem, _ext = os.path.splitext(file_name)
            target_dir = os.path.join(output_root, stem)

            # The context manager releases the image's file handle once the
            # text and PDF have been produced.
            with Image.open(source_path) as img:
                # Extract text from the image.
                text = pytesseract.image_to_string(img)

                # Prepare the target folder and write the three artefacts.
                os.makedirs(target_dir, exist_ok=True)
                print(f"Creating files for : {stem}")
                shutil.copy2(source_path, target_dir)

                extracted_file = os.path.join(target_dir, stem + ".txt")
                # Explicit UTF-8 so OCR output with non-ASCII characters is
                # written identically on every platform.
                with open(extracted_file, "w", encoding="utf-8") as text_file:
                    text_file.write(text)

                pdf_file = os.path.join(target_dir, stem + ".pdf")
                img.save(pdf_file, "PDF", resolution=100.0, save_all=True)
if __name__ == "__main__":
    # Only run the OCR batch when executed as a script, not when imported.
    scan_document_folder()