You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

49 lines
2.1 KiB
Python

from PIL import Image
from pytesseract import pytesseract
import os
2 years ago
from configparser import ConfigParser
import shutil
2 years ago
def scan_document_folder():
    """Run OCR on every file below the configured input folder.

    Settings are read from the ``[ocr]`` section of ``ocr_config.ini``
    (looked up relative to the current working directory):

    * ``path_to_tesseract`` -- path of the Tesseract executable.
    * ``path_to_input`` -- folder that is walked recursively.
    * ``path_to_output`` -- output folder, joined onto this script's folder.
    * ``language`` -- optional Tesseract language code, defaults to ``deu``.

    For each input file ``<name>.<ext>`` the folder ``<output>/<name>/`` is
    created and filled with a copy of the original file, ``<name>.txt``
    holding the recognised text, and ``<name>.pdf`` rendered from the image.

    Raises:
        KeyError: If the ``[ocr]`` section or one of the required keys is
            missing from the configuration.
        OSError: If a file cannot be opened as an image or an output file
            cannot be written.
    """
    # Read config, set up Tesseract.
    ocr_config = ConfigParser()
    ocr_config.read("ocr_config.ini")
    ocr_settings = ocr_config["ocr"]
    pytesseract.tesseract_cmd = ocr_settings["path_to_tesseract"]

    # TODO: check why umlauts are not recognised -> the language model can now
    # be switched through the optional "language" config key. The text file is
    # also written as UTF-8 now; verify whether that already resolves it.
    language = ocr_settings.get("language", fallback="deu")
    pageseg_mode = "3"  # Auto-segmentation
    options = f"-l {language} --psm {pageseg_mode}"

    # The output root does not depend on the file, so compute it only once.
    output_root = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        ocr_settings["path_to_output"],
    )

    for root, _dirs, file_names in os.walk(ocr_settings["path_to_input"]):
        for file_name in file_names:
            # Join with ``root`` (not the input folder) so that files found in
            # sub-folders resolve to their real location.
            source_path = os.path.join(root, file_name)
            stem, _ext = os.path.splitext(file_name)
            # NOTE: files sharing the same stem end up in the same target
            # folder and overwrite each other's .txt/.pdf output.
            target_dir = os.path.join(output_root, stem)

            # The context manager closes the image's file handle once the
            # text and the PDF have been produced.
            with Image.open(source_path) as img:
                # Extract text from the image.
                text = pytesseract.image_to_string(img, config=options)

                # Prepare target folder and files.
                os.makedirs(target_dir, exist_ok=True)
                print(f"Creating files for : {stem}")
                shutil.copy2(source_path, target_dir)

                # Explicit UTF-8 so umlauts survive regardless of the
                # platform's default encoding.
                extracted_file = os.path.join(target_dir, stem + ".txt")
                with open(extracted_file, "w", encoding="utf-8") as text_file:
                    text_file.write(text)

                pdf_file = os.path.join(target_dir, stem + ".pdf")
                img.save(pdf_file, "PDF", resolution=100.0, save_all=True)
2 years ago
# Script entry point: process the input folder only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    scan_document_folder()