# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 74 lines
# 3.0 KiB
# Python

from PIL import Image
from pytesseract import pytesseract
import os
from configparser import ConfigParser
import shutil
def scan_document_folder() -> None:
    """Run OCR on every image below the input folder and file the results.

    Settings come from the ``[ocr]`` section of ``ocr_config.ini`` (looked up
    relative to the current working directory):

    * ``path_to_tesseract`` -- assigned to ``pytesseract.tesseract_cmd``.
    * ``path_to_input`` -- folder that is walked recursively for input files.
    * ``path_to_output`` -- output root, joined onto this script's directory.

    For each input file ``<name><ext>`` the folder ``<output>/<name>/`` is
    created and receives ``<name>.txt`` (recognized text), ``<name>.pdf``
    (the image saved as PDF) and a copy of the original file.  The original
    is removed from the input folder only after all outputs were written.
    Files that Pillow cannot open are reported and left in place.

    Raises:
        KeyError: If the ``[ocr]`` section or one of its keys is missing.
    """
    # Read config, setup
    ocr_config = ConfigParser()
    ocr_config.read("ocr_config.ini")
    ocr_settings = ocr_config["ocr"]
    pytesseract.tesseract_cmd = ocr_settings["path_to_tesseract"]

    # Values that do not change per file are computed once, outside the loops.
    input_root = ocr_settings["path_to_input"]
    output_root = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        ocr_settings["path_to_output"],
    )
    language = "deu"
    pageseg_mode = "3"  # Auto-Segmentation
    options = f"-l {language} --psm {pageseg_mode}"

    for root, _dirs, file_names in os.walk(input_root):
        for file_name in file_names:
            # os.walk descends into sub-folders, so the file lives in `root`,
            # which is not necessarily the top-level input folder.
            source_path = os.path.join(root, file_name)
            stem, _ext = os.path.splitext(file_name)
            target_dir = os.path.join(output_root, stem)

            try:
                img = Image.open(source_path)
            except OSError as err:
                # Pillow's "cannot identify image file" error is an OSError;
                # one stray non-image file must not abort the whole run.
                print(f"Skipping {source_path}: {err}")
                continue

            # The context manager closes the image file once both outputs
            # are written, so the source can be deleted afterwards without
            # a lingering open handle.
            with img:
                # Extract text from image
                text = pytesseract.image_to_string(img, config=options)

                os.makedirs(target_dir, exist_ok=True)
                print(f"Creating files for : {stem}")

                # Explicit UTF-8: OCR output may contain characters that the
                # platform's default encoding cannot represent.
                text_path = os.path.join(target_dir, stem + ".txt")
                with open(text_path, "w", encoding="utf-8") as text_file:
                    text_file.write(text)

                pdf_path = os.path.join(target_dir, stem + ".pdf")
                img.save(pdf_path, "PDF", resolution=100.0, save_all=True)

            # Move the original last: if OCR or the PDF export fails, the
            # input file is still in place and can be processed again.
            shutil.copy2(source_path, target_dir)
            os.remove(source_path)
def export_to_mount(symlinks=False, ignore=None) -> None:
    """Copy the OCR output folder to an external folder (e.g. a NAS mount).

    Reads ``path_to_output`` (joined onto this script's directory) and
    ``path_to_mount`` from the ``[ocr]`` section of ``ocr_config.ini`` (looked
    up relative to the current working directory).  Every top-level entry of
    the output folder is copied to the same name below the mount path.
    Folders that already exist at the destination are merged into and files
    are overwritten, so the export can be run repeatedly.

    Args:
        symlinks (bool, optional): Forwarded to ``shutil.copytree``; only
            affects folders, not top-level files. Defaults to False.
        ignore (callable, optional): Forwarded to ``shutil.copytree`` (for
            example the result of ``shutil.ignore_patterns``); only affects
            folders, not top-level files. Defaults to None.

    Raises:
        KeyError: If the ``[ocr]`` section or one of its keys is missing.
    """
    # Read config, setup
    ocr_config = ConfigParser()
    ocr_config.read("ocr_config.ini")
    ocr_settings = ocr_config["ocr"]
    src = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        ocr_settings["path_to_output"],
    )
    mount_path = ocr_settings["path_to_mount"]

    # The output folder is only created once a document has been processed;
    # without it there is simply nothing to export.
    if not os.path.isdir(src):
        print(f"Nothing to export, output folder not found: {src}")
        return

    for item in os.listdir(src):
        source_item = os.path.join(src, item)
        dest_item = os.path.join(mount_path, item)
        if os.path.isdir(source_item):
            # dirs_exist_ok (Python 3.8+) lets a repeated export merge into
            # folders copied by an earlier run instead of raising
            # FileExistsError.
            shutil.copytree(
                source_item,
                dest_item,
                symlinks=symlinks,
                ignore=ignore,
                dirs_exist_ok=True,
            )
        else:
            shutil.copy2(source_item, dest_item)
if __name__ == "__main__":
    # Script entry point: run the OCR scan first, then export its results.
    for pipeline_step in (scan_document_folder, export_to_mount):
        pipeline_step()