|
|
@ -5,13 +5,14 @@ from configparser import ConfigParser
|
|
|
|
import shutil
|
|
|
|
import shutil
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def scan_document_folder():
|
|
|
|
def scan_document_folder() -> None:
|
|
|
|
|
|
|
|
"""Scan the input folder for OCR analysis and processing of the images
|
|
|
|
|
|
|
|
"""
|
|
|
|
# Read Config, Setup
|
|
|
|
# Read Config, Setup
|
|
|
|
ocr_config = ConfigParser()
|
|
|
|
ocr_config = ConfigParser()
|
|
|
|
ocr_config.read("ocr_config.ini")
|
|
|
|
ocr_config.read("ocr_config.ini")
|
|
|
|
ocr_settings = ocr_config['ocr']
|
|
|
|
ocr_settings = ocr_config['ocr']
|
|
|
|
path_to_tesseract = ocr_settings["path_to_tesseract"]
|
|
|
|
path_to_tesseract = ocr_settings["path_to_tesseract"]
|
|
|
|
# TODO: Find out why umlauts are not recognized here -> switch the language model / config
|
|
|
|
|
|
|
|
pytesseract.tesseract_cmd = path_to_tesseract
|
|
|
|
pytesseract.tesseract_cmd = path_to_tesseract
|
|
|
|
|
|
|
|
|
|
|
|
#Get the file names in the directory
|
|
|
|
#Get the file names in the directory
|
|
|
@ -32,18 +33,42 @@ def scan_document_folder():
|
|
|
|
target_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file,"")
|
|
|
|
target_path = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file,"")
|
|
|
|
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
|
|
|
os.makedirs(os.path.dirname(target_path), exist_ok=True)
|
|
|
|
print(f"Creating files for : {file}")
|
|
|
|
print(f"Creating files for : {file}")
|
|
|
|
|
|
|
|
|
|
|
|
original_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file,"")
|
|
|
|
original_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file,"")
|
|
|
|
shutil.copy2(source_path,original_file)
|
|
|
|
shutil.copy2(source_path,original_file)
|
|
|
|
|
|
|
|
os.remove(source_path)
|
|
|
|
|
|
|
|
|
|
|
|
extracted_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file, file + ".txt")
|
|
|
|
extracted_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file, file + ".txt")
|
|
|
|
#print(text)
|
|
|
|
|
|
|
|
with open(extracted_file, "w") as text_file:
|
|
|
|
with open(extracted_file, "w") as text_file:
|
|
|
|
text_file.write(text)
|
|
|
|
text_file.write(text)
|
|
|
|
|
|
|
|
|
|
|
|
pdf_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file, file + ".pdf")
|
|
|
|
pdf_file = os.path.join(os.path.dirname(os.path.abspath(__file__)) , ocr_settings["path_to_output"] , file, file + ".pdf")
|
|
|
|
img.save(pdf_file, "PDF" ,resolution=100.0, save_all=True)
|
|
|
|
img.save(pdf_file, "PDF" ,resolution=100.0, save_all=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def export_to_mount(symlinks: bool = False, ignore=None) -> None:
    """Export the OCR output folder to an external folder (e.g. a NAS mount).

    Every entry of the configured output folder is copied into the configured
    mount path: directories are copied recursively, plain files are copied
    with their metadata. The function can be run repeatedly; directories that
    already exist at the destination are merged into instead of raising
    ``FileExistsError``.

    The settings are read from the ``[ocr]`` section of ``ocr_config.ini``
    (looked up relative to the current working directory):

    * ``path_to_output`` -- source folder, joined onto this file's directory.
    * ``path_to_mount`` -- destination folder.

    Args:
        symlinks (bool, optional): Forwarded to ``shutil.copytree``; whether
            symbolic links are copied as links instead of following them.
            Defaults to False.
        ignore (callable, optional): Forwarded to ``shutil.copytree``; a
            callable such as the result of ``shutil.ignore_patterns(...)``
            that selects the names to skip. Defaults to None.

    Raises:
        KeyError: If the ``[ocr]`` section or one of the required keys is
            missing (``ConfigParser.read`` silently ignores a missing file).
        OSError: If the source folder cannot be listed or a copy fails.
    """
    # Read config, setup
    ocr_config = ConfigParser()
    ocr_config.read("ocr_config.ini")
    ocr_settings = ocr_config["ocr"]

    module_dir = os.path.dirname(os.path.abspath(__file__))
    src = os.path.join(module_dir, ocr_settings["path_to_output"])
    mount_path = ocr_settings["path_to_mount"]

    for item in os.listdir(src):
        source_entry = os.path.join(src, item)
        target_entry = os.path.join(mount_path, item)
        if os.path.isdir(source_entry):
            # dirs_exist_ok: a previous export has usually created the target
            # folder already; without it the second run would crash.
            shutil.copytree(
                source_entry,
                target_entry,
                symlinks=symlinks,
                ignore=ignore,
                dirs_exist_ok=True,
            )
        else:
            shutil.copy2(source_entry, target_entry)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run the OCR scan over the input folder first, then
# export the generated output to the configured mount.
if __name__ == "__main__":
    scan_document_folder()
    export_to_mount()
|