"""Run Tesseract OCR over every file found under a configured input folder."""
import os
from configparser import ConfigParser

from PIL import Image
from pytesseract import pytesseract


def scan_document_folder(config_path="ocr_config.ini"):
    """OCR every file under the configured input folder and print the text.

    Settings are read from the ``[ocr]`` section of an INI file:

    * ``path_to_tesseract`` -- location of the Tesseract executable.
    * ``path_to_input`` -- folder that is walked (recursively) for images.

    Args:
        config_path: Path of the INI file to load. Defaults to
            ``"ocr_config.ini"`` relative to the current working directory.

    Raises:
        FileNotFoundError: If the config file cannot be read.
        KeyError: If the ``[ocr]`` section or one of its keys is missing.

    Errors raised by ``Image.open`` or ``pytesseract.image_to_string`` are
    not caught here and propagate to the caller.
    """
    ocr_config = ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did parse; an empty list means nothing was loaded.
    if not ocr_config.read(config_path):
        raise FileNotFoundError(f"OCR config file not found: {config_path}")

    ocr_settings = ocr_config["ocr"]

    # Point pytesseract at the Tesseract executable named in the config.
    pytesseract.tesseract_cmd = ocr_settings["path_to_tesseract"]

    # os.walk descends into subfolders, so each file must be joined with the
    # directory it was actually found in (root), not with the top-level path.
    for root, _dirs, file_names in os.walk(ocr_settings["path_to_input"]):
        for file_name in file_names:
            image_path = os.path.join(root, file_name)

            # The context manager closes the underlying file handle as soon
            # as the text has been extracted.
            with Image.open(image_path) as img:
                text = pytesseract.image_to_string(img)

            print(text)


if __name__ == "__main__":
    scan_document_folder()