refactor

1 year ago · a5536eb00e
parent 74e7c64471
commit a5536eb00e
1 changed file with 20 additions and 18 deletions
--- a/ocr_scan.py
+++ b/ocr_scan.py
@@ -1,27 +1,29 @@
 from PIL import Image
 from pytesseract import pytesseract
 import os
 from configparser import ConfigParser
 #Define path to tesseract.exe (or the equivalent executable on your OS)
 path_to_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
-#Define path to image
+def scan_document_folder():
-#path_to_image = 'input/sampletext1-ocr.png'
+    ocr_config = ConfigParser()
-#Define path to images folder
+    ocr_config.read("ocr_config.ini")
 path_to_input = r'input/'
 path_to_output = r'output/'
-#Point tessaract_cmd to tessaract.exe
+    ocr_settings = ocr_config['ocr']
    path_to_tesseract = ocr_settings["path_to_tesseract"]
    pytesseract.tesseract_cmd = path_to_tesseract
    #Get the file names in the directory
-for root, dirs, file_names in os.walk(path_to_input):
+    for root, dirs, file_names in os.walk(ocr_settings["path_to_input"]):
        #Iterate over each file name in the folder
        for file_name in file_names:
            #Open image with PIL
-        img = Image.open(path_to_input + file_name)
+            img = Image.open(ocr_settings["path_to_input"] + file_name)
            #Extract text from image
            text = pytesseract.image_to_string(img)
            print(text)
 if __name__ == "__main__":
    scan_document_folder()