refactor
parent
74e7c64471
commit
a5536eb00e
@ -1,27 +1,29 @@
|
||||
from PIL import Image
|
||||
from pytesseract import pytesseract
|
||||
import os
|
||||
from configparser import ConfigParser
|
||||
|
||||
#Define path to tesseract.exe (or the equivalent executable on other operating systems)
|
||||
path_to_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
||||
|
||||
#Define path to image
|
||||
#path_to_image = 'input/sampletext1-ocr.png'
|
||||
#Define path to images folder
|
||||
path_to_input = r'input/'
|
||||
path_to_output = r'output/'
|
||||
def scan_document_folder():
    """Run Tesseract OCR over every file under the configured input folder.

    Settings are read from the ``[ocr]`` section of ``ocr_config.ini``
    (resolved relative to the current working directory):

    * ``path_to_tesseract`` -- path of the Tesseract executable.
    * ``path_to_input`` -- folder that is walked recursively for images.

    The text recognised in each file is printed to standard output.

    Raises:
        KeyError: if the ``[ocr]`` section or one of the keys above is
            missing. ``ConfigParser.read`` silently skips a file it cannot
            find, so a missing ``ocr_config.ini`` surfaces here as well.
    """
    ocr_config = ConfigParser()
    ocr_config.read("ocr_config.ini")

    ocr_settings = ocr_config['ocr']
    input_dir = ocr_settings["path_to_input"]

    # Point pytesseract at the executable once; it does not change per image.
    pytesseract.tesseract_cmd = ocr_settings["path_to_tesseract"]

    # os.walk descends into sub-folders, so each file name must be joined
    # with the folder it was found in (``root``), not with the top-level
    # input folder. os.path.join also copes with a configured path that has
    # no trailing separator.
    for root, _dirs, file_names in os.walk(input_dir):
        for file_name in file_names:
            image_path = os.path.join(root, file_name)

            # The context manager closes the underlying file handle as soon
            # as the text has been extracted.
            with Image.open(image_path) as img:
                text = pytesseract.image_to_string(img)

            print(text)


if __name__ == "__main__":
    scan_document_folder()
|
Loading…
Reference in New Issue