main
dev_alex 1 year ago
parent 74e7c64471
commit a5536eb00e

@ -1,27 +1,29 @@
from PIL import Image from PIL import Image
from pytesseract import pytesseract from pytesseract import pytesseract
import os import os
from configparser import ConfigParser
#Define path to tessaract.exe #or equivalent on os
path_to_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
def scan_document_folder(config_path="ocr_config.ini"):
    """Run Tesseract OCR on every file under the configured input folder.

    Settings are read from the ``[ocr]`` section of the INI file at
    *config_path*:

    * ``path_to_tesseract`` -- location of the Tesseract executable.
    * ``path_to_input`` -- folder that is walked (recursively) for images.

    The text extracted from each file is printed to standard output.

    Args:
        config_path: Path of the INI file to load. Defaults to
            ``"ocr_config.ini"`` in the current working directory.

    Raises:
        KeyError: If the ``[ocr]`` section or one of its keys is missing.
            ``ConfigParser.read`` silently ignores a file that does not
            exist, so a missing config file surfaces here as well.
    """
    ocr_config = ConfigParser()
    ocr_config.read(config_path)

    ocr_settings = ocr_config["ocr"]
    input_dir = ocr_settings["path_to_input"]

    #Point tesseract_cmd to the Tesseract executable named in the config
    pytesseract.tesseract_cmd = ocr_settings["path_to_tesseract"]

    #Get the file names in the directory (os.walk also descends sub-folders)
    for root, _dirs, file_names in os.walk(input_dir):
        #Iterate over each file name in the folder
        for file_name in file_names:
            #Build the path from `root`, not from the configured folder:
            #files found in sub-folders live under `root`, and join() does
            #not depend on the configured path ending with a separator.
            image_path = os.path.join(root, file_name)
            #Open image with PIL; the context manager releases the file
            #handle as soon as the text has been extracted.
            with Image.open(image_path) as img:
                #Extract text from image
                text = pytesseract.image_to_string(img)
            print(text)


if __name__ == "__main__":
    scan_document_folder()
Loading…
Cancel
Save