From 9d33c13953f66d237c244c4cf84e018b1abd36eb Mon Sep 17 00:00:00 2001 From: dev_alex Date: Sun, 30 Jul 2023 20:40:39 +0200 Subject: [PATCH] Add non-English scanning --- ocr_scan.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ocr_scan.py b/ocr_scan.py index c9d3147..62b51e5 100644 --- a/ocr_scan.py +++ b/ocr_scan.py @@ -22,7 +22,10 @@ def scan_document_folder(): source_path = os.path.join(ocr_settings["path_to_input"], file_name) img = Image.open(source_path) #Extract text from image - text = pytesseract.image_to_string(img) + language = "deu" + pageseg_mode = "3" # Auto-Segmentation + options = f"-l {language} --psm {pageseg_mode}" + text = pytesseract.image_to_string(img,config=options) # Prep target folder + files file,file_ext = os.path.splitext(file_name)