Python OCR Image Reader with Tesseract
💾 1. Install Required Libraries
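Use pip to install the required Python packages. Note that pytesseract is only a wrapper: the Tesseract OCR engine itself, together with its Turkish (tur) language data, must be installed separately: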
pip install opencv-python pytesseract pillow numpy
🔍 2. Overview
This Python script reads an image file (e.g., denek.png) containing Turkish text, preprocesses it for better recognition, extracts the text using pytesseract, and prints specific fields such as Gönderen (Sender) and Kaynak (Source).
📦 3. Check Library Dependencies
def check_libraries(libraries=None):
    """Report whether the libraries this script depends on can be imported.

    Every library is checked (not just up to the first failure) so that the
    user sees the complete list of missing packages in a single run.

    Args:
        libraries: Optional iterable of importable module names to check.
            Defaults to the modules used by this script.

    Returns:
        True if every module imported successfully, False otherwise.
    """
    import importlib

    if libraries is None:
        libraries = ["cv2", "pytesseract", "PIL", "pathlib", "os", "numpy"]

    # Import names that differ from the package name pip expects.
    pip_package_names = {"cv2": "opencv-python", "PIL": "pillow"}

    all_installed = True
    for library in libraries:
        try:
            importlib.import_module(library)
        except ImportError:
            package = pip_package_names.get(library, library)
            print(
                f"{library} is not installed. "
                f"Please install it using pip: pip install {package}"
            )
            all_installed = False
        else:
            print(f"{library} is installed.")
    return all_installed
📁 4. Image File Path (Default)
The script expects denek.png to be in the current working directory (normally the folder you run the script from), because the path below is relative:
# Relative path: it is resolved against the current working directory at run time.
image_path = Path("denek.png")
🖼️ 5. Preprocessing the Image
- Convert to grayscale
- Apply inverted binary thresholding (threshold 150)
- Save temporary image for OCR
# Collapse the BGR image to a single grayscale channel.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Inverted binary threshold: pixels above 150 become 0, all others become 255.
# NOTE(review): the inversion yields light text on a dark background for a
# typical dark-on-light scan -- confirm this polarity suits the input image.
_, binary_image = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
# Write the result to disk; the OCR step reads it back from this file.
cv2.imwrite("preprocessed_image.png", binary_image)
🧠 6. OCR & Text Extraction
# Open the preprocessed image in a `with` block so its file handle is closed
# as soon as OCR finishes; lang='tur' selects Tesseract's Turkish model.
with Image.open("preprocessed_image.png") as preprocessed_image:
    text = pytesseract.image_to_string(preprocessed_image, lang='tur')
🧾 7. Extract & Print Information
# Look up each field by its Turkish label; 'Not found' is printed when the
# label is absent from the extracted data.
print(f"Gönderen: {extracted_data.get('Gönderen', 'Not found')}")
print(f"Kaynak: {extracted_data.get('Kaynak', 'Not found')}")
🧹 8. Clean Up
Temporary file is deleted:
# Delete the temporary preprocessed image if it is still present on disk.
temp_file_present = preprocessed_image_path.is_file()
if temp_file_present:
    preprocessed_image_path.unlink()
🔁 9. Full Code (With Default Image Path)
import cv2
import pytesseract
from PIL import Image
from pathlib import Path
import os
import numpy as np
def check_libraries(libraries=None):
    """Report whether the libraries this script depends on can be imported.

    Every library is checked (not just up to the first failure) so that the
    user sees the complete list of missing packages in a single run.

    Args:
        libraries: Optional iterable of importable module names to check.
            Defaults to the modules used by this script.

    Returns:
        True if every module imported successfully, False otherwise.
    """
    import importlib

    if libraries is None:
        libraries = ["cv2", "pytesseract", "PIL", "pathlib", "os", "numpy"]

    # Import names that differ from the package name pip expects.
    pip_package_names = {"cv2": "opencv-python", "PIL": "pillow"}

    all_installed = True
    for library in libraries:
        try:
            importlib.import_module(library)
        except ImportError:
            package = pip_package_names.get(library, library)
            print(
                f"{library} is not installed. "
                f"Please install it using pip: pip install {package}"
            )
            all_installed = False
        else:
            print(f"{library} is installed.")
    return all_installed
def pil_to_cv2(pil_image):
    """Convert a PIL image to an OpenCV-style BGR array.

    Args:
        pil_image: A PIL image in any mode.

    Returns:
        The pixel data as a NumPy array with its channels in BGR order.
    """
    # Normalise to 3-channel RGB first: grayscale ("L"), palette ("P") and
    # 1-bit images produce 2-D arrays that cv2.COLOR_RGB2BGR cannot convert.
    rgb_image = pil_image.convert("RGB")
    return cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)
def main():
    """Run OCR on ``denek.png`` and print the Gönderen and Kaynak fields.

    The image is converted to grayscale, thresholded to an inverted binary
    image and written to a temporary ``preprocessed_image.png`` that
    Tesseract reads with its Turkish (``tur``) model. Lines of the form
    ``key: value`` in the recognised text are collected into a dictionary.

    Any failure is reported as ``Error: ...`` on stdout rather than raised.
    """
    # Only override pytesseract's default command when the conventional
    # Windows install location actually exists; otherwise the `tesseract`
    # executable on PATH is used, which keeps the script usable elsewhere.
    windows_tesseract = Path(r"C:/Program Files/Tesseract-OCR/tesseract.exe")
    if windows_tesseract.is_file():
        pytesseract.pytesseract.tesseract_cmd = str(windows_tesseract)

    image_path = Path("denek.png")
    preprocessed_image_path = Path("preprocessed_image.png")

    try:
        print(f"Looking for image at: {image_path}")
        if not image_path.is_file():
            raise FileNotFoundError(f"Image not found at: {image_path}")

        # Close the source file as soon as its pixels have been copied out.
        with Image.open(image_path) as pil_image:
            image = pil_to_cv2(pil_image)

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, binary_image = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)

        try:
            # cv2.imwrite signals failure through its return value, so check
            # it explicitly to get a meaningful error message.
            if not cv2.imwrite(str(preprocessed_image_path), binary_image):
                raise OSError(
                    f"Could not write temporary image: {preprocessed_image_path}"
                )
            with Image.open(preprocessed_image_path) as preprocessed_image:
                text = pytesseract.image_to_string(preprocessed_image, lang='tur')
        finally:
            # Remove the temporary file even when OCR fails, so that a failed
            # run does not leave it behind.
            if preprocessed_image_path.is_file():
                preprocessed_image_path.unlink()

        # Split each "key: value" line at the first colon only, so values
        # that themselves contain colons are kept intact.
        extracted_data = {}
        for line in text.splitlines():
            key, separator, value = line.partition(':')
            if separator:
                extracted_data[key.strip()] = value.strip()

        print(f"Gönderen: {extracted_data.get('Gönderen', 'Not found')}")
        print(f"Kaynak: {extracted_data.get('Kaynak', 'Not found')}")
    except Exception as e:
        # Top-level boundary of the script: report the problem, do not crash.
        print(f"Error: {e}")
# Script entry point: run the OCR pipeline only when every dependency imports.
# NOTE(review): the third-party imports at the top of this listing execute
# before this check runs, so a missing package raises ImportError there first.
if __name__ == "__main__":
    dependencies_ok = check_libraries()
    if not dependencies_ok:
        print("Please install the necessary libraries and try again.")
    else:
        main()
Comments