Python OCR Image Reader with Tesseract
Python OCR Image Reader with Tesseract
💾 1. Install Required Libraries
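Use pip to install the required Python packages (the Tesseract OCR engine itself, together with its Turkish "tur" language data, must be installed separately because pytesseract is only a wrapper around it):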
pip install opencv-python pytesseract pillow numpy
🔍 2. Overview
This Python script reads an image file (e.g., denek.png) containing Turkish text, preprocesses it for better recognition, extracts text using pytesseract, and prints specific fields like Gönderen (Sender) and Kaynak (Source).
📦 3. Check Library Dependencies
def check_libraries():
necessary_libraries = ["cv2", "pytesseract", "PIL", "pathlib", "os", "numpy"]
for library in necessary_libraries:
try:
__import__(library)
print(f"{library} is installed.")
except ImportError:
print(f"{library} is not installed. Please install it using pip: pip install {library}")
return False
return True
📁 4. Image File Path (Default)
The script looks for denek.png using a relative path, so the file must be in the current working directory (the folder you run the script from):
image_path = Path("denek.png")
🖼️ 5. Preprocessing the Image
- Convert to grayscale
- Apply binary thresholding
- Save temporary image for OCR
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
_, binary_image = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
cv2.imwrite("preprocessed_image.png", binary_image)
🧠 6. OCR & Text Extraction
text = pytesseract.image_to_string(Image.open("preprocessed_image.png"), lang='tur')
🧾 7. Extract & Print Information
print(f"Gönderen: {extracted_data.get('Gönderen', 'Not found')}")
print(f"Kaynak: {extracted_data.get('Kaynak', 'Not found')}")
🧹 8. Clean Up
Temporary file is deleted:
if preprocessed_image_path.is_file():
preprocessed_image_path.unlink()
🔁 9. Full Code (With Default Image Path)
import cv2
import pytesseract
from PIL import Image
from pathlib import Path
import os
import numpy as np
def check_libraries():
    """Report whether every module the script relies on can be imported.

    Each module is imported by name and a status line is printed for it.
    All modules are checked before returning, so a single run lists every
    missing package instead of stopping at the first one.

    Returns:
        bool: True when every module imported successfully, False when at
        least one import raised ImportError.

    NOTE: the full script also imports these modules at the top of the file,
    so a missing third-party package raises there before this check can run.
    """
    # Local import keeps this block self-contained.
    import importlib

    # Import name -> name to pass to ``pip install``. The two differ for
    # OpenCV and Pillow; ``pathlib`` and ``os`` ship with Python itself.
    pip_package_names = {
        "cv2": "opencv-python",
        "pytesseract": "pytesseract",
        "PIL": "pillow",
        "pathlib": "pathlib",
        "os": "os",
        "numpy": "numpy",
    }

    all_installed = True
    for library, package in pip_package_names.items():
        try:
            importlib.import_module(library)
        except ImportError:
            print(
                f"{library} is not installed. "
                f"Please install it using pip: pip install {package}"
            )
            all_installed = False
        else:
            print(f"{library} is installed.")
    return all_installed
def pil_to_cv2(pil_image):
    """Convert a PIL image into a BGR NumPy array for use with OpenCV.

    The image is first normalised to 3-channel RGB. Without this step,
    grayscale ("L"), palette ("P") or bilevel ("1") images become 2-D
    arrays that the RGB->BGR colour conversion cannot handle.

    Args:
        pil_image: An opened ``PIL.Image.Image``.

    Returns:
        A NumPy array holding the same pixels in BGR channel order.
    """
    rgb_pixels = np.array(pil_image.convert("RGB"))
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
def main():
    """Run OCR on ``denek.png`` and print the "Gönderen" and "Kaynak" fields.

    Steps: load the image, convert it to grayscale, apply an inverted binary
    threshold, write the result to a temporary PNG, run Tesseract on it with
    the Turkish language pack, then parse ``key: value`` lines from the
    recognised text.

    Any exception raised along the way is caught and printed as
    ``Error: ...``; nothing is returned.
    """
    # Only override pytesseract's executable when the default Windows
    # install location actually exists. Elsewhere pytesseract's own default
    # is kept, so an unconditional override would point at a missing file.
    windows_tesseract_cmd = r"C:/Program Files/Tesseract-OCR/tesseract.exe"
    if Path(windows_tesseract_cmd).is_file():
        pytesseract.pytesseract.tesseract_cmd = windows_tesseract_cmd

    image_path = Path("denek.png")
    preprocessed_image_path = Path("preprocessed_image.png")
    try:
        print(f"Looking for image at: {image_path}")
        if not image_path.is_file():
            raise FileNotFoundError(f"Image not found at: {image_path}")

        # Close the source file as soon as the pixels have been copied out.
        with Image.open(image_path) as pil_image:
            image = pil_to_cv2(pil_image)

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # NOTE(review): THRESH_BINARY_INV maps pixels above 150 to 0 and the
        # rest to 255, i.e. it inverts polarity. Confirm this suits the input.
        _, binary_image = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)

        try:
            # cv2.imwrite signals failure through its return value.
            if not cv2.imwrite(str(preprocessed_image_path), binary_image):
                raise OSError(
                    f"Could not write preprocessed image to: {preprocessed_image_path}"
                )
            with Image.open(preprocessed_image_path) as preprocessed_image:
                text = pytesseract.image_to_string(preprocessed_image, lang='tur')
        finally:
            # Remove the temporary file even when OCR fails.
            if preprocessed_image_path.is_file():
                preprocessed_image_path.unlink()

        # Collect "key: value" pairs, splitting on the first colon only so
        # values may themselves contain colons.
        extracted_data = {}
        for line in text.splitlines():
            key, separator, value = line.partition(':')
            if separator:
                extracted_data[key.strip()] = value.strip()

        print(f"Gönderen: {extracted_data.get('Gönderen', 'Not found')}")
        print(f"Kaynak: {extracted_data.get('Kaynak', 'Not found')}")
    except Exception as e:
        # Top-level boundary of the script: report the problem, do not crash.
        print(f"Error: {e}")
# Script entry point: run the OCR pipeline only when every dependency check
# passes; otherwise tell the user what to do next.
if __name__ == "__main__":
    if not check_libraries():
        print("Please install the necessary libraries and try again.")
    else:
        main()
Comments