from joblib import Parallel, delayed
import cv2
import os
import shutil
import pytesseract
import numpy as np
from skimage.color import rgb2gray
from tqdm import tqdm
# Tell pytesseract which Tesseract executable to invoke. The path is
# hard-coded; adjust it if the binary is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
def is_webpage_screenshot_optimized(image_path):
    """Heuristically decide whether an image file is a webpage screenshot.

    The image is downscaled by half and must pass three checks, evaluated
    cheapest-first so OCR only runs when the other two already passed:

    1. more than 10 long straight line segments (probabilistic Hough
       transform on Canny edges),
    2. fewer than 128 distinct colours in a 100x100 thumbnail,
    3. more than 100 characters of OCR text.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if every check passes, False otherwise. An image that cannot
        be loaded is reported on stdout and also yields False.
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"Failed to load image: {image_path}")
        return False

    # Work at half resolution to keep edge detection and OCR cheap.
    img = cv2.resize(img, None, fx=0.5, fy=0.5)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Check 1: count long straight segments.
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    lines = cv2.HoughLinesP(
        edges, 1, np.pi / 180, 100, minLineLength=100, maxLineGap=10
    )
    line_count = 0 if lines is None else len(lines)
    if line_count <= 10:
        return False

    # Check 2: count distinct BGR triples in a small thumbnail.
    img_small = cv2.resize(img, (100, 100))
    img_flat = img_small.reshape((-1, 3))
    unique_colors = len(np.unique(img_flat, axis=0))
    if unique_colors >= 128:
        return False

    # Check 3: OCR. Previously `text` was hard-coded to "" and a placeholder
    # `font_layout_score = 0.5` was tested with a strict `> 0.5`, so the
    # function returned False for every image. The text is now extracted for
    # real, and the never-computed placeholder criterion is dropped.
    # NOTE(review): when run in joblib worker processes, the module-level
    # `tesseract_cmd` override may not be applied there; pytesseract would
    # then fall back to `tesseract` on PATH -- confirm for your deployment.
    text = pytesseract.image_to_string(gray)
    return len(text) > 100
# Driver: classify every image under `source_dir` in parallel and move the
# ones NOT recognised as webpage screenshots into `dest_dir`.
source_dir = "images"
dest_dir = "non_webpage_images"

# exist_ok=True replaces the separate existence check and tolerates the
# directory being created between check and creation.
os.makedirs(dest_dir, exist_ok=True)

# Recursively collect image files by (case-insensitive) extension.
image_paths = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(source_dir)
    for file in files
    if file.lower().endswith(('.png', '.jpg', '.jpeg'))
]

# n_jobs=-1 asks joblib to use all available CPU cores.
results = Parallel(n_jobs=-1)(
    delayed(is_webpage_screenshot_optimized)(path)
    for path in tqdm(image_paths, desc="Processing Images")
)

for path, result in zip(image_paths, results):
    if result:
        continue
    # The source tree is walked recursively, so two images in different
    # sub-folders can share a basename. shutil.move() into a directory raises
    # shutil.Error when the destination file already exists, which used to
    # abort the run part-way; pick a free name instead.
    target = os.path.join(dest_dir, os.path.basename(path))
    stem, ext = os.path.splitext(target)
    suffix = 1
    while os.path.exists(target):
        target = f"{stem}_{suffix}{ext}"
        suffix += 1
    shutil.move(path, target)