feat: TabOcr_BuildPHashDatabase

This commit is contained in:
2023-10-09 22:48:08 +08:00
parent de8c5d28a7
commit 4a1e20a45f
9 changed files with 746 additions and 5 deletions

View File

@ -0,0 +1,28 @@
from .build_phash import build_image_phash_database
try:
    import json
    from typing import Union

    from arcaea_offline_ocr.device.v1.definition import DeviceV1
    from arcaea_offline_ocr.device.v2.definition import DeviceV2

    def load_devices_json(filepath: str) -> list[Union[DeviceV1, DeviceV2]]:
        """Load device definitions from a JSON file.

        The file must contain a JSON array of objects. Each object is
        dispatched on its ``"version"`` key: ``1`` builds a ``DeviceV1`` and
        ``2`` builds a ``DeviceV2``, with the whole object passed as keyword
        arguments. Objects with any other version are skipped.

        Args:
            filepath: Path of the UTF-8 encoded JSON file to read.

        Returns:
            The constructed devices in file order, or an empty list when the
            file is empty or contains only whitespace.

        Raises:
            AssertionError: If the top-level JSON value is not a list.
            KeyError: If an item has no ``"version"`` key.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(filepath, "r", encoding="utf-8") as f:
            file_content = f.read()

        # A blank file means "no devices" rather than a JSON decoding error.
        if not file_content.strip():
            return []

        content = json.loads(file_content)
        # Raised explicitly (instead of via ``assert``) so the check survives
        # ``python -O`` while keeping the exception type unchanged.
        if not isinstance(content, list):
            raise AssertionError(
                f"expected a JSON list of devices, got {type(content).__name__}"
            )

        devices = []
        for item in content:
            version = item["version"]
            if version == 1:
                devices.append(DeviceV1(**item))
            elif version == 2:
                devices.append(DeviceV2(**item))
        return devices

except Exception:
    # Best-effort fallback: if the imports or the definition above fail
    # (presumably because arcaea_offline_ocr is unavailable -- TODO confirm),
    # expose a stub with the same name so importing this package still works.
    def load_devices_json(*args, **kwargs):
        """Fallback stub: accept any arguments and return ``None``."""
        return None

View File

@ -0,0 +1,61 @@
import sqlite3
import time
from pathlib import Path
from typing import Any, Callable, Optional
import cv2
from arcaea_offline_ocr.phash_db import phash_opencv
def build_image_phash_database(
    images: list[Path],
    labels: list[str],
    *,
    hash_size: int = 16,
    highfreq_factor: int = 4,
    progress_func: Optional[Callable[[int, int], Any]] = None,
) -> sqlite3.Connection:
    """Build an in-memory SQLite database of perceptual image hashes.

    The database contains two tables:

    * ``properties (key TEXT, value TEXT)`` with the rows ``hash_size``,
      ``highfreq_factor`` and ``built_timestamp`` (``int(time.time())`` at
      build time).
    * ``hashes (id TEXT, hash BLOB)`` with one row per image: the label and
      the flattened bytes of the array returned by ``phash_opencv``.

    Args:
        images: Paths of the images to hash; each is read as grayscale.
        labels: Label stored as ``id`` for the image at the same index.
        hash_size: Forwarded to ``phash_opencv`` and recorded in
            ``properties``.
        highfreq_factor: Forwarded to ``phash_opencv`` and recorded in
            ``properties``.
        progress_func: Optional callback invoked after each image as
            ``progress_func(processed_count, total_count)``.

    Returns:
        The open ``sqlite3.Connection`` to the ``:memory:`` database. The
        caller owns it and is responsible for closing it.

    Raises:
        AssertionError: If ``images`` and ``labels`` differ in length.
        ValueError: If an image cannot be read by ``cv2.imread``.
    """
    # Raised explicitly (instead of via ``assert``) so the check survives
    # ``python -O`` while keeping the exception type unchanged.
    if len(images) != len(labels):
        raise AssertionError(
            "images and labels must have the same length "
            f"(got {len(images)} and {len(labels)})"
        )

    conn = sqlite3.connect(":memory:", check_same_thread=False)
    try:
        # The connection context manager commits on success and rolls back on
        # error; it does not close the connection.
        with conn:
            cursor = conn.cursor()
            cursor.execute("CREATE TABLE properties (key TEXT, value TEXT)")
            cursor.executemany(
                "INSERT INTO properties VALUES (?, ?)",
                [
                    ("hash_size", hash_size),
                    ("highfreq_factor", highfreq_factor),
                ],
            )

            image_num = len(images)
            id_hashes = []
            for done, (label, image_path) in enumerate(zip(labels, images), start=1):
                image = cv2.imread(
                    str(image_path.resolve()), cv2.IMREAD_GRAYSCALE
                )
                # cv2.imread signals an unreadable file by returning None
                # instead of raising; fail with the offending path.
                if image is None:
                    raise ValueError(f"failed to read image: {image_path}")

                image_hash = phash_opencv(
                    image,
                    hash_size=hash_size,
                    highfreq_factor=highfreq_factor,
                )
                id_hashes.append((label, image_hash.flatten().tobytes()))

                if progress_func:
                    progress_func(done, image_num)

            # The declared BLOB length is informational only (SQLite does not
            # enforce it). With no images there is no hash to measure, so the
            # column is declared without a length instead of failing.
            if id_hashes:
                hash_column = f"hash BLOB({len(id_hashes[0][1])})"
            else:
                hash_column = "hash BLOB"
            cursor.execute(f"CREATE TABLE hashes (id TEXT, {hash_column})")
            cursor.executemany(
                "INSERT INTO hashes VALUES (?, ?)",
                id_hashes,
            )
            cursor.executemany(
                "INSERT INTO properties VALUES (?, ?)",
                [("built_timestamp", int(time.time()))],
            )
    except BaseException:
        # Do not leak the connection when the build fails part-way.
        conn.close()
        raise

    return conn