Mirror of https://github.com/283375/arcaea-offline-ocr.git, synced 2025-07-02 04:46:27 +00:00

Compare commits

1 commit: 619bff2ea4 ... master

Author | SHA1 | Date
---|---|---
 | 5215218526 |

Changed file: .github/workflows/build-and-draft-release.yml (vendored), 15 lines
.github/workflows/build-and-draft-release.yml (vendored)

@@ -4,7 +4,9 @@ on:
   workflow_dispatch:
   push:
     tags:
-      - "v[0-9]+.[0-9]+.[0-9]+"
+      # regex taken from
+      # https://packaging.python.org/en/latest/specifications/version-specifiers/#appendix-parsing-version-strings-with-regular-expressions
+      - '^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\.post(0|[1-9][0-9]*))?(\.dev(0|[1-9][0-9]*))?$'
 
 permissions:
   contents: write

@@ -29,14 +31,6 @@ jobs:
           pip install build
           python -m build
 
-      - name: Remove `v` in tag name
-        uses: mad9000/actions-find-and-replace-string@5
-        id: tagNameReplaced
-        with:
-          source: ${{ github.ref_name }}
-          find: "v"
-          replace: ""
-
       - name: Draft a release
         uses: softprops/action-gh-release@v2
         with:

@@ -44,5 +38,4 @@ jobs:
           draft: true
           generate_release_notes: true
           files: |
-            dist/arcaea_offline_ocr-${{ steps.tagNameReplaced.outputs.value }}*.whl
-            dist/arcaea-offline-ocr-${{ steps.tagNameReplaced.outputs.value }}.tar.gz
+            dist/*
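For reference, a minimal sketch of what the new tag pattern accepts when treated as a regular expression; the sample tag names below are made up for illustration and are not from the repository:

```python
import re

# PEP 440-style version pattern, copied verbatim from the new tag filter above.
PEP440_TAG = re.compile(
    r"^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\.(0|[1-9][0-9]*))*"
    r"((a|b|rc)(0|[1-9][0-9]*))?(\.post(0|[1-9][0-9]*))?(\.dev(0|[1-9][0-9]*))?$"
)

for tag in ["2.0.0", "0.3.1rc1", "0.3.1.post1", "0.4.0.dev2", "v2.0.0"]:
    print(f"{tag}: {bool(PEP440_TAG.match(tag))}")
# 2.0.0, 0.3.1rc1, 0.3.1.post1 and 0.4.0.dev2 match; v2.0.0 does not,
# since the `v` prefix accepted by the old "v[0-9]+.[0-9]+.[0-9]+" filter is gone.
```

Dropping the `v` prefix is presumably also why the later hunks remove the "Remove `v` in tag name" step and switch the release assets to a plain `dist/*` glob.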
@@ -4,9 +4,11 @@ repos:
     hooks:
       - id: end-of-file-fixer
       - id: trailing-whitespace
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.9.0
+  - repo: https://github.com/psf/black
+    rev: 23.1.0
     hooks:
-      - id: ruff
-        args: ["--fix"]
-      - id: ruff-format
+      - id: black
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
@@ -1,3 +0,0 @@
-from .index import average, dct, difference
-
-__all__ = ["average", "dct", "difference"]
@@ -1,7 +0,0 @@
-import cv2
-
-from arcaea_offline_ocr.types import Mat
-
-
-def _resize_image(src: Mat, dsize: ...) -> Mat:
-    return cv2.resize(src, dsize, fx=0, fy=0, interpolation=cv2.INTER_AREA)
@@ -1,35 +0,0 @@
-import cv2
-import numpy as np
-
-from arcaea_offline_ocr.types import Mat
-
-from ._common import _resize_image
-
-
-def average(img_gray: Mat, hash_size: int) -> Mat:
-    img_resized = _resize_image(img_gray, (hash_size, hash_size))
-    diff = img_resized > img_resized.mean()
-    return diff.flatten()
-
-
-def difference(img_gray: Mat, hash_size: int) -> Mat:
-    img_size = (hash_size + 1, hash_size)
-    img_resized = _resize_image(img_gray, img_size)
-
-    previous = img_resized[:, :-1]
-    current = img_resized[:, 1:]
-    diff = previous > current
-    return diff.flatten()
-
-
-def dct(img_gray: Mat, hash_size: int = 16, high_freq_factor: int = 4) -> Mat:
-    # TODO: consistency?
-    img_size_base = hash_size * high_freq_factor
-    img_size = (img_size_base, img_size_base)
-
-    img_resized = _resize_image(img_gray, img_size)
-    img_resized = img_resized.astype(np.float32)
-    dct_mat = cv2.dct(img_resized)
-
-    hash_mat = dct_mat[:hash_size, :hash_size]
-    return hash_mat > hash_mat.mean()
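A minimal usage sketch of the removed hashers module, assuming a grayscale OpenCV image; `example.png` is a placeholder path, and the import path is the one used by the removed builder module further below. The differing return shapes (flattened arrays from `average` and `difference`, a 2-D matrix from `dct`) are likely what the `TODO: consistency?` comment refers to:

```python
import cv2

from arcaea_offline_ocr.core import hashers  # path as used by the removed builder below

img_gray = cv2.imread("example.png", cv2.IMREAD_GRAYSCALE)  # placeholder path

ahash = hashers.average(img_gray, 16)
dhash = hashers.difference(img_gray, 16)
phash = hashers.dct(img_gray, 16, 4)

print(ahash.shape, dhash.shape, phash.shape)  # (256,) (256,) (16, 16)
```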
@@ -1,18 +0,0 @@
-from .builder import ImageHashesDatabaseBuilder
-from .index import ImageHashesDatabase, ImageHashesDatabasePropertyMissingError
-from .models import (
-    ImageHashBuildTask,
-    ImageHashHashType,
-    ImageHashResult,
-    ImageHashType,
-)
-
-__all__ = [
-    "ImageHashesDatabase",
-    "ImageHashesDatabasePropertyMissingError",
-    "ImageHashHashType",
-    "ImageHashResult",
-    "ImageHashType",
-    "ImageHashesDatabaseBuilder",
-    "ImageHashBuildTask",
-]
@@ -1,85 +0,0 @@
-import logging
-from datetime import datetime, timezone
-from sqlite3 import Connection
-from typing import List
-
-from arcaea_offline_ocr.core import hashers
-
-from .index import ImageHashesDatabase
-from .models import ImageHash, ImageHashBuildTask, ImageHashHashType
-
-logger = logging.getLogger(__name__)
-
-
-class ImageHashesDatabaseBuilder:
-    @staticmethod
-    def __insert_property(conn: Connection, key: str, value: str):
-        return conn.execute(
-            "INSERT INTO properties (key, value) VALUES (?, ?)",
-            (key, value),
-        )
-
-    @classmethod
-    def build(
-        cls,
-        conn: Connection,
-        tasks: List[ImageHashBuildTask],
-        *,
-        hash_size: int = 16,
-        high_freq_factor: int = 4,
-    ):
-        rows: List[ImageHash] = []
-
-        for task in tasks:
-            try:
-                img_gray = task.imread_function(task.image_path)
-
-                for hash_type, hash_mat in [
-                    (
-                        ImageHashHashType.AVERAGE,
-                        hashers.average(img_gray, hash_size),
-                    ),
-                    (
-                        ImageHashHashType.DCT,
-                        hashers.dct(img_gray, hash_size, high_freq_factor),
-                    ),
-                    (
-                        ImageHashHashType.DIFFERENCE,
-                        hashers.difference(img_gray, hash_size),
-                    ),
-                ]:
-                    rows.append(
-                        ImageHash(
-                            hash_type=hash_type,
-                            type=task.type,
-                            label=task.label,
-                            hash=ImageHashesDatabase.hash_mat_to_bytes(hash_mat),
-                        )
-                    )
-            except Exception:
-                logger.exception("Error processing task %r", task)
-
-        conn.execute("CREATE TABLE properties (`key` VARCHAR, `value` VARCHAR)")
-        conn.execute(
-            "CREATE TABLE hashes (`hash_type` INTEGER, `type` INTEGER, `label` VARCHAR, `hash` BLOB)"
-        )
-
-        now = datetime.now(tz=timezone.utc)
-        timestamp = int(now.timestamp() * 1000)
-
-        cls.__insert_property(conn, ImageHashesDatabase.KEY_HASH_SIZE, str(hash_size))
-        cls.__insert_property(
-            conn, ImageHashesDatabase.KEY_HIGH_FREQ_FACTOR, str(high_freq_factor)
-        )
-        cls.__insert_property(
-            conn, ImageHashesDatabase.KEY_BUILT_TIMESTAMP, str(timestamp)
-        )
-
-        conn.executemany(
-            "INSERT INTO hashes (hash_type, type, label, hash) VALUES (?, ?, ?, ?)",
-            [
-                (row.hash_type.value, row.type.value, row.label, row.hash)
-                for row in rows
-            ],
-        )
-        conn.commit()
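A sketch of how this builder appears to be driven, using an on-disk SQLite file. The class names come from the removed modules in this diff; the import path, image paths, and labels are hypothetical placeholders, since the real package path is not shown in this extract:

```python
import sqlite3

# Hypothetical import path; the names themselves come from the removed
# __init__.py / builder.py / models.py shown in this diff.
from arcaea_offline_ocr.image_hashes import (
    ImageHashBuildTask,
    ImageHashesDatabaseBuilder,
    ImageHashType,
)

tasks = [
    # Placeholder image paths and labels.
    ImageHashBuildTask("jackets/song_a.jpg", ImageHashType.JACKET, "song_a"),
    ImageHashBuildTask("partners/partner_1.png", ImageHashType.PARTNER_ICON, "1"),
]

with sqlite3.connect("image-hashes.db") as conn:
    # Creates the `properties` and `hashes` tables, stores hash_size /
    # high_freq_factor / built_timestamp, then inserts average, DCT and
    # difference hashes for every task.
    ImageHashesDatabaseBuilder.build(conn, tasks, hash_size=16, high_freq_factor=4)
```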
@@ -1,141 +0,0 @@
-import sqlite3
-from datetime import datetime, timezone
-from typing import Any, Callable, List, Optional, TypeVar
-
-from arcaea_offline_ocr.core import hashers
-from arcaea_offline_ocr.types import Mat
-
-from .models import ImageHashHashType, ImageHashResult, ImageHashType
-
-T = TypeVar("T")
-
-
-def _sql_hamming_distance(hash1: bytes, hash2: bytes):
-    assert len(hash1) == len(hash2), "hash size does not match!"
-    count = sum(1 for byte1, byte2 in zip(hash1, hash2) if byte1 != byte2)
-    return count
-
-
-class ImageHashesDatabasePropertyMissingError(Exception):
-    pass
-
-
-class ImageHashesDatabase:
-    KEY_HASH_SIZE = "hash_size"
-    KEY_HIGH_FREQ_FACTOR = "high_freq_factor"
-    KEY_BUILT_TIMESTAMP = "built_timestamp"
-
-    def __init__(self, conn: sqlite3.Connection):
-        self.conn = conn
-        self.conn.create_function("HAMMING_DISTANCE", 2, _sql_hamming_distance)
-
-        self._hash_size: int = -1
-        self._high_freq_factor: int = -1
-        self._built_time: Optional[datetime] = None
-
-        self._hashes_count = {
-            ImageHashType.JACKET: 0,
-            ImageHashType.PARTNER_ICON: 0,
-        }
-
-        self._hash_length: int = -1
-
-        self._initialize()
-
-    @property
-    def hash_size(self):
-        return self._hash_size
-
-    @property
-    def high_freq_factor(self):
-        return self._high_freq_factor
-
-    @property
-    def hash_length(self):
-        return self._hash_length
-
-    def _initialize(self):
-        def query_property(key, convert_func: Callable[[Any], T]) -> Optional[T]:
-            result = self.conn.execute(
-                "SELECT value FROM properties WHERE key = ?",
-                (key,),
-            ).fetchone()
-            return convert_func(result[0]) if result is not None else None
-
-        def set_hashes_count(type: ImageHashType):
-            self._hashes_count[type] = self.conn.execute(
-                "SELECT COUNT(DISTINCT label) FROM hashes WHERE type = ?", (type.value,)
-            ).fetchone()[0]
-
-        hash_size = query_property(self.KEY_HASH_SIZE, lambda x: int(x))
-        if hash_size is None:
-            raise ImageHashesDatabasePropertyMissingError("hash_size")
-        self._hash_size = hash_size
-
-        high_freq_factor = query_property(self.KEY_HIGH_FREQ_FACTOR, lambda x: int(x))
-        if high_freq_factor is None:
-            raise ImageHashesDatabasePropertyMissingError("high_freq_factor")
-        self._high_freq_factor = high_freq_factor
-
-        self._built_time = query_property(
-            self.KEY_BUILT_TIMESTAMP,
-            lambda ts: datetime.fromtimestamp(int(ts) / 1000, tz=timezone.utc),
-        )
-
-        set_hashes_count(ImageHashType.JACKET)
-        set_hashes_count(ImageHashType.PARTNER_ICON)
-
-        self._hash_length = self._hash_size**2
-
-    def lookup_hash(
-        self, type: ImageHashType, hash_type: ImageHashHashType, hash: bytes
-    ) -> List[ImageHashResult]:
-        cursor = self.conn.execute(
-            "SELECT"
-            " label,"
-            " HAMMING_DISTANCE(hash, ?) AS distance"
-            " FROM hashes"
-            " WHERE type = ? AND hash_type = ?"
-            " ORDER BY distance ASC LIMIT 10",
-            (hash, type.value, hash_type.value),
-        )
-
-        results = []
-        for label, distance in cursor.fetchall():
-            results.append(
-                ImageHashResult(
-                    hash_type=hash_type,
-                    type=type,
-                    label=label,
-                    confidence=(self.hash_length - distance) / self.hash_length,
-                )
-            )
-
-        return results
-
-    @staticmethod
-    def hash_mat_to_bytes(hash: Mat) -> bytes:
-        return bytes([255 if b else 0 for b in hash.flatten()])
-
-    def identify_image(self, type: ImageHashType, img) -> List[ImageHashResult]:
-        results = []
-
-        ahash = hashers.average(img, self.hash_size)
-        dhash = hashers.difference(img, self.hash_size)
-        phash = hashers.dct(img, self.hash_size, self.high_freq_factor)
-
-        results.extend(
-            self.lookup_hash(
-                type, ImageHashHashType.AVERAGE, self.hash_mat_to_bytes(ahash)
-            )
-        )
-        results.extend(
-            self.lookup_hash(
-                type, ImageHashHashType.DIFFERENCE, self.hash_mat_to_bytes(dhash)
-            )
-        )
-        results.extend(
-            self.lookup_hash(type, ImageHashHashType.DCT, self.hash_mat_to_bytes(phash))
-        )
-
-        return results
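And a matching lookup sketch against a database built as in the earlier example; the import path and the image path are again hypothetical placeholders:

```python
import sqlite3

import cv2

from arcaea_offline_ocr.image_hashes import ImageHashesDatabase, ImageHashType  # hypothetical path

conn = sqlite3.connect("image-hashes.db")
db = ImageHashesDatabase(conn)  # raises ImageHashesDatabasePropertyMissingError if the properties table is incomplete

img_gray = cv2.imread("jacket_crop.png", cv2.IMREAD_GRAYSCALE)  # placeholder path
results = db.identify_image(ImageHashType.JACKET, img_gray)

# Each hash type contributes up to 10 candidates; confidence is
# (hash_length - hamming_distance) / hash_length, with hash_length = hash_size ** 2.
best = max(results, key=lambda r: r.confidence)
print(best.label, best.hash_type, best.confidence)
```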
@@ -1,46 +0,0 @@
-import dataclasses
-from enum import IntEnum
-from typing import Callable
-
-import cv2
-
-from arcaea_offline_ocr.types import Mat
-
-
-class ImageHashHashType(IntEnum):
-    AVERAGE = 0
-    DIFFERENCE = 1
-    DCT = 2
-
-
-class ImageHashType(IntEnum):
-    JACKET = 0
-    PARTNER_ICON = 1
-
-
-@dataclasses.dataclass
-class ImageHash:
-    hash_type: ImageHashHashType
-    type: ImageHashType
-    label: str
-    hash: bytes
-
-
-@dataclasses.dataclass
-class ImageHashResult:
-    hash_type: ImageHashHashType
-    type: ImageHashType
-    label: str
-    confidence: float
-
-
-def _default_imread_gray(image_path: str):
-    return cv2.cvtColor(cv2.imread(image_path, cv2.IMREAD_COLOR), cv2.COLOR_BGR2GRAY)
-
-
-@dataclasses.dataclass
-class ImageHashBuildTask:
-    image_path: str
-    type: ImageHashType
-    label: str
-    imread_function: Callable[[str], Mat] = _default_imread_gray
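The `imread_function` field is the hook for custom preprocessing at build time; by default images are loaded and converted to grayscale. A sketch with a hypothetical center-crop reader (the import path, file name, and label are placeholders):

```python
import cv2

from arcaea_offline_ocr.image_hashes import ImageHashBuildTask, ImageHashType  # hypothetical path


def imread_gray_center_crop(image_path: str):
    """Hypothetical reader: grayscale, then crop to a centered square before hashing."""
    img = cv2.cvtColor(cv2.imread(image_path, cv2.IMREAD_COLOR), cv2.COLOR_BGR2GRAY)
    h, w = img.shape[:2]
    size = min(h, w)
    top, left = (h - size) // 2, (w - size) // 2
    return img[top : top + size, left : left + size]


task = ImageHashBuildTask(
    image_path="jackets/song_b.jpg",  # placeholder
    type=ImageHashType.JACKET,
    label="song_b",
    imread_function=imread_gray_center_crop,
)
```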
@@ -110,7 +110,7 @@ class DeviceOcr:
     @staticmethod
     def preprocess_char_icon(img_gray: Mat):
         h, w = img_gray.shape[:2]
-        img = cv2.copyMakeBorder(img_gray, max(w - h, 0), 0, 0, 0, cv2.BORDER_REPLICATE)
+        img = cv2.copyMakeBorder(img_gray, w - h, 0, 0, 0, cv2.BORDER_REPLICATE)
         h, w = img.shape[:2]
         img = cv2.fillPoly(
             img,
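A small sketch of what this one-line change means in practice: with a crop that is taller than wide, `w - h` is negative, and as far as I can tell OpenCV rejects negative border sizes, whereas `max(w - h, 0)` simply pads nothing. The input array below is a synthetic placeholder:

```python
import cv2
import numpy as np

img_gray = np.zeros((120, 100), dtype=np.uint8)  # placeholder crop, h=120 > w=100
h, w = img_gray.shape[:2]

# Guarded version: max(w - h, 0) == 0, so no top border is added.
padded = cv2.copyMakeBorder(img_gray, max(w - h, 0), 0, 0, 0, cv2.BORDER_REPLICATE)
print(padded.shape)  # (120, 100)

# Unguarded version: a negative top border raises cv2.error.
try:
    cv2.copyMakeBorder(img_gray, w - h, 0, 0, 0, cv2.BORDER_REPLICATE)
except cv2.error:
    print("negative border size rejected")
```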