wip: searcher

This commit is contained in:
283375 2023-08-28 20:28:40 +08:00
parent b180976284
commit 316b02cd1b
Signed by: 283375
SSH Key Fingerprint: SHA256:UcX0qg6ZOSDOeieKPGokA5h7soykG61nz2uxuQgVLSk
4 changed files with 119 additions and 0 deletions

View File

@ -13,6 +13,7 @@ dependencies = [
"beautifulsoup4==4.12.2",
"SQLAlchemy==2.0.20",
"SQLAlchemy-Utils==0.41.1",
"Whoosh==2.7.4",
]
classifiers = [
"Development Status :: 3 - Alpha",

View File

@ -1,3 +1,4 @@
beautifulsoup4==4.12.2
SQLAlchemy==2.0.20
SQLAlchemy-Utils==0.41.1
Whoosh==2.7.4

View File

@ -0,0 +1,111 @@
from typing import List, Union
from sqlalchemy import select
from sqlalchemy.orm import Session
from whoosh.analysis import LowercaseFilter, RegexTokenizer
from whoosh.fields import ID, KEYWORD, TEXT, Schema
from whoosh.filedb.filestore import RamStorage
from whoosh.qparser import FuzzyTermPlugin, MultifieldParser, OrGroup
from .models.songs import Song, SongLocalized
from .utils.search_title import recover_search_title
class Searcher:
    """Full-text song search backed by an in-memory Whoosh index.

    The index is created in a ``RamStorage`` and starts out empty; call
    :meth:`import_songs` to fill it from the database before searching.
    """

    # Schema fields consulted by ``did_you_mean`` for spelling suggestions.
    _SUGGEST_FIELDS = ("keywords", "song_id", "title", "artist", "source")

    def __init__(self):
        # Plain regex tokenizing plus lowercasing, shared by all text fields.
        self.text_analyzer = RegexTokenizer() | LowercaseFilter()
        self.song_schema = Schema(
            song_id=ID(stored=True, unique=True),
            title=TEXT(analyzer=self.text_analyzer, spelling=True),
            artist=TEXT(analyzer=self.text_analyzer, spelling=True),
            source=TEXT(analyzer=self.text_analyzer, spelling=True),
            # Catch-all field: song id + every title/artist/source variant.
            keywords=KEYWORD(lowercase=True, stored=True, scorable=True),
        )
        self.storage = RamStorage()
        self.index = self.storage.create_index(self.song_schema)

        # Queries are matched against every field and OR-ed together.
        self.default_query_parser = MultifieldParser(
            ["song_id", "title", "artist", "source", "keywords"],
            self.song_schema,
            group=OrGroup,
        )
        self.default_query_parser.add_plugin(FuzzyTermPlugin())

    @staticmethod
    def _non_empty(values):
        """Return ``values`` without ``None`` and empty-string entries."""
        return [value for value in values if value is not None and value != ""]

    def import_songs(self, session: Session) -> None:
        """Index every ``Song`` found through ``session``.

        Each song becomes one document keyed by its id (an existing document
        with the same id is replaced). When a ``SongLocalized`` row with the
        same id exists, its localized titles, search titles, search artists
        and sources are indexed alongside the base values.

        Args:
            session: SQLAlchemy session used to read ``Song`` and
                ``SongLocalized`` rows.
        """
        songs = list(session.scalars(select(Song)))

        # Load all localized rows in one query instead of one query per song.
        # ``setdefault`` keeps the first row seen for an id.
        localized_by_id = {}
        for localized in session.scalars(select(SongLocalized)):
            localized_by_id.setdefault(localized.id, localized)

        # The context manager commits on success and cancels the writer on
        # error, so a failed import never leaves the index write lock held.
        with self.index.writer() as writer:
            for song in songs:
                song_id = song.id
                sl = localized_by_id.get(song_id)

                possible_titles: List[Union[str, None]] = [song.title]
                possible_artists: List[Union[str, None]] = [song.artist]
                possible_sources: List[Union[str, None]] = [song.source]

                if sl:
                    possible_titles.extend(
                        [sl.title_ja, sl.title_ko, sl.title_zh_hans, sl.title_zh_hant]
                    )
                    possible_titles.extend(
                        recover_search_title(sl.search_title_ja)
                        + recover_search_title(sl.search_title_ko)
                        + recover_search_title(sl.search_title_zh_hans)
                        + recover_search_title(sl.search_title_zh_hant)
                    )
                    possible_artists.extend(
                        recover_search_title(sl.search_artist_ja)
                        + recover_search_title(sl.search_artist_ko)
                        + recover_search_title(sl.search_artist_zh_hans)
                        + recover_search_title(sl.search_artist_zh_hant)
                    )
                    possible_sources.extend(
                        [
                            sl.source_ja,
                            sl.source_ko,
                            sl.source_zh_hans,
                            sl.source_zh_hant,
                        ]
                    )

                # Missing translations show up as None or "" - drop them.
                titles = self._non_empty(possible_titles)
                artists = self._non_empty(possible_artists)
                sources = self._non_empty(possible_sources)

                writer.update_document(
                    song_id=song_id,
                    title=" ".join(titles),
                    artist=" ".join(artists),
                    source=" ".join(sources),
                    keywords=" ".join([song_id] + titles + artists + sources),
                )

    def did_you_mean(self, string: str) -> List[str]:
        """Return spelling suggestions for ``string`` from all indexed fields.

        Args:
            string: The (possibly misspelled) word to look up.

        Returns:
            The de-duplicated suggestions, in no particular order. ``string``
            itself is never part of the result.
        """
        results = set()
        with self.index.searcher() as searcher:
            for field_name in self._SUGGEST_FIELDS:
                corrector = searcher.corrector(field_name)  # type: ignore
                results.update(corrector.suggest(string))

        # Suggesting the input back to the caller would be pointless.
        results.discard(string)
        return list(results)

    def search(self, string: str, *, limit: int = 10, fuzzy_distance: int = 10):
        """Run ``string`` through the default query parser and search the index.

        Args:
            string: Query text; fuzzy terms can be written with the ``~``
                syntax enabled by ``FuzzyTermPlugin``.
            limit: Maximum number of hits to return.
            fuzzy_distance: Currently unused; kept so existing callers that
                pass it keep working.

        Returns:
            A list of Whoosh hits, best match first.
        """
        query = self.default_query_parser.parse(string)
        with self.index.searcher() as searcher:
            results = list(searcher.search(query, limit=limit))
            # Hits fetch their stored fields lazily from the searcher; read
            # them now, while it is still open, so the returned hits remain
            # usable after the ``with`` block has closed it.
            for hit in results:
                hit.fields()
        return results

View File

@ -0,0 +1,6 @@
import json
from typing import List, Optional
def recover_search_title(db_value: Optional[str]) -> List[str]:
    """Decode a JSON-encoded search-title column into a list of strings.

    Args:
        db_value: Raw column value; expected to hold a JSON array of strings,
            or to be ``None``/empty when nothing is stored.

    Returns:
        The decoded strings. ``None``, an empty value and a JSON ``null``
        yield an empty list; a bare JSON string yields a one-element list;
        non-string array items are dropped so the result always matches the
        annotated type and can be concatenated with other lists.

    Raises:
        json.JSONDecodeError: If ``db_value`` is non-empty but not valid JSON.
    """
    if not db_value:
        return []

    decoded = json.loads(db_value)
    if isinstance(decoded, str):
        return [decoded]
    if isinstance(decoded, list):
        return [item for item in decoded if isinstance(item, str)]
    # JSON null, numbers, booleans and objects carry no usable titles.
    return []