mirror of
https://github.com/283375/arcaea-offline.git
synced 2025-04-07 00:20:17 +00:00
wip: searcher
This commit is contained in:
parent
b180976284
commit
316b02cd1b
@ -13,6 +13,7 @@ dependencies = [
|
||||
"beautifulsoup4==4.12.2",
|
||||
"SQLAlchemy==2.0.20",
|
||||
"SQLAlchemy-Utils==0.41.1",
|
||||
"Whoosh==2.7.4",
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 3 - Alpha",
|
||||
|
@ -1,3 +1,4 @@
|
||||
beautifulsoup4==4.12.2
|
||||
SQLAlchemy==2.0.20
|
||||
SQLAlchemy-Utils==0.41.1
|
||||
Whoosh==2.7.4
|
||||
|
111
src/arcaea_offline/searcher.py
Normal file
111
src/arcaea_offline/searcher.py
Normal file
@ -0,0 +1,111 @@
|
||||
from typing import List, Union
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
from whoosh.analysis import LowercaseFilter, RegexTokenizer
|
||||
from whoosh.fields import ID, KEYWORD, TEXT, Schema
|
||||
from whoosh.filedb.filestore import RamStorage
|
||||
from whoosh.qparser import FuzzyTermPlugin, MultifieldParser, OrGroup
|
||||
|
||||
from .models.songs import Song, SongLocalized
|
||||
from .utils.search_title import recover_search_title
|
||||
|
||||
|
||||
class Searcher:
    """In-memory Whoosh index used to look songs up by free text.

    Each song becomes one document keyed by ``song_id``. The ``title``,
    ``artist`` and ``source`` fields hold the base value plus its localized
    variants joined by spaces; ``keywords`` holds the id and all of those
    values together. Only ``song_id`` and ``keywords`` are stored, so they
    are the only values a search hit carries.

    The index lives in a ``RamStorage`` and starts out empty; call
    :meth:`import_songs` to populate it.
    """

    # Fields a query is matched against and that spelling suggestions are
    # drawn from.
    _FIELD_NAMES = ("song_id", "title", "artist", "source", "keywords")

    def __init__(self):
        """Create the schema, the empty in-memory index and the query parser."""
        # Tokenize with RegexTokenizer's default pattern, then lowercase so
        # that matching is case-insensitive.
        self.text_analyzer = RegexTokenizer() | LowercaseFilter()
        self.song_schema = Schema(
            song_id=ID(stored=True, unique=True),
            title=TEXT(analyzer=self.text_analyzer, spelling=True),
            artist=TEXT(analyzer=self.text_analyzer, spelling=True),
            source=TEXT(analyzer=self.text_analyzer, spelling=True),
            keywords=KEYWORD(lowercase=True, stored=True, scorable=True),
        )
        self.storage = RamStorage()
        self.index = self.storage.create_index(self.song_schema)

        # OrGroup: a document matches as soon as any query term matches.
        self.default_query_parser = MultifieldParser(
            list(self._FIELD_NAMES),
            self.song_schema,
            group=OrGroup,
        )
        # Enables the ``term~`` fuzzy syntax inside query strings.
        self.default_query_parser.add_plugin(FuzzyTermPlugin())

    @staticmethod
    def _non_empty(values: List[Union[str, None]]) -> List[str]:
        """Return *values* without ``None`` and ``""`` entries, order kept."""
        return [value for value in values if value is not None and value != ""]

    def import_songs(self, session: "Session"):
        """Index every ``Song`` reachable through *session*.

        Documents are written with ``update_document`` on the unique
        ``song_id`` field, so importing again replaces existing entries
        instead of duplicating them.

        Args:
            session: SQLAlchemy session used to read ``Song`` and
                ``SongLocalized`` rows.
        """
        songs = list(session.scalars(select(Song)))

        # Fetch all localization rows with a single query instead of one
        # query per song. ``setdefault`` keeps the first row seen for an id.
        localized_by_id = {}
        for localized in session.scalars(select(SongLocalized)):
            localized_by_id.setdefault(localized.id, localized)

        # The writer's context manager commits on success and cancels on
        # error, so the writer is never left open when a row fails to index.
        with self.index.writer() as writer:
            for song in songs:
                sl = localized_by_id.get(song.id)
                song_id = song.id
                possible_titles: List[Union[str, None]] = [song.title]
                possible_artists: List[Union[str, None]] = [song.artist]
                possible_sources: List[Union[str, None]] = [song.source]

                if sl:
                    possible_titles.extend(
                        [sl.title_ja, sl.title_ko, sl.title_zh_hans, sl.title_zh_hant]
                    )
                    possible_titles.extend(
                        recover_search_title(sl.search_title_ja)
                        + recover_search_title(sl.search_title_ko)
                        + recover_search_title(sl.search_title_zh_hans)
                        + recover_search_title(sl.search_title_zh_hant)
                    )
                    possible_artists.extend(
                        recover_search_title(sl.search_artist_ja)
                        + recover_search_title(sl.search_artist_ko)
                        + recover_search_title(sl.search_artist_zh_hans)
                        + recover_search_title(sl.search_artist_zh_hant)
                    )
                    possible_sources.extend(
                        [
                            sl.source_ja,
                            sl.source_ko,
                            sl.source_zh_hans,
                            sl.source_zh_hant,
                        ]
                    )

                titles = self._non_empty(possible_titles)
                artists = self._non_empty(possible_artists)
                sources = self._non_empty(possible_sources)

                writer.update_document(
                    song_id=song_id,
                    title=" ".join(titles),
                    artist=" ".join(artists),
                    source=" ".join(sources),
                    keywords=" ".join([song_id] + titles + artists + sources),
                )

    def did_you_mean(self, string: str) -> List[str]:
        """Return spelling suggestions for *string* drawn from every field.

        Args:
            string: The word to find corrections for.

        Returns:
            The distinct suggestions from all indexed fields, in no
            particular order, never including *string* itself.
        """
        suggestions = set()

        with self.index.searcher() as searcher:
            for field_name in self._FIELD_NAMES:
                corrector = searcher.corrector(field_name)  # type: ignore
                suggestions.update(corrector.suggest(string))

        suggestions.discard(string)
        return list(suggestions)

    def search(self, string: str, *, limit: int = 10, fuzzy_distance: int = 10):
        """Run *string* through the default query parser and return the hits.

        Args:
            string: Query text; the parser's query syntax applies.
            limit: Maximum number of hits to return.
            fuzzy_distance: Accepted but not applied to the query yet.

        Returns:
            A list of the hits produced by the Whoosh searcher.
        """
        query = self.default_query_parser.parse(string)
        with self.index.searcher() as searcher:
            hits = list(searcher.search(query, limit=limit))
        # NOTE(review): the hits are handed back after the searcher has been
        # closed; reading their stored fields may require an open searcher.
        # Confirm against the Whoosh docs before relying on it.
        return hits
|
6
src/arcaea_offline/utils/search_title.py
Normal file
6
src/arcaea_offline/utils/search_title.py
Normal file
@ -0,0 +1,6 @@
|
||||
import json
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
def recover_search_title(db_value: Optional[str]) -> List[str]:
    """Decode a JSON-encoded list of search strings read from the database.

    Args:
        db_value: Raw column value; ``None`` or ``""`` means "no entries".

    Returns:
        The decoded list. A JSON ``null`` yields ``[]`` and a bare JSON
        string is wrapped into a one-item list, so callers can always
        concatenate or ``extend`` with the result.

    Raises:
        ValueError: If *db_value* is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``) or decodes to
            something other than a list, a string or ``null``.
    """
    if not db_value:
        return []

    decoded = json.loads(db_value)
    if isinstance(decoded, list):
        return decoded
    if decoded is None:
        return []
    if isinstance(decoded, str):
        # Without wrapping, ``list.extend`` on the caller's side would add
        # the string character by character.
        return [decoded]
    raise ValueError(
        f"expected a JSON list of strings, got {type(decoded).__name__}"
    )
|
Loading…
x
Reference in New Issue
Block a user