6
6
import re
7
7
import time
8
8
from dataclasses import dataclass
9
- from typing import TYPE_CHECKING , Dict , Iterable , Iterator , List , Optional , Tuple , Any
9
+ from typing import TYPE_CHECKING , Any , Dict , Iterable , Iterator , List , Optional , Tuple
10
10
11
11
import pystow
12
12
from linkml_runtime .dumpers import yaml_dumper
@@ -236,7 +236,9 @@ def __post_init__(self):
236
236
def _embeddings_collection_name (self ) -> str :
237
237
name = self .wrapped_adapter .resource .slug
238
238
if not name :
239
- raise ValueError (f"Wrapped adapter must have a slug: { self .wrapped_adapter } // { self .wrapped_adapter .resource } " )
239
+ raise ValueError (
240
+ f"Wrapped adapter must have a slug: { self .wrapped_adapter } // { self .wrapped_adapter .resource } "
241
+ )
240
242
return name
241
243
242
244
def entities (self , ** kwargs ) -> Iterator [CURIE ]:
@@ -281,7 +283,6 @@ def _parse_response(self, json_str: str) -> Any:
281
283
json_str = json_str [4 :].strip ()
282
284
return json .loads (json_str )
283
285
284
-
285
286
def get_model (self ):
286
287
model = self .model
287
288
if not self .model :
@@ -297,6 +298,7 @@ def get_model(self):
297
298
def _embed_terms (self ):
298
299
import llm
299
300
import sqlite_utils
301
+
300
302
adapter = self .wrapped_adapter
301
303
name = self ._embeddings_collection_name
302
304
path_to_db = pystow .join ("oaklib" , "llm" , "embeddings" )
@@ -308,14 +310,13 @@ def _embed_terms(self):
308
310
309
311
def _term_embedding (self , id : CURIE ) -> Optional [tuple ]:
310
312
import llm
313
+
311
314
db = self ._embeddings_collection .db
312
315
name = self ._embeddings_collection_name
313
316
collection_ids = list (db ["collections" ].rows_where ("name = ?" , (name ,)))
314
317
collection_id = collection_ids [0 ]["id" ]
315
318
matches = list (
316
- db ["embeddings" ].rows_where (
317
- "collection_id = ? and id = ?" , (collection_id , id )
318
- )
319
+ db ["embeddings" ].rows_where ("collection_id = ? and id = ?" , (collection_id , id ))
319
320
)
320
321
if not matches :
321
322
logger .debug (f"ID not found: { id } in { collection_id } ({ name } )" )
@@ -324,18 +325,18 @@ def _term_embedding(self, id: CURIE) -> Optional[tuple]:
324
325
comparison_vector = llm .decode (embedding )
325
326
return comparison_vector
326
327
327
-
328
328
def pairwise_similarity (
329
- self ,
330
- subject : CURIE ,
331
- object : CURIE ,
332
- predicates : List [PRED_CURIE ] = None ,
333
- subject_ancestors : List [CURIE ] = None ,
334
- object_ancestors : List [CURIE ] = None ,
335
- min_jaccard_similarity : Optional [float ] = None ,
336
- min_ancestor_information_content : Optional [float ] = None ,
329
+ self ,
330
+ subject : CURIE ,
331
+ object : CURIE ,
332
+ predicates : List [PRED_CURIE ] = None ,
333
+ subject_ancestors : List [CURIE ] = None ,
334
+ object_ancestors : List [CURIE ] = None ,
335
+ min_jaccard_similarity : Optional [float ] = None ,
336
+ min_ancestor_information_content : Optional [float ] = None ,
337
337
) -> Optional [TermPairwiseSimilarity ]:
338
338
import llm
339
+
339
340
self ._embed_terms ()
340
341
subject_embedding = self ._term_embedding (subject )
341
342
if not subject_embedding :
@@ -351,7 +352,9 @@ def pairwise_similarity(
351
352
)
352
353
return sim
353
354
354
- def _ground_term (self , term : str , categories : Optional [List [str ]] = None ) -> Optional [Tuple [str , float ]]:
355
+ def _ground_term (
356
+ self , term : str , categories : Optional [List [str ]] = None
357
+ ) -> Optional [Tuple [str , float ]]:
355
358
matches = list (self ._match_terms (term ))
356
359
system = """
357
360
Given a list of ontology terms, find the one that best matches the given term.
@@ -361,7 +364,7 @@ def _ground_term(self, term: str, categories: Optional[List[str]] = None) -> Opt
361
364
- ANAT:002 pericardium
362
365
Then a valid response is {"id": "ANAT:001", "confidence": 0.8}.
363
366
"""
364
- prompt = f" Find the best match for the term: \ "{ term } \ " .\n "
367
+ prompt = f' Find the best match for the term: "{ term } ".\n '
365
368
if categories :
366
369
if len (categories ) == 1 :
367
370
prompt += f"Term Category: { categories [0 ]} .\n "
@@ -401,7 +404,11 @@ def annotate_text(
401
404
grounded , _confidence = self ._ground_term (text , configuration .categories )
402
405
logger .info (f"Grounded { text } to { grounded } " )
403
406
if grounded :
404
- yield TextAnnotation (subject_label = text , object_id = grounded , object_label = self .wrapped_adapter .label (grounded ))
407
+ yield TextAnnotation (
408
+ subject_label = text ,
409
+ object_id = grounded ,
410
+ object_label = self .wrapped_adapter .label (grounded ),
411
+ )
405
412
return
406
413
else :
407
414
logging .info ("Delegating directly to grounder, bypassing LLM" )
@@ -495,9 +502,6 @@ def _match_terms(self, text: str) -> Iterator[Tuple[str, float]]:
495
502
logger .debug (f"Similar: { entry } " )
496
503
yield entry .id , entry .score
497
504
498
-
499
-
500
-
501
505
def _suggest_aliases (
502
506
self ,
503
507
term : str ,
0 commit comments