import os #os.environ["CUDA_VISIBLE_DEVICES"] = "1,6" # to use the GPUs 3,4 only #os.environ["HF_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub" #os.environ["HUGGINGFACE_HUB_CACHE"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub" #os.environ["HF_HOME"] = "/eos/jeodpp/home/users/consose/cache/huggingface/hub" from transformers import file_utils print(file_utils.default_cache_path) import pandas as pd from tqdm import tqdm from gliner import GLiNER import logging from jinja2 import Template from collections import Counter from transformers import pipeline, AutoTokenizer #os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512" os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' #import html import torch torch.cuda.empty_cache() # Clear cache ot torch device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f"Device: {device}...") if device.type == "cuda": print("GPU number:", torch.cuda.current_device()) import datasets import argparse import json import random import numpy as np import tiktoken from langchain.text_splitter import TokenTextSplitter import gradio as gr import re from common import strtobool, token_counter, encoding_getter, strip_quotes from nerBio import annotate, entitiesFusion, is_cross_inside, elinking from llmqueryNer import call_model, call_model_with_caching, process_list, setup_gptjrc, api_call_gptjrc, model_list_gptjrc from joblib import Memory cachedir = 'cached' mem = Memory(cachedir, verbose=False) # this is to completely delete the cache: # mem.clear(warn=False) examples = [ ["He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. " , None], ["He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. ", None], ["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. ", None], ["The Health Ministry has detected about 4000 suspected chikungunya cases nationwide this year [2008], Minister Datuk Liow Tiong Lai said Wednesday [17 Dec 2008]. He said the disease which was 1st detected in Johor had spread to Negeri Sembilan, Melaka, Perak, Selangor and the latest Kedah. \"So far, the chikungunya disease is still under control nationwide,\" he told reporters after visiting Sultanah Nur Zahirah Hospital here. Present was Terengganu Health Director Dr. Nordiyanah Hassan. Liow said that so far, there is no specific medicine to treat the chikungunya fever disease spread by _Aedes_ mosquito. \"So, I would like to call on the public to be careful particularly during the wet season now because _Aedes_ mosquito is easy to breed,\" he said. To contain the spread of the disease, he said, the ministry had taken several measures including intensifying the campaign to rid of _Aedes_ mosquito and holding lectures on the outbreak. He said the disease was 1st detected to have spread to Malaysia from Africa in 1997. 
Meanwhile, he said 63 health projects costing RM458 million [USD 131 230 211] had been approved for implementation in Terengganu under the Ninth Malaysia Plan and some had started.", None], ["Carcinoma", None], ["The doctor diagnosed the patient with basal cell carcinoma, a common type of skin cancer.", None], ["West Nile virus", None], ["Legionellosis", None], ["Eight years ago I started with Fosamax for 3-4 years and then took Actonel. In March, I decided not to take Actonel any longer. I had been on it for too long and was fearful of esophageal cancer and bone breakage. Now my doctor wants me to take the Prolia injections, which I am not going to do. I am not going to continue with any drugs. My bone density recently done was in the minuses. I do work with a personal trainer and execise daily. I am searching for alternative ways to deal with this problem.", None], ["Does Chicago have any stores and does Joe live here?", None], ["Cholera has been reported every week since November 1994. By 5 November 1995 at total of 12,344 with 245 deaths have been notified. Of these, 879 cases with 4 deaths were reported for the period 9 October to 5 November 1995. Control efforts have not succeeded in preventing the spread of the epidemic and when cases were detected on Sao Nicolau and Sal Islands in the period 9 October to 5 November all nine inhabited islands of Cap Verde had become infected. The last cholera epidemic in Cap Verde occurred in 1979. (See also Weekly Epidemiological Record No. 44, 3 November 1995) Côte d'Ivoire: A cholera outbreak which started in September 1995 caused 2,027 cases and 150 deaths up to 12 November 1995. The first cases were reported in Department de l'Ouest on 18 September 1995. Cases were subsequently reported in Department de Nord and most recently in Department du Centre and Department de Sud. The WHO Representative assisted in the organization of a team to visit the area and evaluate the situation as well as arranging for medical supplies. (1.12.95) Iran, Islamic Republic of,: Kordestan Province has been declared free of cholera. (1.12.95) Iraq: An outbreak of cholera reported from Sulaimaniyah Governorate in Northern Iraq has resulted in 519 cases, 264 of which have been confirmed, and 3 deaths to date. Vibrio cholerae O1 serotype Ogawa has been isolated. At the request of the Iraqi Ministry of Health, a WHO consultant has been sent to the area to assess and monitor the situation, provide guidance to the health authorities, and coordinate inputs by non-governmental organizations. WHO has also made available essential treatment supplies. An intensive media campaign to raise public awareness about essential preventive measures has been successful in containing the spread of the outbreak. (1.12.95) Senegal: Despite the fact that cholera has been endemic in countries bordering Senegal for the past two years, no cases were reported from Senegal until mid- August 1995. Between 15 August and 17 November 1995, 852 case and 43 deaths were notified. A further 731 cases with 37 deaths have been reported for the period 1 September to 12 November. Most cases were in the Departments of Dakar and Pikine in the Dakar Region and recently also Departments of Mbacke and Touba in Diourbel Region. 
", None], ] models_List = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5" #models_List = ["Babelscape/wikineural-multilingual-ner", "urchade/gliner_large-v2.1", "NCBO/BioPortal" ] # "urchade/gliner_large-v2.1", "knowledgator/gliner-multitask-large-v0.5" #models_List = ["NCBO/BioPortal" ] #categories_List = ["MED","LOC","PER","ORG","DATE","MISC"] categories_List = ["MED","LOC","PER","ORG","DATE","MISC", "CONC", "BIOP", "ACTI", "ANAT", "CHEM", "DEVI", "DISO", "GENE", "GEOG", "LIVB", "OBJC", "OCCU", "ORGA", "PHEN", "PHYS" , "PROC"] POSSIBLE_KGchoices_List = ["AI", "AIO", "AEO", "BFO", "BIM", "BCGO", "CL", "CHIRO", "CHEBI", "DCM", "FMA", "GO", "GENO", "GeoSPARQL", "HL7", "DOID", "HP", "HP_O", "IDO", "IAO", "ICD10", "LOINC", "MESH", "MONDO", "NCIT", "NCBITAXON", "NCBITaxon_", "NIFCELL", "NIFSTD", "GML", "OBCS", "OCHV", "OHPI", "OPB", "TRANS", "PLOSTHES", "RADLEX", "RO", "STY", "SO", "SNOMED", "STATO", "SYMP", "FoodOn", "UBERON", "ORDO", "HOOM", "VO", "OGMS", "EuroSciVoc"] modelGliner=None modelGlinerBio=None num_cores_Gliner_forDemo = 0 # 0 means use the GPU for Gliner ! tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large') encod = encoding_getter('microsoft/deberta-v3-large') text_splitter = TokenTextSplitter( # separators=separators, encoding_name=encod.name, chunk_size=80000, chunk_overlap=50, length_function=len, add_start_index=True, ) pipe_dict = {} for modelName in models_List: tsk = "token-classification" if (("/gliner" in modelName) == False) and (("NCBO" in modelName) == False): pipe = pipeline( tsk, model=modelName, aggregation_strategy="simple", device=device, ) pipe_dict[modelName] = pipe elif ("/gliner" in modelName): if not tokenizerGliner: tokenizerGliner = AutoTokenizer.from_pretrained('microsoft/deberta-v3-large') if "_bio-" in modelName: if num_cores_Gliner_forDemo > 0: modelGlinerBio = GLiNER.from_pretrained(modelName) # "urchade/gliner_large_bio-v0.1") else: modelGlinerBio = GLiNER.from_pretrained(modelName, map_location=device) else: if num_cores_Gliner_forDemo > 0: modelGliner = GLiNER.from_pretrained( modelName) # "knowledgator/gliner-multitask-large-v0.5" - "urchade/gliner_large-v2.1" else: modelGliner = GLiNER.from_pretrained(modelName, map_location=device) #### GPT@JRC API #if args.service_provider == "gptjrc": key_gptjrc = "" fkeyname = "GPTJRC-APItoken.key" if os.path.exists(fkeyname): with open(fkeyname) as f: key_gptjrc = f.read() else: key_gptjrc = os.environ['key_gptjrc'] if key_gptjrc and key_gptjrc != "": setup_gptjrc(key_gptjrc) ##### # Add this function to handle dropdown selection def get_urls(word, df_annotated_combined): # Filter the DataFrame to get rows where 'ALLURIScontext' is not empty or None #valid_entries = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])] valid_entries = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [] and ( isinstance(x, list) and (isinstance(x, list) and len(x) > 0) and (not (len(x) == 1 and not str(x[0]).strip()))))] # Check if the word is in the filtered DataFrame if word in valid_entries['word'].values: urls = valid_entries.loc[valid_entries['word'] == word, 'ALLURIScontext'].values[0] if 'namedEntity' in df_annotated_combined.columns: firsturlinlist = 
                df_annotated_combined.loc[df_annotated_combined['word'] == word, 'namedEntity']
            firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
            if firsturlinlist and firsturlinlist in urls:
                # Remove the URL from its current position
                urls.remove(firsturlinlist)
                # Insert the URL at the first position
                urls.insert(0, firsturlinlist)

        # Convert the list of URLs to an HTML string of clickable links
        # (the anchor markup below is a minimal reconstruction of the stripped HTML)
        # html_links = "<br>".join([f'{url}' for url in urls])
        html_links = "<br>".join([f'<a href="{url}" target="_blank">{url}</a>' for url in urls])
        return html_links
    return ""


###@mem.cache
def nerBio(text, ModelsSelection, CategoriesSelection, ScoreFilt, EntityLinking, KGchoices, state: dict):
    if EntityLinking:
        EnableNEL = "True"
    else:
        EnableNEL = "False"

    if not text:
        # Wrap the (empty) text for the gr.HTML output; the original wrapper markup was
        # stripped, so a plain <div> is used here.
        html_output = f"<div>{text}</div>"
        state = {
            "text": "",
            "df_annotated_dict": dict(),
            "df_annotated_combined_dict": dict(),
            "KGchoices": KGchoices,
            "ModelsSelection": ModelsSelection,
            "ScoreFilt": ScoreFilt,
            "EntityLinking": EntityLinking,
            "html_output": html_output
        }
        return {"text": text, "entities": []}, html_output, state, [], ""

    df_annotated = pd.DataFrame()

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, default=models_List[0], help="model to use")
    parser.add_argument("--debug", type=str, default="True", help="set debug mode")
    parser.add_argument("--source_column", type=str, default="ContextToAnnotate")
    parser.add_argument("--entities_filter_threshold", type=float, default=ScoreFilt)
    parser.add_argument("--SEED", type=int, default=41)
    parser.add_argument("--batch_size", type=int, default=32)  # 4 - 8 - 16
    parser.add_argument("--num_cores_Gliner", type=int, default=num_cores_Gliner_forDemo,
                        help="parallel processing for GLiNER annotation")  # 0 means use the GPU for GLiNER!
    parser.add_argument("--entity_linking", type=str, default=EnableNEL, help="whether to perform entity linking or not")
    parser.add_argument("--geonameskey_filename", type=str, default="GEONAMES-API.key",
                        help="file where the GeoNames API key is stored")
    parser.add_argument("--virtuosokey_filename", type=str, default="VIRTUOSO-dba.key",
                        help="file where the Virtuoso endpoint dba password is stored")
    parser.add_argument("--bioportalkey_filename", type=str, default="NCBO-BioPortal.key",
                        help="file where the NCBO BioPortal API key is stored")

    # consose 20250205:
    # KGchoices = None
    # KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'NCIT']
    # KGchoices = ['SNOMED', 'LOINC', 'ICD10', 'MESH', 'NCIT']  # restricts the input to these values only
    if KGchoices:
        KGchoices.sort()
    parser.add_argument("--KG_restriction", nargs='+', choices=KGchoices, default=KGchoices,
                        help="list of ontologies to which the entity linking task is restricted")

    # consose 20250502:
    if Counter(KGchoices) == Counter(POSSIBLE_KGchoices_List):
        parser.add_argument("--USE_CACHE", type=str, default="True",
                            help="whether to use a cache for the NER and NEL tasks or not")
    else:
        # print("Lists do not have the same elements")
        parser.add_argument("--USE_CACHE", type=str, default="False",
                            help="whether to use a cache for the NER and NEL tasks or not")

    parser.add_argument("--num_cores_eLinking", type=int, default=10, help="parallel processing for the entity linking process")
    parser.add_argument("--computeEntityContext", type=str, default="False",
                        help="whether to extract a readable context from the extracted triples for the concept")
    parser.add_argument("--computeEntityGlobalContext", type=str, default="False",
                        help="whether to extract a readable context from the extracted triples of all the entities extracted from the endpoint for the concept")
    parser.add_argument("--maxTriplesGlobalContext", type=int, default=20000,
                        help="maximum number of triples to consider for global context computation")  # if 0 or None it is not considered
    parser.add_argument("--UseRetrieverForContextCreation", type=str, default="True",
                        help="whether to use a retriever for the creation of the context of the entities from the triples coming from the KGs")
    parser.add_argument("--service_provider", type=str, default="gptjrc", help="llm service provider")
    parser.add_argument("--model_name", type=str, default="llama-3.1-70b-instruct", help="llm to use")
    parser.add_argument("--tokens_max", type=int, default=80000, help="max number of tokens to supply to the llm")
    parser.add_argument("--temperature", type=float, default=0.01)

    args = parser.parse_args()

    df_ToAnnotate = pd.DataFrame()

    previous_text = ""
    previous_df_annotated_dict = dict()
    previous_kg_choices = []
    if state:
        previous_text = state.get("text", "")
        previous_df_annotated_dict = state.get("df_annotated_dict", {})
        previous_df_annotated_combined_dict = state.get("df_annotated_combined_dict", {})
        previous_kg_choices = state.get("KGchoices", [])
        previous_ModelsSelection = state.get("ModelsSelection", [])
        previous_ScoreFilt_from_state = float(state.get("ScoreFilt", ScoreFilt))  # ensure ScoreFilt is a float
        previous_EntityLinking_from_state = bool(state.get("EntityLinking", EntityLinking))  # ensure EntityLinking is a boolean
        previous_html_output = state.get("html_output", "")

        # If nothing relevant changed since the previous call, reuse the cached annotations from the state.
        if previous_html_output and (previous_df_annotated_dict) and (previous_df_annotated_combined_dict) and \
                (previous_text == text) and (sorted(previous_kg_choices) == sorted(KGchoices)) and \
                (sorted(previous_ModelsSelection) == sorted(ModelsSelection)) and \
                (previous_ScoreFilt_from_state == ScoreFilt) and (previous_EntityLinking_from_state == EntityLinking):

            ddf_annot_prev = pd.DataFrame(previous_df_annotated_combined_dict)

            if 'ALLURIScontext' in ddf_annot_prev.columns:
                # words_for_dropdown = df_annotated_combined[
                #     df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])][
                #     'word'].unique().tolist()
                words_for_dropdown = ddf_annot_prev[ddf_annot_prev['ALLURIScontext'].apply(
                    lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (
                        isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip()))))][
                    'word'].unique().tolist()
                words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
                words_for_dropdown.insert(0, "")
            else:
                words_for_dropdown = []

            dict_annotated_combined_NER = ddf_annot_prev[
                ["end", "entity_group", "score", "start", "word"]].to_dict(orient="records")
            # return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
            return {"text": text, "entities": dict_annotated_combined_NER}, previous_html_output, state, gr.update(
                choices=words_for_dropdown), ""

    # print("Are all models in any row of the 'model' column, case-insensitively?", all_models_in_any_row)
    # if (not history_dict) or (history_dict[args.source_column][0] != text) or (all_models_in_any_row == False):
    # if (not history_dict) or (history_dict[args.source_column][0] != text):
    if (not previous_df_annotated_dict) or (previous_text != text) or (sorted(previous_kg_choices) != sorted(KGchoices)):

        for model_id in models_List:  # always do all the annotations, only filter them afterwards
        # for model_id in ModelsSelection:
            # if history_dict and (history_dict[args.source_column][0] == text):
            #     if model_id in hhist['model'].unique():
            #         continue

            parser.set_defaults(model_id=model_id)
            args = parser.parse_args()

            print("ARGS:")
            print(args)

            # %% In machine learning tasks, particularly when dealing with models that involve stochasticity (like text generation), it is important to set seeds for the random number generators to ensure reproducibility of results. When using models from the transformers library, you need to set seeds for Python's random module, NumPy, and PyTorch so that the results are the same every time you run the code.
# Before you create the pipeline and run the text generation, set the seeds like this: random.seed(args.SEED) np.random.seed(args.SEED) torch.manual_seed(args.SEED) torch.cuda.manual_seed_all(args.SEED) ### df_ToAnnotate = pd.DataFrame({ "ToLink": [None], args.source_column: [text]}) if "SentenceRef" not in df_ToAnnotate.columns: df_ToAnnotate["SentenceRef"] = None df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if col != 'SentenceRef']] # this moves it to the first position df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1 df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby(df_ToAnnotate[args.source_column]).transform('min').astype(int) df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int) # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # if strtobool(args.debug): # print(f"Device: {device}...") # if device.type == "cuda": # print("GPU number:", torch.cuda.current_device()) pipeToUse = None if (("gliner" in args.model_id) == False) and (("NCBO" in args.model_id)== False) : pipeToUse = pipe_dict[args.model_id] new_annotations = annotate(df_ToAnnotate, args, pipeToUse, tokenizerGliner, modelGliner, modelGlinerBio, device) if not new_annotations.empty: if df_annotated.empty: # If df_annotated is empty, just assign new_annotations to it df_annotated = new_annotations else: # If df_annotated is not empty, concatenate new_annotations to it df_annotated = pd.concat([df_annotated, new_annotations], ignore_index=True) state = { "text": text, "df_annotated_dict": df_annotated.to_dict(), "df_annotated_combined_dict": dict(), "KGchoices": KGchoices, "ModelsSelection": ModelsSelection, "ScoreFilt": ScoreFilt, "EntityLinking": EntityLinking, "html_output": "" } else: print("ARGS:") print(args) # %% n machine learning tasks, particularly when dealing with models that have stochasticity involved (like text generation), it's important to set seeds for random number generators to ensure reproducibility of results. In the case of using models from the transformers library, you need to set seeds for both Python's random module, NumPy, and PyTorch to ensure that the results are the same every time you run the code. # Before you create the pipeline and run the text generation, set the seeds like this: random.seed(args.SEED) np.random.seed(args.SEED) torch.manual_seed(args.SEED) torch.cuda.manual_seed_all(args.SEED) ### history = pd.DataFrame(previous_df_annotated_dict) df_annotated = history.copy() state = { "text": text, "df_annotated_dict": df_annotated.to_dict(), "df_annotated_combined_dict": dict(), "KGchoices": KGchoices, "ModelsSelection": ModelsSelection, "ScoreFilt": ScoreFilt, "EntityLinking": EntityLinking, "html_output": "" } quoted_text = text.startswith('"') & text.endswith('"') if (not df_annotated.empty) or quoted_text: if (not df_annotated.empty): # filter now per models selection df_annotated = df_annotated[df_annotated['model'].str.lower().isin([model.lower() for model in ModelsSelection])] if df_annotated.empty and quoted_text==False: html_output = f"
{text}
" state["html_output"] = html_output return {"text": text, "entities": []}, html_output, state, [], "" df_annotated_combined = pd.DataFrame() if (not df_annotated.empty): df_annotated_combined = entitiesFusion(df_annotated,args) if df_annotated_combined.empty and quoted_text==False: html_output = f"
{text}
" state["html_output"] = html_output return {"text": text, "entities": []}, html_output, state, [], "" else: if (not df_annotated.empty): df_annotated_combined = is_cross_inside(df_annotated_combined, args, 0.999) #I cut all the cross inside with the 0.99. to avoid the linking cache_prefix_fp = "LLMQUERYNER" cache_nameLLMs = cache_prefix_fp + "___" + "__".join( [args.service_provider, args.model_name, str(args.temperature)]).replace( " ", "_") + ".json" load_map_query_input_output = None if strtobool(args.USE_CACHE): if os.path.exists(cache_nameLLMs): with open(cache_nameLLMs) as f: load_map_query_input_output = json.load(f) else: load_map_query_input_output = {} ### entity linking part: if strtobool(args.entity_linking): cache_map_geonames = None if strtobool(args.USE_CACHE): cache_filename = "CACHE_geonames.json" if os.path.exists(cache_filename): with open(cache_filename) as f: cache_map_geonames = json.load(f) else: cache_map_geonames = {} key_geonames = "" if args.geonameskey_filename and os.path.exists(args.geonameskey_filename): fkeyname = args.geonameskey_filename with open(fkeyname) as f: key_geonames = f.read() else: key_geonames = os.environ['key_geonames'] cache_map_virtuoso = None if strtobool(args.USE_CACHE): cacheVirtuoso_filename = "CACHE_virtuoso.json" if os.path.exists(cacheVirtuoso_filename): with open(cacheVirtuoso_filename) as f: cache_map_virtuoso = json.load(f) else: cache_map_virtuoso = {} key_virtuoso = "" if args.virtuosokey_filename and os.path.exists(args.virtuosokey_filename): fkeyname = args.virtuosokey_filename with open(fkeyname) as f: key_virtuoso = f.read() else: key_virtuoso = os.environ['key_virtuoso'] # Here for the EXACT MATCHING "" - if the desired term has not been identified in the NER, add to the dataframe: if df_ToAnnotate.empty: df_ToAnnotate = pd.DataFrame({"ToLink": [None], args.source_column: [text]}) if "SentenceRef" not in df_ToAnnotate.columns: df_ToAnnotate["SentenceRef"] = None df_ToAnnotate = df_ToAnnotate[['SentenceRef'] + [col for col in df_ToAnnotate.columns if col != 'SentenceRef']] # this moves it to the first position df_ToAnnotate['SentenceRef'] = df_ToAnnotate.index + 1 df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].argsort().groupby( df_ToAnnotate[args.source_column]).transform('min').astype(int) df_ToAnnotate['SentenceRef'] = df_ToAnnotate['SentenceRef'].rank(method='dense').astype(int) # Define the condition to find missing SentenceRefs missing_sentence_refs = ~df_ToAnnotate['SentenceRef'].isin(df_annotated_combined['SentenceRef']) # Define the condition to check if ContextToAnnotate starts and ends with quotes quoted_context = df_ToAnnotate[args.source_column].str.startswith('"') & df_ToAnnotate[ args.source_column].str.endswith('"') # Combine both conditions condition = missing_sentence_refs & quoted_context # Select rows from df_ToAnnotate that meet the condition rows_to_add = df_ToAnnotate[condition] rows_to_add['model'] = "Forced" rows_to_add['entity_group'] = "MISC" rows_to_add['word'] = rows_to_add[args.source_column] rows_to_add['word'] = rows_to_add[args.source_column].apply(strip_quotes) rows_to_add['score'] = 1.0 rows_to_add['start'] = int(1) rows_to_add['end'] = rows_to_add['word'].apply(len) + int(1) rows_to_add['IsGeo'] = None rows_to_add['IsBio'] = None rows_to_add['IsCrossInside'] = 0.0 if df_annotated_combined.empty: df_annotated_combined = pd.DataFrame(columns=df_ToAnnotate.columns) # Append these rows to df_annotated_combined df_annotated_combined = pd.concat([df_annotated_combined, 
rows_to_add], ignore_index=True) df_annotated_combined['start'] = df_annotated_combined['start'].astype(int) df_annotated_combined['end'] = df_annotated_combined['end'].astype(int) df_annotated_combined = df_annotated_combined.sort_values( by=['SentenceRef', 'start', 'ToLink', 'word', 'score'], ascending=[True, True, True, True, False]) # Now df_annotated_combined contains the additional rows df_annotated_combined, cache_map_geonames_AFTER, cache_map_virtuoso_AFTER, load_map_query_input_output_AFTER = elinking(df_annotated_combined, text_splitter, args, key_geonames, cache_map_geonames, key_virtuoso, cache_map_virtuoso, load_map_query_input_output, device) if strtobool(args.USE_CACHE): if cache_map_geonames_AFTER is not None: with open(cache_filename, "w") as f: json.dump(cache_map_geonames_AFTER, f) if cache_map_virtuoso_AFTER is not None: with open(cacheVirtuoso_filename, "w") as f: json.dump(cache_map_virtuoso_AFTER, f) if load_map_query_input_output_AFTER is not None: with open(cache_nameLLMs, "w") as f: json.dump(load_map_query_input_output_AFTER, f) ### end entity linking part ### filter by selected category only # #df_annotated_combined = df_annotated_combined[df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in CategoriesSelection])] # if "MED" in CategoriesSelection: # filter_mask = df_annotated_combined['entity_group'].str.lower().isin( # [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1) # else: # filter_mask = df_annotated_combined['entity_group'].str.lower().isin( # [cat.lower() for cat in CategoriesSelection]) # df_annotated_combined = df_annotated_combined[filter_mask] # # if "MED" in CategoriesSelection: # filter_mask = df_annotated_combined['entity_group'].str.lower().isin( # [cat.lower() for cat in CategoriesSelection]) | (df_annotated_combined['IsBio'] == 1) # elif "OTHER" in CategoriesSelection: # filter_mask = ~( # df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) # else: # filter_mask = df_annotated_combined['entity_group'].str.lower().isin( # [cat.lower() for cat in CategoriesSelection]) filter_mask = df_annotated_combined['entity_group'].str.lower().isin( [cat.lower() for cat in CategoriesSelection]) if "MED" in CategoriesSelection: filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & (df_annotated_combined['IsBio'] == 1) if "MISC" in CategoriesSelection: # filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) # filter_mask |= ~(df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~(df_annotated_combined['IsBio'] == 1) # with this cluase, I'm including not only the categories labelled as MISC, but also the other that are not MED, PER, ORG, LOC filter_mask |= ~( df_annotated_combined['entity_group'].str.lower().isin([cat.lower() for cat in categories_List])) & ~( df_annotated_combined[ 'IsBio'] == 1) # with this cluase, I'm including not only the categories labelled as MISC, but also the other that are not MED, PER, ORG, LOC df_annotated_combined = df_annotated_combined[filter_mask] if df_annotated_combined.empty: html_output = f"
{text}
" state["html_output"] = html_output return {"text": text, "entities": []}, html_output, state, [], "" ### #df_annotated_combined = is_cross_inside(df_annotated_combined, args) if 'IsCrossInside' in df_annotated_combined.columns: df_annotated_combined = df_annotated_combined[df_annotated_combined['IsCrossInside'] != 1] if df_annotated_combined.empty: html_output = f"
{text}
" state["html_output"] = html_output return {"text": text, "entities": []}, html_output, state, [], "" dict_annotated_combined_NER = df_annotated_combined[["end", "entity_group", "score", "start", "word"]].to_dict(orient="records") ### continue linking part: if strtobool(args.entity_linking): # ##### this is to pass the links: # # Create a new column for the entities with links df_annotated_combined['entity_with_link'] = df_annotated_combined.apply( # lambda row: ( # f"{row['word']}" # if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[ # 'word'] # ), lambda row: ( f"{row['word']}" if row['namedEntity'] not in [None, '', 'NaN', 'nan'] and pd.notnull(row['namedEntity']) else row[ 'word'] ), axis=1 ) # Create a new dictionary with the entity information and the link dict_annotated_combined_NEL = df_annotated_combined[ ["end", "entity_group", "score", "start", "entity_with_link"]].to_dict(orient="records") # Sort the entities by their start index dict_annotated_combined_NEL.sort(key=lambda x: x['start']) # Create a dictionary to map entity groups to colors entity_colors = { "MED": "#E6E6E6", "PER": "#FFC0CB", "ORG": "#C6F4D6", "LOC": "#FFFFCC", "MISC": "#F5DEB3" } text_with_links = text offset = 0 for entity in dict_annotated_combined_NEL: start = entity["start"] + offset end = entity["end"] + offset entity_text = entity["entity_with_link"] text_with_links = text_with_links[:start] + entity_text + text_with_links[end:] offset += len(entity_text) - (end - start) # # Create the text with entities highlighted and linked # text_with_links = text # offset = 0 # for entity in dict_annotated_combined_NEL: # start = entity["start"] + offset # end = entity["end"] + offset # entity_text = entity["entity_with_link"] # entity_group = entity["entity_group"] # # color = entity_colors.get(entity_group, "#dbeafe") # Default # darker_color = "#008080" # # if "https:" in entity_text: # text_with_links = text_with_links[ # :start] + f'")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]} {entity_group}' + text_with_links[ # end:] # offset += len( # f'")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]} {entity_group}') - ( # end - start) # # text_with_links = text_with_links[:start] + f'")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}' + text_with_links[end:] # # offset += len( # # f'")[1].split("<")[0]}">{entity_text.split(">")[1].split("<")[0]}') - ( # # end - start) # # # # text_with_links = text_with_links[:start] + entity_text + text_with_links[end:] # # offset += len(entity_text) - (end - start) # else: # text_with_links = text_with_links[ # :start] + f'{entity_text} {entity_group}' + text_with_links[end:] # offset += len( # f'{entity_text} {entity_group}') - ( # end - start) # # text_with_links = text_with_links[ # # :start] + f'{entity_text}' + text_with_links[ # # end:] # # offset += len( # # f'{entity_text}') - (end - start) # Update state with the DataFrame state["df_annotated_combined_dict"] = df_annotated_combined.to_dict() if 'ALLURIScontext' in df_annotated_combined.columns: # words_for_dropdown = df_annotated_combined[ # df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [])][ # 'word'].unique().tolist() words_for_dropdown = df_annotated_combined[df_annotated_combined['ALLURIScontext'].apply(lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip())) ))]['word'].unique().tolist() 
                words_for_dropdown = list({entry.lower(): entry for entry in words_for_dropdown}.values())
                words_for_dropdown.insert(0, "")
            else:
                words_for_dropdown = []

            # Wrap the linked text for the gr.HTML output; the original wrapper markup was
            # stripped, so a plain <div> is used here.
            html_output = f"<div>{text_with_links}</div>"
            state["html_output"] = html_output
            # return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state
            return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, gr.update(choices=words_for_dropdown), ""

        else:
            html_output = f"<div>{text}</div>"
            state["html_output"] = html_output
            return {"text": text, "entities": dict_annotated_combined_NER}, html_output, state, [], ""

    else:
        html_output = f"<div>{text}</div>"
        state["html_output"] = html_output
        return {"text": text, "entities": []}, html_output, state, [], ""


# "FacebookAI/xlm-roberta-large-finetuned-conll03-english", "Babelscape/wikineural-multilingual-ner", "blaze999/Medical-NER", "urchade/gliner_large-v2.1", "urchade/gliner_large_bio-v0.1"
def update_urls(selected_word, state):
    if "df_annotated_combined_dict" in state:
        # Convert the state dictionary back into a DataFrame
        df = pd.DataFrame(state["df_annotated_combined_dict"])
        if 'ALLURIScontext' in df.columns:
            # # Filter the DataFrame to get rows where 'ALLURIScontextFromNCBO' is not empty or None
            # valid_entries = df[df['ALLURIScontext'].apply(lambda x: x is not None and x != [])]
            #
            # Filter the DataFrame to get rows where 'ALLURIScontext' is not None, not an empty list, and not an empty string
            valid_entries = df[df['ALLURIScontext'].apply(
                lambda x: x is not None and x != [] and (isinstance(x, list) and len(x) > 0) and (
                    isinstance(x, list) and (not (len(x) == 1 and not str(x[0]).strip()))))]

            # Check if the selected word is in the filtered DataFrame
            if selected_word in valid_entries['word'].values:
                urls = valid_entries.loc[valid_entries['word'] == selected_word, 'ALLURIScontext'].values[0]

                if 'namedEntity' in df.columns:
                    firsturlinlist = df.loc[df['word'] == selected_word, 'namedEntity']
                    firsturlinlist = firsturlinlist.iloc[0] if not firsturlinlist.empty else None
                    if firsturlinlist and firsturlinlist in urls:
                        # Remove the URL from its current position
                        urls.remove(firsturlinlist)
                        # Insert the URL at the first position
                        urls.insert(0, firsturlinlist)

                # Convert list of URLs to HTML string with clickable links
                # html_links = "<br>".join([f'{url}' for url in urls])
                html_links = "
".join([f'{url}' for url in urls]) return html_links return "" else: return"" else: return "" # demo = gr.Interface( # fn=nerBio, # inputs=[ # gr.Textbox(label= "Input text", placeholder="Enter text here..."), # gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List), # gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List), # gr.Slider(minimum=0, maximum=1.0, step=0.1, label="Score", value=0.7), # gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False), #True False # #gr.CheckboxGroup(POSSIBLE_KGchoices_List, label="KGchoices Selection", value=POSSIBLE_KGchoices_List, visible=True), # gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List), # gr.State(value={}) # ], # outputs=[ # gr.HighlightedText(label="Annotated Text"), # gr.HTML(label="Linked Text", show_label=True, visible=True), # use gr.HTML to render the annotated text with links , visible # gr.State(), # gr.Dropdown(label="Annotated Concepts", interactive=True,visible=True), # gr.Textbox(label="Linked Entities",interactive=False,visible=True) # ], # live=True, # title="BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)", # description="""Interoperability – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation. # The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come. # Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures. # # In this demo we show in particular the *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, please select one or more NER models and enter some text to get it processed. Please select also the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, select whether you want to perform Named-Entity Linking (NEL) and if you want to enable the filtering to some specific biomedical ontologies only (acronyms description at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf). # """, # examples=examples, # cache_examples=False, # article=""" # **Categories Legend:** # - MED | Medical # - LOC | Locations # - PER | Persons # - ORG | Organizations # - MISC | Miscellanea # - CONC | Concepts & Ideas # - BIOP | Biological # - ACTI | Activities & Behaviors # - ANAT | Anatomy # - CHEM | Chemicals & Drugs # - DEVI | Devices # - DISO | Disorders # - GENE | Genes & Molecular Sequences # - GEOG | Geographic Areas # - LIVB | Living Beings # - OBJC | Objects # - OCCU | Occupations # - ORGA | Organizations # - PHEN | Phenomena # - PHYS | Physiology # - PROC | Procedures # """ # ) # Define the Gradio interface using Blocks #description="This application performs biomedical named-entity recognition and linking." with gr.Blocks(title="BioAnnotator") as demo: gr.Markdown("# BioAnnotator: Biomedical Named-Entity Recognition (NER) and Linking (NEL)") gr.Markdown(""" This application performs biomedical named-entity recognition and linking. 

**Description:** *Interoperability* – the capability of systems and organisations to cooperate across functional, sectoral and physical borders – is key for successful digital transformation.
The [Interoperable Europe Act](https://interoperable-europe.ec.europa.eu/interoperable-europe/interoperable-europe-act) is an EU regulation that aims to strengthen public sector interoperability and will serve as a main EC policy framework for the years to come.
Data exchange is vital for digital government policies, and semantic interoperability ensures systems understand each other despite different legacies and architectures.

In particular, this demo presents *BioAnnotator*, a prototype tool performing Biomedical Named-Entity Recognition (NER) and Linking (NEL). To give it a try, select one or more NER models and enter some text to get it processed. Please also select the entity categories you want to extract, as well as the score to use as a threshold for the NER extraction. Finally, select whether you want to perform Named-Entity Linking (NEL) and whether you want to restrict the linking to some specific biomedical ontologies only (acronym descriptions at: https://bioportal.bioontology.org/ontologies). See also: [InventoryHealthKGs.pdf](https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/ETOHA/KGs/InventoryHealthKGs.pdf).
""")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input text", placeholder="Enter text here...")
            models_selection = gr.CheckboxGroup(models_List, label="ModelsSelection", value=models_List)
            categories_selection = gr.CheckboxGroup(categories_List, label="CategoriesSelection", value=categories_List)
            score_slider = gr.Slider(minimum=0, maximum=1.0, step=0.05, label="Score", value=0.75)
            nel_checkbox = gr.Checkbox(label="Enable Named-Entity Linking (NEL)", value=False)
            kgchoices_selection = gr.Dropdown(POSSIBLE_KGchoices_List, multiselect=True, label="KGchoices Selection", value=POSSIBLE_KGchoices_List)
            state = gr.State(value={})
        with gr.Column():
            annotated_text = gr.HighlightedText(label="Annotated Text")
            linked_text = gr.HTML(label="Linked Text", show_label=True, visible=True)
            word_dropdown = gr.Dropdown(label="Annotated Concepts", show_label=True, visible=True, interactive=True)
            urls_html = gr.HTML(label="Linked Entities", show_label=True, visible=True)

    ## Define the interactions
    # text_input.change(fn=nerBio, inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection, state], outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])

    # Define the interactions for all inputs
    inputs = [text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection]
    for input_component in inputs:
        input_component.change(fn=nerBio,
                               inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection, state],
                               outputs=[annotated_text, linked_text, state, word_dropdown, urls_html])

    word_dropdown.change(fn=update_urls, inputs=[word_dropdown, state], outputs=urls_html)

    # Add examples
    gr.Examples(examples=examples, inputs=[text_input, models_selection, categories_selection, score_slider, nel_checkbox, kgchoices_selection])

    gr.Markdown("""
**Categories Legend:**

- MED | Medical
- LOC | Locations
- PER | Persons
- ORG | Organizations
- MISC | Miscellanea
- CONC | Concepts & Ideas
- BIOP | Biological
- ACTI | Activities & Behaviors
- ANAT | Anatomy
- CHEM | Chemicals & Drugs
- DEVI | Devices
- DISO | Disorders
- GENE | Genes & Molecular Sequences
- GEOG | Geographic Areas
- LIVB | Living Beings
- OBJC | Objects
- OCCU | Occupations
- ORGA | Organizations
- PHEN | Phenomena
- PHYS | Physiology
- PROC | Procedures
""")

demo.launch()
# demo.launch(share=True)  # Share your demo with just 1 extra parameter
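
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the app above and never called by it): a
# minimal, self-contained example of the offset-based splicing that nerBio()
# uses to insert HTML links into the original text. The entity spans and URLs
# below are hypothetical.
def _demo_offset_link_insertion():
    text = "Cholera was reported in Dakar."
    entities = [
        {"start": 0, "end": 7,
         "entity_with_link": '<a href="https://example.org/cholera" target="_blank">Cholera</a>'},
        {"start": 24, "end": 29,
         "entity_with_link": '<a href="https://example.org/dakar" target="_blank">Dakar</a>'},
    ]
    # Process entities left to right; each replacement changes the string length,
    # so later spans are shifted by the accumulated offset.
    entities.sort(key=lambda e: e["start"])
    text_with_links, offset = text, 0
    for ent in entities:
        start = ent["start"] + offset
        end = ent["end"] + offset
        text_with_links = text_with_links[:start] + ent["entity_with_link"] + text_with_links[end:]
        offset += len(ent["entity_with_link"]) - (end - start)
    return text_with_links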