Ajay Karthick Senthil Kumar committed
Commit 5bd622e · 1 Parent(s): eb57aa2
update
Browse files
- README.md +5 -5
- __pycache__/config.cpython-39.pyc +0 -0
- __pycache__/metrics.cpython-39.pyc +0 -0
- __pycache__/utils.cpython-39.pyc +0 -0
- app.py +175 -0
- config.py +135 -0
- data/tokenizer.pickle +3 -0
- metrics.py +20 -0
- model/model_1.h5 +3 -0
- requirements.txt +6 -0
- utils.py +123 -0
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
 title: NER Medical Text
-emoji:
-colorFrom:
-colorTo:
-sdk:
-sdk_version:
+emoji: 🐢
+colorFrom: red
+colorTo: indigo
+sdk: gradio
+sdk_version: 3.24.1
 app_file: app.py
 pinned: false
 ---
__pycache__/config.cpython-39.pyc
ADDED
Binary file (2.68 kB)

__pycache__/metrics.cpython-39.pyc
ADDED
Binary file (964 Bytes)

__pycache__/utils.cpython-39.pyc
ADDED
Binary file (3.67 kB)
app.py
ADDED
@@ -0,0 +1,175 @@
import streamlit as st
import tensorflow as tf
import os

# Import your utility functions
from utils import (
    predict_multi_line_text,
    tokenizer,
)

from config import index_to_label, acronyms_to_entities, MAX_LENGTH
from metrics import precision, recall, f1_score

# Register the custom metric functions
tf.keras.utils.get_custom_objects()[precision.__name__] = precision
tf.keras.utils.get_custom_objects()[recall.__name__] = recall
tf.keras.utils.get_custom_objects()[f1_score.__name__] = f1_score

# Load your trained model
model_dir = './model'  # Adjust the path as needed
model_1 = tf.keras.models.load_model(os.path.join(model_dir, 'model_1.h5'))

# Define label colors for different entity types suitable for dark background
LABEL_COLORS = {
    'Activity': '#FF7F50',               # Coral
    'Administration': '#6495ED',         # Cornflower Blue
    'Age': '#FFB6C1',                    # Light Pink
    'Area': '#7FFF00',                   # Chartreuse
    'Biological_attribute': '#FFD700',   # Gold
    'Biological_structure': '#00FA9A',   # Medium Spring Green
    'Clinical_event': '#BA55D3',         # Medium Orchid
    'Color': '#00CED1',                  # Dark Turquoise
    'Coreference': '#FFA07A',            # Light Salmon
    'Date': '#ADFF2F',                   # Green Yellow
    'Detailed_description': '#DA70D6',   # Orchid
    'Diagnostic_procedure': '#87CEFA',   # Light Sky Blue
    'Disease_disorder': '#FF4500',       # Orange Red
    'Distance': '#32CD32',               # Lime Green
    'Dosage': '#8A2BE2',                 # Blue Violet
    'Duration': '#F08080',               # Light Coral
    'Family_history': '#20B2AA',         # Light Sea Green
    'Frequency': '#FF6347',              # Tomato
    'Height': '#4682B4',                 # Steel Blue
    'History': '#EE82EE',                # Violet
    'Lab_value': '#FFDAB9',              # Peach Puff
    'Mass': '#7B68EE',                   # Medium Slate Blue
    'Medication': '#00FF7F',             # Spring Green
    'Nonbiological_location': '#FF69B4', # Hot Pink
    'Occupation': '#BDB76B',             # Dark Khaki
    'Other_entity': '#D3D3D3',           # Light Grey
    'Other_event': '#FF1493',            # Deep Pink
    'Outcome': '#00BFFF',                # Deep Sky Blue
    'Personal_background': '#00FFFF',    # Aqua
    'Qualitative_concept': '#FFA500',    # Orange
    'Quantitative_concept': '#FFA500',   # Orange (same as above)
    'Severity': '#1E90FF',               # Dodger Blue
    'Sex': '#FF00FF',                    # Magenta
    'Shape': '#40E0D0',                  # Turquoise
    'Sign_symptom': '#FFFF00',           # Yellow
    'Subject': '#F0E68C',                # Khaki
    'Texture': '#98FB98',                # Pale Green
    'Therapeutic_procedure': '#8B008B',  # Dark Magenta
    'Time': '#DC143C',                   # Crimson
    'Volume': '#5F9EA0',                 # Cadet Blue
    'Weight': '#FA8072',                 # Salmon
}

# Define the prediction function
def predict_ner(text):
    try:
        # Predict entities
        entities = predict_multi_line_text(
            text,
            model_1,
            index_to_label,
            acronyms_to_entities,
            MAX_LENGTH
        )

        # Sort entities by their start position
        entities = sorted(entities, key=lambda x: x[0])

        # Build HTML string with highlighted entities
        html_output = ""
        last_idx = 0

        for start, end, label in entities:
            # Append text before the entity
            if last_idx < start:
                html_output += text[last_idx:start]

            # Get the color for the label, default to light grey if not specified
            color = LABEL_COLORS.get(label, '#D3D3D3')  # Light grey

            # Wrap the entity with a span tag including style
            entity_text = text[start:end]
            # Include the label next to the entity
            html_output += f'''<span style="background-color: {color}; font-weight: bold; padding: 2px; border-radius: 4px; margin: 1px;">{entity_text} <span style="font-size: smaller; font-weight: normal;">[{label}]</span></span>'''

            last_idx = end

        # Append any remaining text
        if last_idx < len(text):
            html_output += text[last_idx:]

        return html_output

    except Exception as e:
        return f"<p style='color:red;'>Error: {str(e)}</p>"

# Set up the Streamlit app with dark theme
st.set_page_config(page_title="Medical NER", page_icon="🩺", layout="wide")

# Apply custom CSS for dark background and text colors
st.markdown(
    """
    <style>
    /* Main app background */
    .stApp {
        background-color: #2E2E2E;
        color: #FFFFFF;
    }
    /* Text input area */
    .stTextArea textarea {
        background-color: #1E1E1E;
        color: #FFFFFF;
    }
    /* Adjust the Analyze button */
    div.stButton > button:first-child {
        background-color: #1E90FF;
        color: #FFFFFF;
    }
    /* Scrollbar styling */
    ::-webkit-scrollbar {
        width: 10px;
    }
    ::-webkit-scrollbar-track {
        background: #1E1E1E;
    }
    ::-webkit-scrollbar-thumb {
        background: #888;
    }
    ::-webkit-scrollbar-thumb:hover {
        background: #555;
    }
    /* Style for the highlighted entities */
    .highlighted-entity {
        padding: 2px;
        border-radius: 4px;
        margin: 1px;
        font-weight: bold;
        display: inline-block;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("🩺 Medical Named Entity Recognition")
st.markdown("""
Enter medical text below to identify and highlight entities such as diseases, medications, and anatomical terms.
""")

# Input text area
text_input = st.text_area("Enter medical text here:", height=200)

# Analyze button
if st.button("Analyze"):
    if text_input.strip():
        with st.spinner("Analyzing..."):
            result = predict_ner(text_input)
            # Display the result with HTML rendering
            st.markdown(f"<div style='font-size: 18px;'>{result}</div>", unsafe_allow_html=True)
    else:
        st.warning("Please enter some text to analyze.")
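The highlighting loop in predict_ner can be sanity-checked without the model or tokenizer artifacts. Below is a minimal, self-contained sketch of the same span-merging logic, with a hard-coded entity list standing in for predict_multi_line_text output; the sample text and offsets are illustrative only:

# Minimal sketch of the highlighting loop above, with hard-coded
# (start, end, label) entities standing in for model predictions.
text = "Patient reports severe headache since Monday."
entities = [(16, 22, 'Severity'), (23, 31, 'Sign_symptom'), (38, 44, 'Date')]

html_output, last_idx = "", 0
for start, end, label in sorted(entities, key=lambda x: x[0]):
    html_output += text[last_idx:start]   # plain text before the entity
    html_output += f'<mark title="{label}">{text[start:end]}</mark>'
    last_idx = end
html_output += text[last_idx:]            # trailing plain text
print(html_output)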
config.py
ADDED
@@ -0,0 +1,135 @@
entity_to_acronyms = {
    'Activity': 'ACT',
    'Administration': 'ADM',
    'Age': 'AGE',
    'Area': 'ARA',
    'Biological_attribute': 'BAT',
    'Biological_structure': 'BST',
    'Clinical_event': 'CLE',
    'Color': 'COL',
    'Coreference': 'COR',
    'Date': 'DAT',
    'Detailed_description': 'DET',
    'Diagnostic_procedure': 'DIA',
    'Disease_disorder': 'DIS',
    'Distance': 'DIS',
    'Dosage': 'DOS',
    'Duration': 'DUR',
    'Family_history': 'FAM',
    'Frequency': 'FRE',
    'Height': 'HEI',
    'History': 'HIS',
    'Lab_value': 'LAB',
    'Mass': 'MAS',
    'Medication': 'MED',
    'Nonbiological_location': 'NBL',
    'Occupation': 'OCC',
    'Other_entity': 'OTH',
    'Other_event': 'OTE',
    'Outcome': 'OUT',
    'Personal_background': 'PER',
    'Qualitative_concept': 'QUC',
    'Quantitative_concept': 'QUC',
    'Severity': 'SEV',
    'Sex': 'SEX',
    'Shape': 'SHA',
    'Sign_symptom': 'SIG',
    'Subject': 'SUB',
    'Texture': 'TEX',
    'Therapeutic_procedure': 'THP',
    'Time': 'TIM',
    'Volume': 'VOL',
    'Weight': 'WEI'
}

index_to_label = {1: 'B-ACT',
                  2: 'B-ADM',
                  3: 'B-AGE',
                  4: 'B-ARA',
                  5: 'B-BAT',
                  6: 'B-BST',
                  7: 'B-CLE',
                  8: 'B-COL',
                  9: 'B-COR',
                  10: 'B-DAT',
                  11: 'B-DET',
                  12: 'B-DIA',
                  13: 'B-DIS',
                  14: 'B-DOS',
                  15: 'B-DUR',
                  16: 'B-FAM',
                  17: 'B-FRE',
                  18: 'B-HEI',
                  19: 'B-HIS',
                  20: 'B-LAB',
                  21: 'B-MAS',
                  22: 'B-MED',
                  23: 'B-NBL',
                  24: 'B-OCC',
                  25: 'B-OTE',
                  26: 'B-OTH',
                  27: 'B-OUT',
                  28: 'B-PER',
                  29: 'B-QUC',
                  30: 'B-SEV',
                  31: 'B-SEX',
                  32: 'B-SHA',
                  33: 'B-SIG',
                  34: 'B-SUB',
                  35: 'B-TEX',
                  36: 'B-THP',
                  37: 'B-TIM',
                  38: 'B-VOL',
                  39: 'B-WEI',
                  40: 'I-ACT',
                  41: 'I-ADM',
                  42: 'I-AGE',
                  43: 'I-ARA',
                  44: 'I-BAT',
                  45: 'I-BST',
                  46: 'I-CLE',
                  47: 'I-COL',
                  48: 'I-COR',
                  49: 'I-DAT',
                  50: 'I-DET',
                  51: 'I-DIA',
                  52: 'I-DIS',
                  53: 'I-DOS',
                  54: 'I-DUR',
                  55: 'I-FAM',
                  56: 'I-FRE',
                  57: 'I-HEI',
                  58: 'I-HIS',
                  59: 'I-LAB',
                  60: 'I-MAS',
                  61: 'I-MED',
                  62: 'I-NBL',
                  63: 'I-OCC',
                  64: 'I-OTE',
                  65: 'I-OTH',
                  66: 'I-OUT',
                  67: 'I-PER',
                  68: 'I-QUC',
                  69: 'I-SEV',
                  70: 'I-SHA',
                  71: 'I-SIG',
                  72: 'I-SUB',
                  73: 'I-TEX',
                  74: 'I-THP',
                  75: 'I-TIM',
                  76: 'I-VOL',
                  77: 'I-WEI',
                  78: 'O',
                  0: '<PAD>'}

MAX_LENGTH = 100

acronyms_to_entities = {v: k for k, v in entity_to_acronyms.items()}


models = {
    "model_1": {
        "path": "model/model_1.h5",
        "title": "Bidirectional LSTM Model with single LSTM layer"
    },
}
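Note that 'DIS' and 'QUC' are each assigned to two entity types in entity_to_acronyms, so the inverted acronyms_to_entities keeps only whichever entry comes later ('Distance' and 'Quantitative_concept'). A quick sanity check of the mappings, assuming config.py is importable from the working directory:

# Round-trip a predicted class index to its entity name
# (assumes config.py is on the import path).
from config import index_to_label, acronyms_to_entities

label = index_to_label[33]                 # 'B-SIG'
entity = acronyms_to_entities[label[2:]]   # strip the 'B-'/'I-' prefix
print(label, '->', entity)                 # B-SIG -> Sign_symptom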
data/tokenizer.pickle
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4f6c875d9180a973c5f297a0e332404c3b79816ace97a91ae39885d20440258a
size 277589
metrics.py
ADDED
@@ -0,0 +1,20 @@
from keras import backend as K

def precision(y_true, y_pred):
    """Compute precision metric"""
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    _precision = true_positives / (predicted_positives + K.epsilon())
    return _precision

def recall(y_true, y_pred):
    """Compute recall metric"""
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (possible_positives + K.epsilon())

def f1_score(y_true, y_pred):
    """Compute f1-score metric"""
    _precision = precision(y_true, y_pred)
    _recall = recall(y_true, y_pred)
    return 2 * ((_precision * _recall) / (_precision + _recall + K.epsilon()))
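These metrics operate elementwise on the full label tensors, so they can be exercised eagerly on dummy one-hot data. A small check, assuming TensorFlow 2.x is installed:

# Exercise the custom metrics on dummy one-hot tensors.
import tensorflow as tf
from metrics import precision, recall, f1_score

y_true = tf.constant([[0., 1.], [1., 0.], [0., 1.]])
y_pred = tf.constant([[0., 1.], [0., 1.], [0., 1.]])

print(float(precision(y_true, y_pred)))  # 2 of 3 predicted positives correct -> ~0.667
print(float(recall(y_true, y_pred)))     # 2 of 3 actual positives recovered  -> ~0.667
print(float(f1_score(y_true, y_pred)))   # harmonic mean of the two           -> ~0.667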
model/model_1.h5
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2f98117d817b0603a5d8daaaeaca952d6fd6e427bb45167b3602bde1ccbf6823
size 19859592
requirements.txt
ADDED
@@ -0,0 +1,6 @@
tensorflow==2.13.0
numpy>=1.21.0,<1.24.0
pandas>=1.3.0,<1.5.0
streamlit
nltk>=3.6.0
pickle-mixin
utils.py
ADDED
@@ -0,0 +1,123 @@
import re
import os
import pickle
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

STOP_WORDS = stopwords.words('english')

# Load the tokenizer from file
with open('./data/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

def clean_word(word):
    """
    Cleans a word by removing non-alphanumeric characters and extra whitespaces,
    converting it to lowercase, and checking if it is a stopword.

    Args:
    - word (str): the word to clean

    Returns:
    - str: the cleaned word, or an empty string if it is a stopword
    """
    # remove non-alphanumeric characters and extra whitespaces
    word = re.sub(r'[^\w\s]', '', word)
    word = re.sub(r'\s+', ' ', word)

    # convert to lowercase
    word = word.lower()

    if word not in STOP_WORDS:
        return word

    return ''

def tokenize_text(text):
    """
    Tokenizes a text into a list of cleaned words.

    Args:
    - text (str): the text to tokenize

    Returns:
    - tokens (list of str): the list of cleaned words
    - start_end_ranges (list of tuples): the start and end character positions for each token
    """
    regex_match = r'[^\s\u200a\-\u2010-\u2015\u2212\uff0d]+'  # Regex to match words
    tokens = []
    start_end_ranges = []
    # Tokenize the sentences in the text
    sentences = nltk.sent_tokenize(text)

    start = 0
    for sentence in sentences:

        sentence_tokens = re.findall(regex_match, sentence)
        curr_sent_tokens = []
        curr_sent_ranges = []

        for word in sentence_tokens:
            word = clean_word(word)
            if word.strip():
                start = text.lower().find(word, start)
                end = start + len(word)
                curr_sent_ranges.append((start, end))
                curr_sent_tokens.append(word)
                start = end
        if len(curr_sent_tokens) > 0:
            tokens.append(curr_sent_tokens)
            start_end_ranges.append(curr_sent_ranges)

    return tokens, start_end_ranges

def predict_multi_line_text(text, model, index_to_label, acronyms_to_entities, MAX_LENGTH):
    """
    Predicts named entities for multi-line input text.

    Args:
    - text (str): The input text
    - model: The trained NER model
    - index_to_label: Dictionary mapping index to label
    - acronyms_to_entities: Dictionary mapping acronyms to entity names
    - MAX_LENGTH: Maximum input length for the model

    Returns:
    - entities: A list of named entities in the format (start, end, label)
    """

    sequences = []
    sent_tokens, sent_start_end = tokenize_text(text)

    for i in range(len(sent_tokens)):
        sequence = tokenizer.texts_to_sequences([' '.join(token for token in sent_tokens[i])])
        sequences.extend(sequence)

    padded_sequence = pad_sequences(sequences, maxlen=MAX_LENGTH, padding='post')

    # Make the prediction
    prediction = model.predict(np.array(padded_sequence))

    # Decode the prediction
    predicted_labels = np.argmax(prediction, axis=-1)

    predicted_labels = [
        [index_to_label[i] for i in sent_predicted_labels]
        for sent_predicted_labels in predicted_labels
    ]

    entities = []
    for tokens, sent_pred_labels, start_end_ranges in zip(sent_tokens, predicted_labels, sent_start_end):
        for i, (token, label, start_end_range) in enumerate(zip(tokens, sent_pred_labels, start_end_ranges)):
            start = start_end_range[0]
            end = start_end_range[1]
            if label not in ['O', '<PAD>']:
                entity_type = acronyms_to_entities[label[2:]]
                entity = (start, end, entity_type)
                entities.append(entity)

    return entities
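For completeness, a sketch of how app.py wires these pieces together outside Streamlit. It mirrors the calls app.py makes and assumes the LFS artifacts data/tokenizer.pickle and model/model_1.h5 have been pulled; the sample sentence is illustrative only:

# End-to-end usage sketch mirroring app.py (requires the LFS
# artifacts data/tokenizer.pickle and model/model_1.h5).
import tensorflow as tf
from config import index_to_label, acronyms_to_entities, MAX_LENGTH
from metrics import precision, recall, f1_score
from utils import predict_multi_line_text

# Register the custom metrics so load_model can deserialize them
for fn in (precision, recall, f1_score):
    tf.keras.utils.get_custom_objects()[fn.__name__] = fn
model = tf.keras.models.load_model('model/model_1.h5')

text = "The patient was given 50 mg of atenolol daily."
for start, end, entity_type in predict_multi_line_text(
        text, model, index_to_label, acronyms_to_entities, MAX_LENGTH):
    print(f"{text[start:end]!r} -> {entity_type}")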