Spaces:

pendar02
/

biomedical

Sleeping

App Files Community

pendar02 committed on Jan 26

Commit

adb0a32

verified ·

1 Parent(s): dee9a31

Update text_processing.py

Browse files

Files changed (1) hide show

text_processing.py +23 -40

text_processing.py CHANGED Viewed

@@ -6,7 +6,6 @@ from nltk.tokenize import word_tokenize
 import nltk
 import streamlit as st
-# Download required NLTK data
 try:
     nltk.download('wordnet', quiet=True)
     nltk.download('punkt', quiet=True)
@@ -16,22 +15,17 @@ except:
 class TextProcessor:
     def __init__(self):
-        """Initialize the text processor with TF-IDF vectorizer"""
         self.vectorizer = TfidfVectorizer(
             stop_words='english',
             ngram_range=(1, 2),
             max_features=10000
         )
     def preprocess_text(self, text):
-        """Basic text preprocessing"""
-        # Convert to lower case
         text = text.lower()
-        # Tokenize
         tokens = word_tokenize(text)
-        # Get POS tags
         pos_tags = nltk.pos_tag(tokens)
-        # Extract nouns and adjectives (medical terms are often these)
         medical_terms = [word for word, tag in pos_tags if tag.startswith(('NN', 'JJ'))]
         return {
             'processed_text': ' '.join(tokens),
@@ -39,7 +33,6 @@ class TextProcessor:
         }
     def get_synonyms(self, term):
-        """Get synonyms for a term using WordNet"""
         synonyms = []
         for syn in wordnet.synsets(term):
             for lemma in syn.lemmas():
@@ -47,27 +40,19 @@ class TextProcessor:
         return list(set(synonyms))
     def calculate_relevance_scores(self, question, abstracts):
-        """Calculate relevance scores using multiple methods"""
-        # Preprocess question
         proc_question = self.preprocess_text(question)
-        # 1. TF-IDF Similarity
         tfidf_matrix = self.vectorizer.fit_transform([proc_question['processed_text']] + abstracts)
         tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
-        # 2. Medical Term Matching
         term_scores = []
         question_terms = set(proc_question['medical_terms'])
         for abstract in abstracts:
             abstract_terms = set(self.preprocess_text(abstract)['medical_terms'])
-            # Calculate Jaccard similarity between terms
-            if len(question_terms.union(abstract_terms)) > 0:
-                score = len(question_terms.intersection(abstract_terms)) / len(question_terms.union(abstract_terms))
-            else:
-                score = 0
             term_scores.append(score)
-        # 3. Synonym Matching
         synonym_scores = []
         question_synonyms = set()
         for term in proc_question['medical_terms']:
@@ -79,40 +64,38 @@ class TextProcessor:
             for term in abstract_terms:
                 abstract_synonyms.update(self.get_synonyms(term))
-            # Calculate synonym overlap
-            if len(question_synonyms.union(abstract_synonyms)) > 0:
-                score = len(question_synonyms.intersection(abstract_synonyms)) / len(question_synonyms.union(abstract_synonyms))
-            else:
-                score = 0
             synonym_scores.append(score)
-        # Combine scores with weights
-        weights = {
-            'tfidf': 0.5,
-            'term_matching': 0.3,
-            'synonym_matching': 0.2
-        }
         combined_scores = []
         for i in range(len(abstracts)):
-            score = (
-                weights['tfidf'] * tfidf_scores[i] +
-                weights['term_matching'] * term_scores[i] +
-                weights['synonym_matching'] * synonym_scores[i]
-            )
             combined_scores.append(score)
         return np.array(combined_scores)
     def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
-        """Find the most relevant abstracts for a given question"""
-        # Calculate relevance scores
         scores = self.calculate_relevance_scores(question, abstracts)
-        # Get indices of top_k highest scoring abstracts
-        top_indices = np.argsort(scores)[-top_k:][::-1]
-        # Process question for medical terms
         proc_question = self.preprocess_text(question)
         return {
@@ -120,7 +103,7 @@ class TextProcessor:
             'scores': scores[top_indices].tolist(),
             'processed_question': {
                 'original': question,
-                'corrected': question,  # No spell checking in this version
                 'medical_entities': proc_question['medical_terms']
             }
         }

 import nltk
 import streamlit as st
 try:
     nltk.download('wordnet', quiet=True)
     nltk.download('punkt', quiet=True)
 class TextProcessor:
     def __init__(self):
         self.vectorizer = TfidfVectorizer(
             stop_words='english',
             ngram_range=(1, 2),
             max_features=10000
         )
+        self.relevance_threshold = 0.1
     def preprocess_text(self, text):
         text = text.lower()
         tokens = word_tokenize(text)
         pos_tags = nltk.pos_tag(tokens)
         medical_terms = [word for word, tag in pos_tags if tag.startswith(('NN', 'JJ'))]
         return {
             'processed_text': ' '.join(tokens),
         }
     def get_synonyms(self, term):
         synonyms = []
         for syn in wordnet.synsets(term):
             for lemma in syn.lemmas():
         return list(set(synonyms))
     def calculate_relevance_scores(self, question, abstracts):
         proc_question = self.preprocess_text(question)
         tfidf_matrix = self.vectorizer.fit_transform([proc_question['processed_text']] + abstracts)
         tfidf_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
         term_scores = []
         question_terms = set(proc_question['medical_terms'])
         for abstract in abstracts:
             abstract_terms = set(self.preprocess_text(abstract)['medical_terms'])
+            score = (len(question_terms.intersection(abstract_terms)) /
+                    len(question_terms.union(abstract_terms))) if question_terms.union(abstract_terms) else 0
             term_scores.append(score)
         synonym_scores = []
         question_synonyms = set()
         for term in proc_question['medical_terms']:
             for term in abstract_terms:
                 abstract_synonyms.update(self.get_synonyms(term))
+            score = (len(question_synonyms.intersection(abstract_synonyms)) /
+                    len(question_synonyms.union(abstract_synonyms))) if question_synonyms.union(abstract_synonyms) else 0
             synonym_scores.append(score)
+        weights = {'tfidf': 0.5, 'term_matching': 0.3, 'synonym_matching': 0.2}
         combined_scores = []
         for i in range(len(abstracts)):
+            score = (weights['tfidf'] * tfidf_scores[i] +
+                    weights['term_matching'] * term_scores[i] +
+                    weights['synonym_matching'] * synonym_scores[i])
             combined_scores.append(score)
         return np.array(combined_scores)
     def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
         scores = self.calculate_relevance_scores(question, abstracts)
+        # Filter by relevance threshold
+        relevant_indices = np.where(scores > self.relevance_threshold)[0]
+        if len(relevant_indices) == 0:
+            return {
+                'top_indices': [],
+                'scores': [],
+                'processed_question': None
+            }
+        # Get top_k from relevant papers only
+        top_k = min(top_k, len(relevant_indices))
+        top_indices = relevant_indices[np.argsort(scores[relevant_indices])[-top_k:][::-1]]
         proc_question = self.preprocess_text(question)
         return {
             'scores': scores[top_indices].tolist(),
             'processed_question': {
                 'original': question,
+                'corrected': question,
                 'medical_entities': proc_question['medical_terms']
             }
         }