Update app.py
app.py CHANGED
@@ -117,173 +117,122 @@ def preprocess_text(text):
 
     return formatted_text
 
+def post_process_summary(summary):
+    """Clean up and improve summary coherence"""
+    if not summary:
+        return summary
+
+    # Split into sentences
+    sentences = [s.strip() for s in summary.split('.')]
+    sentences = [s for s in sentences if s]  # Remove empty sentences
+
+    # Fix common issues
+    processed_sentences = []
+    for i, sentence in enumerate(sentences):
+        # Remove redundant words/phrases
+        sentence = sentence.replace(" and and ", " and ")
+        sentence = sentence.replace("appointment and appointment", "appointment")
+
+        # Fix common grammatical issues
+        sentence = sentence.replace("Cancers distress", "Cancer distress")
+        sentence = sentence.replace("  ", " ")  # Remove double spaces
+
+        # Capitalize first letter of each sentence
+        sentence = sentence.capitalize()
+
+        # Add to processed sentences if not empty
+        if sentence.strip():
+            processed_sentences.append(sentence)
+
+    # Join sentences with proper spacing and punctuation
+    cleaned_summary = '. '.join(processed_sentences)
+    if cleaned_summary and not cleaned_summary.endswith('.'):
+        cleaned_summary += '.'
+
+    return cleaned_summary
 
 def improve_summary_generation(text, model, tokenizer):
-    """Generate improved summary with better prompt ..."""
+    """Generate improved summary with better prompt and validation"""
     if not isinstance(text, str) or not text.strip():
         return "No abstract available to summarize."
 
-    # ...
+    # Add a more specific prompt
     formatted_text = (
-        "Summarize this medical research paper ...\n"
-        "1. OBJECTIVE: ...\n"
-        "2. METHODS: ...\n"
-        "3. RESULTS: Report specific findings with exact numbers/percentages\n"
-        "4. CONCLUSION: State main implications\n\n"
+        "Summarize this medical research paper following this structure exactly:\n"
+        "1. Background and objectives\n"
+        "2. Methods\n"
+        "3. Key findings with specific numbers/percentages\n"
+        "4. Main conclusions\n"
         "Original text: " + preprocess_text(text)
     )
 
-    # ...
-    summary = generate_summary_attempt(formatted_text, model, tokenizer,
-                                       conservative_params=True)
-
-    # Validate the generated summary
-    if not validate_summary(summary, text):
-        # If validation fails, try again with different parameters
-        summary = generate_summary_attempt(formatted_text, model, tokenizer,
-                                           conservative_params=False)
-
-    return post_process_summary(summary)
-
-def generate_summary_attempt(formatted_text, model, tokenizer, conservative_params=True):
-    """Generate a summary with specified parameters"""
+    # Adjust generation parameters
     inputs = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
-    params = {
-        "input_ids": inputs["input_ids"],
-        "attention_mask": inputs["attention_mask"],
-        "max_length": 250,  # Increased for better coverage
-        "min_length": 100,  # Increased to ensure comprehensive summary
-        "early_stopping": True,
-        "no_repeat_ngram_size": 3,
-    }
-
-    if conservative_params:
-        params.update({
-            "num_beams": 5,
-            "length_penalty": 1.5,
-            "temperature": 0.7,
-            "top_p": 0.9,
-            "repetition_penalty": 1.5
-        })
-    else:
-        params.update({
-            "num_beams": 4,
-            "length_penalty": 2.0,
-            "temperature": 0.8,
-            "top_p": 0.95,
-            "repetition_penalty": 2.0
-        })
-
     with torch.no_grad():
-        summary_ids = model.generate(**params)
-
-    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        summary_ids = model.generate(
+            **{
+                "input_ids": inputs["input_ids"],
+                "attention_mask": inputs["attention_mask"],
+                "max_length": 200,
+                "min_length": 50,
+                "num_beams": 5,
+                "length_penalty": 1.5,
+                "no_repeat_ngram_size": 3,
+                "temperature": 0.7,
+                "repetition_penalty": 1.5
+            }
+        )
+
+    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+
+    # Post-process the summary
+    processed_summary = post_process_summary(summary)
+
+    # Validate the summary
+    if not validate_summary(processed_summary, text):
+        # If validation fails, try one more time with different parameters
+        with torch.no_grad():
+            summary_ids = model.generate(
+                **{
+                    "input_ids": inputs["input_ids"],
+                    "attention_mask": inputs["attention_mask"],
+                    "max_length": 200,
+                    "min_length": 50,
+                    "num_beams": 4,
+                    "length_penalty": 2.0,
+                    "no_repeat_ngram_size": 4,
+                    "temperature": 0.8,
+                    "repetition_penalty": 2.0
+                }
+            )
+        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+        processed_summary = post_process_summary(summary)
+
+    return processed_summary
 
 def validate_summary(summary, original_text):
-    """..."""
-    if ...:
+    """Validate summary content against original text"""
+    # Check for age inconsistencies
+    age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
+    if len(age_mentions) > 1:  # Multiple age mentions
         return False
-
-    # Extract numerical values from both texts
-    original_numbers = set(re.findall(r'(\d+(?:\.\d+)?)\s*%', original_text))
-    summary_numbers = set(re.findall(r'(\d+(?:\.\d+)?)\s*%', summary))
 
-    # Check that percentages in the summary appear in the original
-    if not summary_numbers.issubset(original_numbers):
+    # Check for repetitive sentences
+    sentences = summary.split('.')
+    unique_sentences = set(s.strip().lower() for s in sentences if s.strip())
+    if len(sentences) - len(unique_sentences) > 1:  # More than one duplicate
         return False
 
-    # Check ...
-    if ...:
+    # Check summary isn't too long or too short compared to original
+    summary_words = len(summary.split())
+    original_words = len(original_text.split())
+    if summary_words < 20 or summary_words > original_words * 0.8:
         return False
 
-    # Verify no hallucinated content
-    sentences = summary.split('.')
-    for sentence in sentences:
-        # Check if key claims in summary are supported by original
-        if sentence.strip() and not is_supported_by_original(sentence, original_text):
-            return False
-
-    return True
-
-def extract_methods(text):
-    """Extract methodology-related terms"""
-    method_keywords = ['study', 'survey', 'analysis', 'trial', 'experiment']
-    methods = []
-    for keyword in method_keywords:
-        pattern = fr'{keyword}\s+\w+'
-        matches = re.findall(pattern, text.lower())
-        methods.extend(matches)
-    return methods
-
-def is_supported_by_original(claim, original):
-    """Check if a claim from summary is supported by original text"""
-    # Remove common filler phrases
-    claim = re.sub(r'(this study|the study|results show|we found that)', '', claim.lower()).strip()
-
-    # Split into key phrases
-    key_phrases = [p.strip() for p in claim.split(' and ')]
-
-    # Check if each key phrase has supporting evidence
-    for phrase in key_phrases:
-        if phrase and not has_supporting_evidence(phrase, original.lower()):
-            return False
     return True
 
-def has_supporting_evidence(phrase, original):
-    """Check if there's supporting evidence for a phrase"""
-    # Convert to word sets for flexible matching
-    phrase_words = set(phrase.split())
-    original_sentences = [set(s.split()) for s in original.split('.')]
-
-    # Check if any sentence contains most of the phrase words
-    return any(len(phrase_words.intersection(sent)) >= len(phrase_words) * 0.7
-               for sent in original_sentences)
-
-def post_process_summary(summary):
-    """Enhanced post-processing of generated summary"""
-    if not summary:
-        return summary
-
-    # Split into sections based on the structured format
-    sections = []
-    current_section = []
-
-    for line in summary.split('\n'):
-        line = line.strip()
-        if any(marker in line.upper() for marker in ['OBJECTIVE:', 'METHODS:', 'RESULTS:', 'CONCLUSION:']):
-            if current_section:
-                sections.append(' '.join(current_section))
-            current_section = [line]
-        elif line:
-            current_section.append(line)
-
-    if current_section:
-        sections.append(' '.join(current_section))
-
-    # Clean up each section
-    cleaned_sections = []
-    for section in sections:
-        # Fix common issues
-        section = re.sub(r'\s+', ' ', section)  # Remove multiple spaces
-        section = re.sub(r'(\d+)\s*%', r'\1%', section)  # Fix percentage formatting
-        section = re.sub(r'(\.|,)\s*(\d)', r'\1 \2', section)  # Fix number spacing
-        cleaned_sections.append(section)
-
-    # Join sections with proper spacing
-    final_summary = '\n'.join(cleaned_sections)
-
-    # Ensure proper ending
-    if final_summary and not final_summary.endswith('.'):
-        final_summary += '.'
-
-    return final_summary
-
 def generate_focused_summary(question, abstracts, model, tokenizer):
     """Generate focused summary based on question"""
     # Preprocess each abstract
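
The support check removed in this commit reduces to a bag-of-words overlap test: a summary phrase counts as supported when some sentence of the original contains at least 70% of the phrase's words. A standalone sketch of that heuristic, lifted verbatim from the removed helper so it can be tried in isolation (the toy strings below are illustrative only, not from the app):

def has_supporting_evidence(phrase, original):
    """Check if there's supporting evidence for a phrase"""
    # Convert to word sets for flexible matching
    phrase_words = set(phrase.split())
    original_sentences = [set(s.split()) for s in original.split('.')]

    # Check if any sentence contains most of the phrase words
    return any(len(phrase_words.intersection(sent)) >= len(phrase_words) * 0.7
               for sent in original_sentences)

original = "the survey included 312 patients. distress was reported by 41% of patients"
print(has_supporting_evidence("distress was reported", original))            # True
print(has_supporting_evidence("patients improved significantly", original))  # False

The new validate_summary drops this per-claim check in favor of cheaper surface tests: repeated age mentions, duplicate sentences, and word-count bounds.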
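
For context, a minimal sketch of how the updated pipeline might be driven end to end. The checkpoint name and the `app` import path are assumptions made for illustration; the diff does not show which model this Space actually loads.

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from app import improve_summary_generation  # the function updated in this commit

# Assumption: any seq2seq summarization checkpoint works here;
# "facebook/bart-large-cnn" is a stand-in, not the Space's confirmed model.
MODEL_NAME = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.to("cuda" if torch.cuda.is_available() else "cpu")

abstract = (
    "BACKGROUND: Distress screening is recommended for cancer patients. "
    "METHODS: We surveyed 312 oncology outpatients at a single center. "
    "RESULTS: 41.3% reported clinically significant distress. "
    "CONCLUSION: Routine screening identifies patients who need support."
)

print(improve_summary_generation(abstract, model, tokenizer))

One caveat: both generate calls set temperature without do_sample=True, and recent transformers releases ignore temperature under pure beam search (emitting a warning), so num_beams, length_penalty, no_repeat_ngram_size, and repetition_penalty are the knobs that actually shape the output here.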