Spaces:

pendar02
/

biomedical

Sleeping

App Files Files Community

pendar02 committed on Jan 15

Commit

3ffe379

verified ·

1 Parent(s): cf44c2f

Update app.py

Browse files

Files changed (1) hide show

app.py +258 -185

app.py CHANGED Viewed

@@ -1,12 +1,19 @@
 import streamlit as st
 import pandas as pd
 import torch
-import re
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from peft import PeftModel
 from text_processing import TextProcessor
 import gc
 from pathlib import Path
 # Configure page
 st.set_page_config(
@@ -26,6 +33,25 @@ if 'processing_started' not in st.session_state:
     st.session_state.processing_started = False
 if 'focused_summary_generated' not in st.session_state:
     st.session_state.focused_summary_generated = False
 def load_model(model_type):
     """Load appropriate model based on type with proper memory management"""
@@ -72,6 +98,26 @@ def load_model(model_type):
         st.error(f"Error loading model: {str(e)}")
         raise
 def cleanup_model(model, tokenizer):
     """Properly cleanup model resources"""
     try:
@@ -82,9 +128,7 @@ def cleanup_model(model, tokenizer):
     except Exception:
         pass
 @st.cache_data
 def process_excel(uploaded_file):
     """Process uploaded Excel file"""
     try:
@@ -119,7 +163,6 @@ def process_excel(uploaded_file):
         st.error("Please check if your file is in the correct Excel format (.xlsx or .xls)")
         return None
 def validate_excel_structure(df):
     """Validate the structure and content of the Excel file"""
     validation_messages = []
@@ -150,147 +193,142 @@ def validate_excel_structure(df):
     return len(validation_messages) == 0, validation_messages
 def preprocess_text(text):
-    """Preprocess text to add appropriate formatting before summarization"""
     if not isinstance(text, str) or not text.strip():
         return text
-    # Split text into sentences (basic implementation)
-    sentences = [s.strip() for s in text.replace('. ', '.\n').split('\n')]
-    # Remove empty sentences
-    sentences = [s for s in sentences if s]
-    # Join with proper line breaks
-    formatted_text = '\n'.join(sentences)
-    return formatted_text
def post_process_summary(summary):
    """Clean up a generated summary and normalise its sentence punctuation.

    The summary is split into sentences, a few known generation artefacts
    are repaired, the first letter of each sentence is upper-cased, and the
    sentences are re-joined with ". " and a single trailing period.

    Args:
        summary: Raw summary text produced by the model.

    Returns:
        The cleaned summary. Falsy input (``None`` or ``""``) is returned
        unchanged.
    """
    if not summary:
        return summary

    # Split only on periods that actually end a sentence (followed by
    # whitespace or the end of the text) so decimals such as "3.5" or
    # "12.5%" are not torn apart.
    fragments = re.split(r'\.+(?:\s+|$)', summary)

    processed_sentences = []
    for fragment in fragments:
        sentence = fragment.strip()
        if not sentence:
            continue

        # Remove redundant words/phrases
        sentence = sentence.replace(" and and ", " and ")
        sentence = sentence.replace("appointment and appointment", "appointment")

        # Fix common grammatical issues
        sentence = sentence.replace("Cancers distress", "Cancer distress")
        sentence = sentence.replace("  ", " ")  # Remove double spaces

        # Upper-case only the first character. str.capitalize() would also
        # lowercase the rest of the sentence and destroy acronyms and
        # proper nouns (e.g. "DNA" -> "dna").
        sentence = sentence[0].upper() + sentence[1:]
        processed_sentences.append(sentence)

    # Join sentences with proper spacing and punctuation
    cleaned_summary = '. '.join(processed_sentences)
    if cleaned_summary and not cleaned_summary.endswith('.'):
        cleaned_summary += '.'
    return cleaned_summary
def improve_summary_generation(text, model, tokenizer):
    """Summarize one abstract, retrying once if the first summary fails validation.

    The abstract is wrapped in a fixed structural prompt, tokenized
    (truncated to 1024 tokens) and passed to ``model.generate``. The decoded
    output is cleaned with ``post_process_summary`` and checked with
    ``validate_summary``; when the check fails, generation is repeated once
    with a second set of parameters and that result is returned as-is.

    Args:
        text: The abstract to summarize.
        model: Seq2seq model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The post-processed summary, or a fixed placeholder message when
        ``text`` is not a non-blank string.
    """
    if not (isinstance(text, str) and text.strip()):
        return "No abstract available to summarize."

    # Structural prompt followed by the pre-formatted abstract.
    prompt_lines = [
        "Summarize this medical research paper following this structure exactly:",
        "1. Background and objectives",
        "2. Methods",
        "3. Key findings with specific numbers/percentages",
        "4. Main conclusions",
        "Original text: ",
    ]
    formatted_text = "\n".join(prompt_lines) + preprocess_text(text)

    encoded = tokenizer(formatted_text, return_tensors="pt", max_length=1024, truncation=True)
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    def _summarize_with(**generation_options):
        """Run one generation pass and return the post-processed text."""
        with torch.no_grad():
            output_ids = model.generate(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                max_length=200,
                min_length=50,
                **generation_options,
            )
        decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return post_process_summary(decoded)

    first_attempt = _summarize_with(
        num_beams=5,
        length_penalty=1.5,
        no_repeat_ngram_size=3,
        temperature=0.7,
        repetition_penalty=1.5,
    )
    if validate_summary(first_attempt, text):
        return first_attempt

    # Validation failed: try one more time with different parameters.
    return _summarize_with(
        num_beams=4,
        length_penalty=2.0,
        no_repeat_ngram_size=4,
        temperature=0.8,
        repetition_penalty=2.0,
    )
def validate_summary(summary, original_text):
    """Run cheap sanity checks on a generated summary.

    Three checks are applied, in order:

    1. More than one "<number> year(s)" mention rejects the summary.
    2. More than one repeated sentence rejects the summary (a single
       repeat is tolerated).
    3. The summary must have at least 20 words and at most 80% of the
       word count of ``original_text``.

    Args:
        summary: The generated summary text.
        original_text: The text the summary was generated from.

    Returns:
        ``True`` when all checks pass, otherwise ``False``.
    """
    # Check for age inconsistencies
    # NOTE(review): this rejects any second "N years" mention, even when it
    # repeats the same value or is a duration rather than an age.
    age_mentions = re.findall(r'(\d+\.?\d*)\s*years?', summary.lower())
    if len(age_mentions) > 1:  # Multiple age mentions
        return False

    # Check for repetitive sentences. Empty fragments (such as the one after
    # the final period) are dropped before counting; otherwise a summary
    # ending in "." would register one phantom duplicate.
    sentences = [s.strip().lower() for s in summary.split('.')]
    sentences = [s for s in sentences if s]
    if len(sentences) - len(set(sentences)) > 1:  # More than one duplicate
        return False

    # Check summary isn't too long or too short compared to original
    summary_words = len(summary.split())
    original_words = len(original_text.split())
    if summary_words < 20 or summary_words > original_words * 0.8:
        return False

    return True
-def generate_focused_summary(question, abstracts, model, tokenizer):
-    """Generate focused summary based on question"""
-    # Preprocess each abstract
-    formatted_abstracts = [preprocess_text(abstract) for abstract in abstracts]
-    combined_input = f"Question: {question} Abstracts: " + " [SEP] ".join(formatted_abstracts)
-    inputs = tokenizer(combined_input, return_tensors="pt", max_length=1024, truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     with torch.no_grad():
@@ -298,15 +336,33 @@ def generate_focused_summary(question, abstracts, model, tokenizer):
             **{
                 "input_ids": inputs["input_ids"],
                 "attention_mask": inputs["attention_mask"],
-                "max_length": 200,
-                "min_length": 50,
                 "num_beams": 4,
                 "length_penalty": 2.0,
-                "early_stopping": True
             }
         )
-    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
 def create_filter_controls(df, sort_column):
     """Create appropriate filter controls based on the selected column"""
@@ -367,6 +423,7 @@ def create_filter_controls(df, sort_column):
     return filtered_df
 def main():
     st.title("🔬 Biomedical Papers Analysis")
@@ -429,26 +486,26 @@ def main():
                 # Individual Summaries Section
                 st.header("📝 Individual Paper Summaries")
                 # Generate summaries if not already done
                 if st.session_state.summaries is None:
                     try:
                         with st.spinner("Generating individual paper summaries..."):
-                            model, tokenizer = load_model("summarize")
-                            summaries = []
-                            progress_bar = st.progress(0)
-                            for idx, abstract in enumerate(df['Abstract']):
-                                summary = improve_summary_generation(abstract, model, tokenizer)
-                                summaries.append(summary)
-                                progress_bar.progress((idx + 1) / len(df))
-                            st.session_state.summaries = summaries
-                            cleanup_model(model, tokenizer)
-                            progress_bar.empty()
                     except Exception as e:
                         st.error(f"Error generating summaries: {str(e)}")
-                        st.session_state.processing_started = False
                 # Display summaries with improved sorting and filtering
                 if st.session_state.summaries is not None:
@@ -543,7 +600,7 @@ def main():
                                 </div>
                             </div>
                             """, unsafe_allow_html=True)
                         with paper_info_cols[1]:  # SUMMARY column
                             st.markdown('<div class="paper-section"><div class="section-header">SUMMARY</div>', unsafe_allow_html=True)
                             st.markdown(f"""
@@ -554,54 +611,68 @@ def main():
                         # Add spacing between papers
                         st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
-                # Question-focused Summary Section (only if question provided)
-                if question.strip():
-                    st.header("❓ Question-focused Summary")
-                    if not st.session_state.get('focused_summary_generated', False):
-                        try:
-                            with st.spinner("Analyzing relevant papers..."):
-                                # Initialize text processor if needed
-                                if st.session_state.text_processor is None:
-                                    st.session_state.text_processor = TextProcessor()
-                                # Find relevant abstracts
-                                results = st.session_state.text_processor.find_most_relevant_abstracts(
-                                    question,
-                                    df['Abstract'].tolist(),
-                                    top_k=5
-                                )
-                                # Load question-focused model
-                                model, tokenizer = load_model("question_focused")
-                                # Generate focused summary
-                                relevant_abstracts = df['Abstract'].iloc[results['top_indices']].tolist()
-                                focused_summary = generate_focused_summary(
-                                    question,
-                                    relevant_abstracts,
-                                    model,
-                                    tokenizer
-                                )
-                                # Store results
-                                st.session_state.focused_summary = focused_summary
-                                st.session_state.relevant_papers = df.iloc[results['top_indices']]
-                                st.session_state.relevance_scores = results['scores']
-                                st.session_state.focused_summary_generated = True
-                                # Cleanup second model
-                                cleanup_model(model, tokenizer)
-                        except Exception as e:
-                            st.error(f"Error generating focused summary: {str(e)}")
                     # Display focused summary results
                     if st.session_state.get('focused_summary_generated', False):
                         st.subheader("Summary")
                         st.write(st.session_state.focused_summary)
                         st.subheader("Most Relevant Papers")
                         relevant_papers = st.session_state.relevant_papers[
                             ['Article Title', 'Authors', 'Publication Year', 'DOI']
@@ -609,6 +680,8 @@ def main():
                         relevant_papers['Relevance Score'] = st.session_state.relevance_scores
                         relevant_papers['Publication Year'] = relevant_papers['Publication Year'].astype(int)
                         st.dataframe(relevant_papers, hide_index=True)
 if __name__ == "__main__":
     main()

 import streamlit as st
 import pandas as pd
 import torch
+import re
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 from peft import PeftModel
 from text_processing import TextProcessor
 import gc
 from pathlib import Path
+import concurrent.futures
+import time
+import nltk
+from nltk.tokenize import sent_tokenize
+from concurrent.futures import ThreadPoolExecutor  # Add this import
+nltk.download('punkt')
 # Configure page
 st.set_page_config(
     st.session_state.processing_started = False
 if 'focused_summary_generated' not in st.session_state:
     st.session_state.focused_summary_generated = False
+if 'current_model' not in st.session_state:
+    st.session_state.current_model = None
+if 'current_tokenizer' not in st.session_state:
+    st.session_state.current_tokenizer = None
+if 'model_type' not in st.session_state:
+    st.session_state.model_type = None
# Use the project's TextProcessor when it is importable; otherwise fall back
# to a minimal stand-in that treats the first ``top_k`` abstracts as relevant.
try:
    from text_processing import TextProcessor
except ImportError:
    class TextProcessor:
        """Fallback relevance ranker used when ``text_processing`` is missing."""

        def find_most_relevant_abstracts(self, question, abstracts, top_k=5):
            """Return the first ``top_k`` abstract positions, each scored 1.0.

            ``question`` is accepted for interface compatibility but is not
            used by this fallback.
            """
            selected = min(top_k, len(abstracts))
            return {
                'top_indices': list(range(selected)),
                'scores': [1.0] * selected,
            }
 def load_model(model_type):
     """Load appropriate model based on type with proper memory management"""
         st.error(f"Error loading model: {str(e)}")
         raise
def get_model(model_type):
    """Return the cached model/tokenizer for ``model_type``, loading it if needed.

    The model is kept in ``st.session_state`` (``current_model``,
    ``current_tokenizer``, ``model_type``). When no model is cached, or the
    cached one is of a different type, any cached model is passed to
    ``cleanup_model`` and a new one is loaded with ``load_model``.

    Args:
        model_type: Identifier passed through to ``load_model``.

    Returns:
        A ``(model, tokenizer)`` tuple, or ``(None, None)`` when loading
        fails. On failure an error is shown via ``st.error`` and
        ``st.session_state.processing_started`` is reset to ``False``.
    """
    state = st.session_state
    try:
        needs_load = state.current_model is None or state.model_type != model_type
        if needs_load:
            # Clean up existing model
            if state.current_model is not None:
                cleanup_model(state.current_model, state.current_tokenizer)
                # Forget the cleaned-up objects right away. If load_model()
                # fails below, the cache must not keep handing out a model
                # that has already been through cleanup.
                state.current_model = None
                state.current_tokenizer = None
                state.model_type = None

            # Load new model
            model, tokenizer = load_model(model_type)
            state.current_model = model
            state.current_tokenizer = tokenizer
            state.model_type = model_type

        return state.current_model, state.current_tokenizer
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        st.session_state.processing_started = False
        return None, None
 def cleanup_model(model, tokenizer):
     """Properly cleanup model resources"""
     try:
     except Exception:
         pass
 @st.cache_data
 def process_excel(uploaded_file):
     """Process uploaded Excel file"""
     try:
         st.error("Please check if your file is in the correct Excel format (.xlsx or .xls)")
         return None
 def validate_excel_structure(df):
     """Validate the structure and content of the Excel file"""
     validation_messages = []
     return len(validation_messages) == 0, validation_messages
 def preprocess_text(text):
+    """Enhanced text preprocessing with improved header and list handling"""
     if not isinstance(text, str) or not text.strip():
         return text
+    # Initial cleanup
+    text = re.sub(r'\s+', ' ', text.strip())
+    # Standardize case for specific terms (e.g., PRIME -> Prime)
+    text = re.sub(r'\b([A-Z]{2,})\b', lambda m: m.group(1).title(), text)
+    # Fix spacing around punctuation and parentheses
+    text = re.sub(r'\s*:\s*', ': ', text)
+    text = re.sub(r'\s*,\s*', ', ', text)
+    text = re.sub(r'\(\s*([ivx\d]+)\s*\)', r'(\1)', text)
+    # Convert numbered lists to consistent format
+    text = re.sub(r'(?m)^\s*(\d+)\.\s*', r'(\1) ', text)
+    # Normalize section headers (using comprehensive patterns)
+    section_patterns = {
+        r'\b(?:Introduction|Background|Objectives|Purpose|Context)\s*:': 'Background and Objectives: ',
+        r'\b(?:Methods|Materials and Methods|Approach|Study Design|Experimental Design)\s*:': 'Methods: ',
+        r'\b(?:Results|Findings|Observations|Key Findings)\s*:': 'Results: ',
+        r'\b(?:Discussion|Analysis|Implications|Interpretation)\s*:': 'Discussion: ',
+        r'\b(?:Conclusion|Conclusions|Summary|Final Remarks)\s*:': 'Conclusions: '
+    }
+    # Remove nested headers
+    nested_header_pattern = r'\d+\.\s*(?:Background|Objectives|Methods|Results|Discussion|Conclusions)\s*:'
+    text = re.sub(nested_header_pattern, '', text)
+    # Standardize section headers
+    for pattern, replacement in section_patterns.items():
+        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
+    # Split merged section headers
+    text = re.sub(r'(?i)Results\s+and\s+Conclusions:', 'Results: ', text)
+    # Handle special characters and normalize spacing
+    text = re.sub(r'[“”]', '"', text)  # Correctly handle double quotes
+    text = re.sub(r"[‘’]", "'", text)  # Correctly handle single quotes
+    text = re.sub(r'\s*-\s*', '-', text)
+    # Tokenize and capitalize sentences
+    sentences = re.split(r'(?<=\w[.!?])\s+|\n(?=\d+\.|\(\w+\)|-)', text)
+    formatted_sentences = [s.strip().capitalize() for s in sentences if s.strip()]
+    return ' '.join(formatted_sentences)
def post_process_summary(summary):
    """Tidy a generated summary into a single, period-terminated paragraph.

    Section headers (Background, Objectives, Methods, Results, Conclusions)
    are stripped, spacing around list markers, sentence boundaries and
    colons is normalised, very short lines are dropped, and repeated
    periods are collapsed.

    Args:
        summary: Raw summary text produced by the model.

    Returns:
        The cleaned summary. Falsy input is returned unchanged, and an
        empty string is returned when nothing but headers/whitespace was
        present.
    """
    if not summary:
        return summary

    # Step 1: Remove section headers.
    # NOTE(review): this strips every matching header, not only empty or
    # repeated ones; confirm that is the intended output format.
    summary = re.sub(r'\b(?:Background|Objectives|Methods|Results|Conclusions)\s*:\s*\.?\s*', '', summary)

    # Step 2: Fix spacing issues in lists and parentheses
    summary = re.sub(r'\(\s*([ivx\d]+)\s*\)', r'(\1)', summary)  # Fix space inside parentheses
    summary = re.sub(r'\s*,\s*(\([ivx\d]+\))', r', \1', summary)  # Fix spacing before list items

    # Step 3: Ensure proper punctuation and spacing
    summary = re.sub(r'(?<=[.!?])\s*([A-Z])', r' \1', summary)  # Add space after punctuation
    summary = re.sub(r'\s*:\s*', ': ', summary)  # Fix spacing around colons

    # Step 4: Remove lines with too little content (three words or fewer).
    # If that would discard the entire summary, keep the short lines rather
    # than returning nothing but a period.
    lines = [line.strip() for line in summary.split('\n')]
    lines = [line for line in lines if line]
    substantial = [line for line in lines if len(line.split()) > 3]
    summary = ' '.join(substantial or lines)

    # Step 5: Remove multiple periods
    summary = re.sub(r'\.\.+', '.', summary)

    # Step 6: Ensure a non-empty summary ends with a single period
    summary = summary.strip()
    if summary and not summary.endswith('.'):
        summary += '.'
    return summary
+def generate_focused_summary(question, abstracts, model, tokenizer):
+    """Generate a structured summary based on the given question and abstracts."""
+    # Preprocess and clean abstracts
+    formatted_abstracts = [preprocess_text(abstract) for abstract in abstracts if abstract.strip()]
+    if not formatted_abstracts:
+        raise ValueError("Abstracts list is empty or improperly formatted.")
+    # Join abstracts with separator
+    abstracts_content = " [SEP] ".join(formatted_abstracts)
+    # Create the prompt
+    prompt = f"""
+    Generate a structured summary based on the given abstracts and the question. Follow these rules STRICTLY:
+    **QUESTION:** {question}
+    **SECTION FORMATTING RULES:**
+    1. Each section MUST start with the section name followed by ": " (e.g., "Background: ").
+    2. Each section MUST end with a period.
+    3. Write complete, grammatically correct sentences.
+    4. Do not use bullet points, lists, or combined section headers.
+    5. Maintain the exact order of sections: Background, Objectives, Methods, Results, Conclusions.
+    6. Avoid redundancies, incomplete thoughts, and cutting sentences mid-way.
+    7. Use transition words (e.g., "Additionally," "Furthermore," "Moreover") to connect ideas naturally.
+    **REQUIRED SECTIONS AND CONTENT:**
+    1. **Background**:
+       - Provide the context and motivation for the study.
+       - Do not mention objectives, methods, or results in this section.
+    2. **Objectives**:
+       - Clearly state the aim(s) of the study.
+       - Avoid referencing any methods or findings.
+    3. **Methods**:
+       - Describe the approach, tools, and procedures used.
+       - Do not include any findings or results in this section.
+    4. **Results**:
+       - Summarize the key findings, including relevant statistics and outcomes.
+       - Mention implications only if explicitly stated in the abstracts.
+    5. **Conclusions**:
+       - Highlight the overall interpretation of findings.
+       - Emphasize the significance and implications of the study.
+    **CRITICAL FORMAT RULES:**
+    1. Each section title must be followed by a colon and a space.
+    2. All sentences must be grammatically complete and coherent.
+    3. Avoid bullet points, lists, and repeated sections.
+    4. End each section with a period.
+    **INPUT ABSTRACTS:** {abstracts_content}
+    """
+    # Tokenize input (use the correct variable `prompt` here)
+    inputs = tokenizer(prompt,
+                       return_tensors="pt",
+                       max_length=1024,
+                       truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     with torch.no_grad():
             **{
                 "input_ids": inputs["input_ids"],
                 "attention_mask": inputs["attention_mask"],
+                "max_length": 280,
+                "min_length": 100,
                 "num_beams": 4,
                 "length_penalty": 2.0,
+                "no_repeat_ngram_size": 2,
+                "temperature": 0.7,
+                "do_sample": False
             }
         )
+    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+    return post_process_summary(summary)
def process_papers_in_batches(df, model, tokenizer, batch_size=2):
    """Summarize every abstract in ``df['Abstract']`` using a thread pool.

    Each usable abstract is submitted to ``generate_focused_summary`` with a
    fixed generic question. Results are returned in the same order as the
    rows of ``df``.

    Abstracts that are not strings or are blank (for example empty spreadsheet
    cells read as NaN) are not sent to the model; they receive a placeholder
    message instead, so a single bad row cannot abort the whole run.

    Args:
        df: DataFrame with an ``'Abstract'`` column.
        model: Model passed through to ``generate_focused_summary``.
        tokenizer: Tokenizer passed through to ``generate_focused_summary``.
        batch_size: Currently unused; kept for backward compatibility with
            existing callers. Concurrency is fixed at four worker threads.

    Returns:
        A list with one summary string per row of ``df``.
    """
    abstracts = df['Abstract'].tolist()
    summaries = ["No abstract available to summarize."] * len(abstracts)

    with ThreadPoolExecutor(max_workers=4) as executor:  # Parallel processing
        pending = {
            position: executor.submit(
                generate_focused_summary,
                "Focus on key findings and methods.",
                [abstract],
                model,
                tokenizer,
            )
            for position, abstract in enumerate(abstracts)
            if isinstance(abstract, str) and abstract.strip()
        }
        # Collect by original position so output order matches the input rows.
        for position, future in pending.items():
            summaries[position] = future.result()

    return summaries
 def create_filter_controls(df, sort_column):
     """Create appropriate filter controls based on the selected column"""
     return filtered_df
 def main():
     st.title("🔬 Biomedical Papers Analysis")
                 # Individual Summaries Section
                 st.header("📝 Individual Paper Summaries")
                 # Generate summaries if not already done
                 if st.session_state.summaries is None:
                     try:
                         with st.spinner("Generating individual paper summaries..."):
+                            model, tokenizer = get_model("summarize")
+                            if model is None or tokenizer is None:
+                                reset_processing_state()
+                                return
+                            start_time = time.time()
+                            st.session_state.summaries = process_papers_in_batches(
+                                df, model, tokenizer, batch_size=2
+                            )
+                            end_time = time.time()
+                            st.write(f"Processing time: {end_time - start_time:.2f} seconds")
                     except Exception as e:
                         st.error(f"Error generating summaries: {str(e)}")
+                        reset_processing_state()
                 # Display summaries with improved sorting and filtering
                 if st.session_state.summaries is not None:
                                 </div>
                             </div>
                             """, unsafe_allow_html=True)
                         with paper_info_cols[1]:  # SUMMARY column
                             st.markdown('<div class="paper-section"><div class="section-header">SUMMARY</div>', unsafe_allow_html=True)
                             st.markdown(f"""
                         # Add spacing between papers
                         st.markdown("<div style='margin-bottom: 20px;'></div>", unsafe_allow_html=True)
+                    # Question-focused Summary Section (only if question provided)
+                    if question.strip():
+                        st.header("❓ Question-focused Summary")
+                        if not st.session_state.get('focused_summary_generated', False):
+                            try:
+                                with st.spinner("Analyzing relevant papers..."):
+                                    # Initialize text processor if needed
+                                    if st.session_state.text_processor is None:
+                                        st.session_state.text_processor = TextProcessor()
+                                    # Validate question
+                                    if not question.strip():
+                                        st.warning("Please enter a question first")
+                                        return
+                                    # Find relevant abstracts
+                                    results = st.session_state.text_processor.find_most_relevant_abstracts(
+                                        question,
+                                        df['Abstract'].tolist(),
+                                        top_k=5
+                                    )
+                                    if not results['top_indices']:
+                                        st.warning("No relevant papers found for your question")
+                                        return
+                                    # Load question-focused model
+                                    model, tokenizer = get_model("question_focused")
+                                    if model is None or tokenizer is None:
+                                        return
+                                    # Generate focused summary
+                                    try:
+                                        relevant_abstracts = df['Abstract'].iloc[results['top_indices']].tolist()
+                                        focused_summary = generate_focused_summary(
+                                            question,
+                                            relevant_abstracts,
+                                            model,
+                                            tokenizer
+                                        )
+                                        # Store results
+                                        st.session_state.focused_summary = focused_summary
+                                        st.session_state.relevant_papers = df.iloc[results['top_indices']]
+                                        st.session_state.relevance_scores = results['scores']
+                                        st.session_state.focused_summary_generated = True
+                                    finally:
+                                        # Cleanup second model
+                                        cleanup_model(model, tokenizer)
+                            except Exception as e:
+                                st.error(f"Error generating focused summary: {str(e)}")
+                                reset_processing_state()
                     # Display focused summary results
                     if st.session_state.get('focused_summary_generated', False):
                         st.subheader("Summary")
                         st.write(st.session_state.focused_summary)
                         st.subheader("Most Relevant Papers")
                         relevant_papers = st.session_state.relevant_papers[
                             ['Article Title', 'Authors', 'Publication Year', 'DOI']
                         relevant_papers['Relevance Score'] = st.session_state.relevance_scores
                         relevant_papers['Publication Year'] = relevant_papers['Publication Year'].astype(int)
                         st.dataframe(relevant_papers, hide_index=True)
 if __name__ == "__main__":
     main()