import logging
from datetime import datetime
from pathlib import Path

import gradio as gr
import pandas as pd
from dotenv import load_dotenv

from pipeline.llm_service import LLMService
from pipeline.process import process_texts
from pipeline.progressive_ui import ProgressiveUI, create_progressive_callback
from pipeline.visualize import (
    generate_visualizations,
    generate_word_count_chart,
    generate_vocab_containment_chart,
)
from theme import tibetan_theme

# Load environment variables from .env file
load_dotenv()

logger = logging.getLogger(__name__)


def main_interface():
    # Theme and CSS applied here for Gradio 5.x compatibility.
    # For Gradio 6.x, these will move to launch() - see migration guide.
    with gr.Blocks(
        theme=tibetan_theme,
        css=tibetan_theme.get_css_string(),
        title="Tibetan Text Metrics Web App",
    ) as demo:
        gr.Markdown(
            """# Tibetan Text Metrics

Compare Tibetan texts to discover how similar they are. This tool helps scholars identify shared passages, textual variations, and relationships between different versions of Tibetan manuscripts. Part of the TTM project.
""",
            elem_classes="gr-markdown",
        )
        with gr.Row(elem_id="steps-row"):
            with gr.Column(scale=1, elem_classes="step-column"):
                with gr.Group(elem_classes="step-box"):
                    gr.Markdown(
                        """## Step 1: Upload Your Texts

Upload two or more Tibetan text files (.txt format). If your texts have chapters, separate them with the ༈ marker so the tool can compare chapter-by-chapter.
""",
                        elem_classes="gr-markdown",
                    )
                    file_input = gr.File(
                        label="Choose your Tibetan text files",
                        file_types=[".txt"],
                        file_count="multiple",
                    )
                    gr.Markdown(
                        "Tip: Files should be under 1MB for best performance (the hard limit is 10MB per file). Use UTF-8 encoded .txt files.",
                        elem_classes="gr-markdown",
                    )
            with gr.Column(scale=1, elem_classes="step-column"):
                with gr.Group(elem_classes="step-box"):
                    gr.Markdown(
                        """## Step 2: Choose Analysis Type

Pick a preset for quick results, or use Custom for full control.
""",
                        elem_classes="gr-markdown",
                    )
                    with gr.Tabs():
                        # ===== QUICK START TAB =====
                        with gr.Tab("Quick Start", id="quick_tab"):
                            analysis_preset = gr.Radio(
                                label="What kind of analysis do you need?",
                                choices=[
                                    "Standard — Vocabulary + Sequences + Fuzzy matching",
                                    "Deep — All metrics including AI meaning analysis",
                                    "Quick — Vocabulary overlap only (fastest)",
                                ],
                                value="Standard — Vocabulary + Sequences + Fuzzy matching",
                                info="Standard is recommended for most users. Deep analysis takes longer but finds texts with similar meaning even when words differ.",
                            )
                            gr.Markdown(
                                """
**What each preset includes:**

| Preset | Jaccard | LCS | Fuzzy | Semantic AI |
|--------|---------|-----|-------|-------------|
| Standard | ✓ | ✓ | ✓ | — |
| Deep | ✓ | ✓ | ✓ | ✓ |
| Quick | ✓ | — | — | — |
""",
                                elem_classes="preset-table",
                            )
                            process_btn_quick = gr.Button(
                                "Compare My Texts",
                                elem_id="run-btn-quick",
                                variant="primary",
                            )
                        # ===== CUSTOM TAB =====
                        with gr.Tab("Custom", id="custom_tab"):
                            gr.Markdown(
                                "**Fine-tune each metric and option:**",
                                elem_classes="custom-header",
                            )
                            with gr.Accordion("Lexical Metrics", open=True):
                                gr.Markdown("*Compare the actual words used in texts*")
                                tokenization_mode_dropdown = gr.Dropdown(
                                    label="How to split text?",
                                    choices=[
                                        "word - Whole words (recommended)",
                                        "syllable - Individual syllables (finer detail)",
                                    ],
                                    value="word - Whole words (recommended)",
                                    info="'Word' keeps multi-syllable words together — recommended for Jaccard.",
                                )
                                stopwords_dropdown = gr.Dropdown(
                                    label="Filter common words?",
                                    choices=[
                                        "None (No filtering)",
                                        "Standard (Common particles only)",
                                        "Aggressive (All function words)",
                                    ],
                                    value="Standard (Common particles only)",
                                    info="Remove common particles (གི, ལ, ནི) before comparing.",
                                )
                                particle_normalization_checkbox = gr.Checkbox(
                                    label="Normalize grammatical particles?",
                                    value=False,
                                    info="Treat variants as equivalent (གི/ཀྱི/གྱི → གི). Useful for different scribal conventions.",
                                )
                            with gr.Accordion("Sequence Matching (LCS)", open=True):
                                gr.Markdown("*Find shared passages in the same order*")
                                # Display-only toggle: LCS is always computed as a
                                # core metric, so this checkbox is not wired to the
                                # pipeline and is intentionally not assigned to a
                                # variable.
                                gr.Checkbox(
                                    label="Enable sequence matching",
                                    value=True,
                                    info="Finds the longest sequence of words appearing in both texts.",
                                )
                                lcs_normalization_dropdown = gr.Dropdown(
                                    label="How to handle different text lengths?",
                                    choices=[
                                        "avg - Balanced comparison (default)",
                                        "min - Detect if one text contains the other",
                                        "max - Stricter, penalizes length differences",
                                    ],
                                    value="avg - Balanced comparison (default)",
                                    info="'min' is useful for finding quotes or excerpts.",
                                )
                            with gr.Accordion("Fuzzy Matching", open=True):
                                gr.Markdown("*Detect similar but not identical text*")
                                fuzzy_toggle_radio = gr.Radio(
                                    label="Find approximate matches?",
                                    choices=["Yes", "No"],
                                    value="Yes",
                                    info="Useful for spelling variations and scribal differences.",
                                )
                                fuzzy_method_dropdown = gr.Dropdown(
                                    label="Matching method",
                                    choices=[
                                        "ngram - Syllable pairs (recommended)",
                                        "syllable_edit - Count syllable changes",
                                        "weighted_jaccard - Word frequency comparison",
                                    ],
                                    value="ngram - Syllable pairs (recommended)",
                                    info="All options work at the Tibetan syllable level.",
                                )
                            with gr.Accordion("Semantic Analysis", open=False):
                                gr.Markdown("*Compare meaning using AI (slower)*")
                                semantic_toggle_radio = gr.Radio(
                                    label="Analyze meaning similarity?",
                                    choices=["Yes", "No"],
                                    value="No",
                                    info="Finds texts that say similar things in different words.",
                                )
                                model_dropdown = gr.Dropdown(
                                    choices=[
                                        "buddhist-nlp/buddhist-sentence-similarity",
                                        "buddhist-nlp/bod-eng-similarity",
                                        "sentence-transformers/LaBSE",
                                        "BAAI/bge-m3",
                                    ],
                                    label="AI Model",
                                    value="buddhist-nlp/buddhist-sentence-similarity",
                                    info="'buddhist-sentence-similarity' works best for Buddhist texts.",
                                )
                                batch_size_slider = gr.Slider(
                                    minimum=1,
                                    maximum=64,
                                    value=8,
                                    step=1,
                                    label="Processing batch size",
                                    info="Higher = faster but uses more memory.",
                                )
                                progress_bar_checkbox = gr.Checkbox(
                                    label="Show detailed progress",
                                    value=False,
                                    info="See step-by-step progress during analysis.",
                                )
                            process_btn_custom = gr.Button(
                                "Compare My Texts (Custom)",
                                elem_id="run-btn-custom",
                                variant="primary",
                            )
                            # Note: both process_btn_quick and process_btn_custom
                            # are wired to the pipeline below.

        gr.Markdown(
            """## Results
""",
            elem_classes="gr-markdown",
        )
        # heatmap_titles and metric_tooltips are defined below, after the
        # results and interpretation components.
        csv_output = gr.File(label="📥 Download Full Results (CSV spreadsheet)")
        metrics_preview = gr.Dataframe(
            label="Results Summary — Compare chapters across your texts",
            interactive=False,
            visible=True,
        )

        # States for data persistence
        state_text_data = gr.State()
        state_df_results = gr.State()

        # LLM Interpretation components
        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    "## Get Expert Insights\n*Let AI help you understand what the numbers mean and what patterns they reveal about your texts.*",
                    elem_classes="gr-markdown",
                )

        # Add the interpret button
        with gr.Row():
            interpret_btn = gr.Button(
                "📊 Explain My Results",
                variant="primary",
                elem_id="interpret-btn",
            )

        # Create a placeholder message with proper formatting and structure
        initial_message = """
## Understanding Your Results

*After running the analysis, click "Explain My Results" to get a plain-language interpretation of what the similarity scores mean for your texts.*
"""
        interpretation_output = gr.Markdown(
            value=initial_message,
            elem_id="llm-analysis",
        )

        # Heatmap tabs for each metric
        heatmap_titles = {
            "Jaccard Similarity (%)": "Shows how much vocabulary the texts share. Higher = more words in common.",
            "Normalized LCS": "Shows shared phrases in the same order. Higher = more passages appear in both texts.",
            "Fuzzy Similarity": "Finds similar text even with spelling differences. Higher = more alike.",
            "Semantic Similarity": "Compares actual meaning using AI. Higher = texts say similar things.",
            "Word Counts": "How long is each section? Helps you understand text structure.",
            "Vocabulary Containment": "What % of one text's vocabulary appears in the other?",
        }

        metric_tooltips = {
            "Jaccard Similarity (%)": """
### Vocabulary Overlap (Jaccard Similarity)

**What it measures:** How many unique words appear in both texts.

**How to read it:** A score of 70% means 70% of all unique words found in either text appear in both. Higher scores = more shared vocabulary.

**What it tells you:**
- High scores (>70%): Texts use very similar vocabulary — possibly the same source or direct copying
- Medium scores (40-70%): Texts share significant vocabulary — likely related topics or traditions
- Low scores (<40%): Texts use different words — different sources or heavily edited versions

**Good to know:** This metric ignores word order and how often words repeat. It only asks "does this word appear in both texts?"

**Tips:**
- Use the "Filter common words" option to focus on meaningful content words rather than grammatical particles.
- **Word mode is recommended** for Jaccard. Syllable mode may inflate scores because common syllables (like ས, ར, ན) appear in many different words.
""",
            "Fuzzy Similarity": """
### Approximate Matching (Fuzzy Similarity)

**What it measures:** How similar texts are, even when they're not exactly the same.

**How to read it:** Scores from 0 to 1. Higher = more similar. A score of 0.85 means the texts are 85% alike.

**What it tells you:**
- High scores (>0.8): Very similar texts with minor differences (spelling, small edits)
- Medium scores (0.5-0.8): Noticeably different but clearly related
- Low scores (<0.5): Substantially different texts

**Why it matters for Tibetan texts:**
- Catches spelling variations between manuscripts
- Finds scribal differences and regional conventions
- Identifies passages that were slightly modified

**Recommended methods:**
- **Syllable pairs (ngram)**: Best for Tibetan — compares pairs of syllables
- **Count syllable changes**: Good for finding minor edits
- **Word frequency**: Useful when certain words repeat often
""",
            "Normalized LCS": """
### Shared Sequences (Longest Common Subsequence)

**What it measures:** The longest chain of words that appears in both texts *in the same order*.

**How to read it:** Higher scores mean longer shared passages. A score of 0.6 means 60% of the text follows the same word sequence.

**Example:** If Text A says "the quick brown fox" and Text B says "the lazy brown dog", the shared sequence is "the brown" — words that appear in both, in the same order.

**What it tells you:**
- High scores (>0.6): Texts share substantial passages — likely direct copying or common source
- Medium scores (0.3-0.6): Some shared phrasing — possibly related traditions
- Low scores (<0.3): Different word ordering — independent compositions or heavy editing

**Why this is different from vocabulary overlap:**
- Vocabulary overlap asks: "Do they use the same words?"
- Sequence matching asks: "Do they say things in the same order?"

Two texts might share many words (high Jaccard) but arrange them differently (low LCS), suggesting they discuss similar topics but were composed independently.
""",
            "Semantic Similarity": """
### Meaning Similarity (Semantic Analysis)

**What it measures:** Whether texts convey similar *meaning*, even if they use different words.

**How to read it:** Scores from 0 to 1. Higher = more similar meaning. A score of 0.8 means the texts express very similar ideas.

**What it tells you:**
- High scores (>0.75): Texts say similar things, even if worded differently
- Medium scores (0.5-0.75): Related topics or themes
- Low scores (<0.5): Different subject matter

**How it works:** An AI model (trained on Buddhist texts) reads both passages and judges how similar their meaning is. This catches similarities that word-matching would miss.

**When to use it:**
- Finding paraphrased passages
- Identifying texts that discuss the same concepts differently
- Comparing translations or commentaries

**Note:** This takes longer to compute but provides insights the other metrics can't.
""",
            "Word Counts": """
### Text Length by Section

**What it shows:** How many words are in each chapter or section of your texts.

**How to read it:** Taller bars = longer sections. Compare bars to see which parts of your texts are longer or shorter.

**What it tells you:**
- Similar bar heights across texts suggest similar structure
- Very different lengths might explain why similarity scores vary
- Helps identify which sections to examine more closely

**Tip:** If one text has much longer chapters, it might contain additional material not in the other version.
""",
            "Vocabulary Containment": """
### Vocabulary Containment (Directional)

**What it shows:** What percentage of one text's unique vocabulary appears in the other text.

**How to read it:**
- "Text A → Text B" means: "What % of Text A's vocabulary is found in Text B?"
- 90% means 90% of the unique words in the source text also appear in the target text

**What it tells you:**
- If Text A → Text B is 95% but Text B → Text A is 60%, then Text B contains almost all of Text A's vocabulary plus additional words
- This suggests Text B might be an expansion or commentary on Text A
- Asymmetric containment often indicates a base text + commentary relationship

**Useful for:**
- Identifying which text is the "base" (shorter vocabulary fully contained in longer text)
- Understanding directionality of textual relationships
- Distinguishing between shared sources vs. one text derived from another

**Tip:** Unlike Jaccard (which is symmetric), containment is directional — it tells you which text's vocabulary is "inside" the other.
""",
            # Retained for reference even though the structural analysis tab
            # itself was removed (see the dedicated collation app).
            "Structural Analysis": """
### How Texts Relate to Each Other

**What it shows:** An overview of how your text sections connect and relate across documents.

**What it tells you:**
- Which sections are most similar to each other
- Possible patterns of copying or shared sources
- How texts might have evolved or been edited over time

**Useful for:**
- Understanding textual transmission history
- Identifying which version might be older or more original
- Finding sections that were added, removed, or modified

**Note:** This analysis combines all the other metrics to give you the big picture.
""",
        }

        heatmap_tabs = {}
        gr.Markdown("## Visual Comparison", elem_classes="gr-markdown")
        with gr.Tabs(elem_id="heatmap-tab-group"):
            # Process all metrics listed in heatmap_titles
            metrics_to_display = heatmap_titles
            # CSS classes for the per-metric info accordions
            metric_css_classes = {
                "Jaccard Similarity (%)": "metric-info-accordion jaccard-info",
                "Normalized LCS": "metric-info-accordion lcs-info",
                "Fuzzy Similarity": "metric-info-accordion fuzzy-info",
                "Semantic Similarity": "metric-info-accordion semantic-info",
                "Word Counts": "metric-info-accordion wordcount-info",
                "Vocabulary Containment": "metric-info-accordion vocabcontain-info",
            }
            for metric_key, descriptive_title in metrics_to_display.items():
                with gr.Tab(metric_key):
                    # Known metrics get a metric-specific CSS class and a
                    # generic accordion title; anything else falls back.
                    if metric_key in metric_css_classes:
                        css_class = metric_css_classes[metric_key]
                        accordion_title = "ℹ️ What does this mean?"
                    else:
                        css_class = "metric-info-accordion"
                        accordion_title = f"ℹ️ About {metric_key}"

                    # Create the accordion with appropriate content
                    with gr.Accordion(accordion_title, open=False, elem_classes=css_class):
                        if metric_key == "Word Counts":
                            gr.Markdown("""
### Text Length by Section

This chart shows how many words are in each chapter or section. Taller bars = longer sections.

**Why it matters:** If sections have very different lengths, it might explain differences in similarity scores.
""") elif metric_key in metric_tooltips: gr.Markdown(value=metric_tooltips[metric_key], elem_classes="metric-description") else: gr.Markdown(value=f"### {metric_key}\nDescription not found.") # Add the appropriate plot if metric_key == "Word Counts": word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description") elif metric_key == "Vocabulary Containment": vocab_containment_plot = gr.Plot(label="Vocabulary Containment per Chapter", show_label=False, scale=1, elem_classes="metric-description") else: heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap") # Structural Analysis Tab # Structural analysis tab removed - see dedicated collation app # For now, this modification focuses on creating the plot object and making it an output. # The visual placement depends on how Gradio renders children of gr.Tab or if there's another container. warning_box = gr.Markdown(visible=False) # Create a container for metric progress indicators with gr.Row(visible=False) as progress_container: # Progress indicators will be created dynamically by ProgressiveUI gr.Markdown("Metric progress will appear here during analysis") def run_pipeline(files, enable_semantic, enable_fuzzy, fuzzy_method, lcs_normalization, model_name, tokenization_mode, stopwords_option, normalize_particles, batch_size, show_progress, progress=gr.Progress()): """Processes uploaded files, computes metrics, generates visualizations, and prepares outputs for the UI. Args: files: A list of file objects uploaded by the user. enable_semantic: Whether to compute semantic similarity. enable_fuzzy: Whether to compute fuzzy string similarity. fuzzy_method: The fuzzy matching method to use. model_name: Name of the embedding model to use. tokenization_mode: How to tokenize text (syllable or word). stopwords_option: Stopword filtering level (None, Standard, or Aggressive). normalize_particles: Whether to normalize grammatical particles. batch_size: Batch size for embedding generation. show_progress: Whether to show progress bars during embedding. progress: Gradio progress indicator. Returns: tuple: Results for UI components including metrics, visualizations, and state. 
""" # Initialize return values with defaults csv_path_res = None metrics_preview_df_res = pd.DataFrame() word_count_fig_res = None vocab_containment_fig_res = None jaccard_heatmap_res = None lcs_heatmap_res = None fuzzy_heatmap_res = None semantic_heatmap_res = None warning_update_res = gr.update(visible=False) state_text_data_res = None state_df_results_res = None # Create a ProgressiveUI instance for handling progressive updates progressive_ui = ProgressiveUI( metrics_preview=metrics_preview, word_count_plot=word_count_plot, jaccard_heatmap=heatmap_tabs["Jaccard Similarity (%)"], lcs_heatmap=heatmap_tabs["Normalized LCS"], fuzzy_heatmap=heatmap_tabs["Fuzzy Similarity"], semantic_heatmap=heatmap_tabs["Semantic Similarity"], warning_box=warning_box, progress_container=progress_container, heatmap_titles=heatmap_titles ) # Make progress container visible during analysis progress_container.update(visible=True) # Create a progressive callback function progressive_callback = create_progressive_callback(progressive_ui) # Check if files are provided if not files: return ( None, pd.DataFrame({"Message": ["Please upload files to analyze."]}), None, # word_count_plot None, # vocab_containment_plot None, # jaccard_heatmap None, # lcs_heatmap None, # fuzzy_heatmap None, # semantic_heatmap None, # warning update None, # state_text_data None # state_df_results ) # Check file size limits (10MB per file) for file in files: file_size_mb = Path(file.name).stat().st_size / (1024 * 1024) if file_size_mb > 10: return ( None, pd.DataFrame({"Error": [f"File '{Path(file.name).name}' exceeds the 10MB size limit (size: {file_size_mb:.2f}MB)."]}), None, # word_count_plot None, # vocab_containment_plot None, # jaccard_heatmap None, # lcs_heatmap None, # fuzzy_heatmap None, # semantic_heatmap gr.update(value=f"Error: File '{Path(file.name).name}' exceeds the 10MB size limit.", visible=True), None, # state_text_data None # state_df_results ) try: if progress is not None: try: progress(0.1, desc="Preparing files...") except Exception as e: logger.warning(f"Progress update error (non-critical): {e}") # Get filenames and read file contents filenames = [ Path(file.name).name for file in files ] # Use Path().name to get just the filename text_data = {} # Read files with progress updates for i, file in enumerate(files): file_path = Path(file.name) filename = file_path.name if progress is not None: try: progress(0.1 + (0.1 * (i / len(files))), desc=f"Reading file: {filename}") except Exception as e: logger.warning(f"Progress update error (non-critical): {e}") try: text_data[filename] = file_path.read_text(encoding="utf-8-sig") except UnicodeDecodeError: # Try with different encodings if UTF-8 fails try: text_data[filename] = file_path.read_text(encoding="utf-16") except UnicodeDecodeError: return ( None, pd.DataFrame({"Error": [f"Could not decode file '{filename}'. 
                                None,  # word_count_plot
                                None,  # vocab_containment_plot
                                None,  # jaccard_heatmap
                                None,  # lcs_heatmap
                                None,  # fuzzy_heatmap
                                None,  # semantic_heatmap
                                gr.update(value=f"Error: Could not decode file '{filename}'.", visible=True),
                                None,  # state_text_data
                                None,  # state_df_results
                            )

                # Configure semantic similarity and fuzzy matching
                enable_semantic_bool = enable_semantic == "Yes"
                enable_fuzzy_bool = enable_fuzzy == "Yes"

                # Extract the internal values from the dropdown display strings
                fuzzy_method_value = fuzzy_method.split(' - ')[0] if fuzzy_method else 'ngram'
                lcs_normalization_value = lcs_normalization.split(' - ')[0] if lcs_normalization else 'avg'
                tokenization_mode_value = tokenization_mode.split(' - ')[0] if tokenization_mode else 'syllable'

                if progress is not None:
                    try:
                        progress(0.2, desc="Loading model..." if enable_semantic_bool else "Processing text...")
                    except Exception as e:
                        logger.warning(f"Progress update error (non-critical): {e}")

                # Process texts with the selected model.
                # Convert the stopword option to the two pipeline parameters.
                use_stopwords = stopwords_option != "None (No filtering)"
                use_lite_stopwords = stopwords_option == "Standard (Common particles only)"

                # For Hugging Face models, the UI value is the correct model ID
                internal_model_id = model_name

                df_results, word_counts_df_data, vocab_containment_df_data, warning_raw = process_texts(
                    text_data=text_data,
                    filenames=filenames,
                    enable_semantic=enable_semantic_bool,
                    enable_fuzzy=enable_fuzzy_bool,
                    fuzzy_method=fuzzy_method_value,
                    lcs_normalization=lcs_normalization_value,
                    model_name=internal_model_id,
                    use_stopwords=use_stopwords,
                    use_lite_stopwords=use_lite_stopwords,
                    normalize_particles=normalize_particles,
                    tokenization_mode=tokenization_mode_value,
                    progress_callback=progress,
                    progressive_callback=progressive_callback,
                    batch_size=batch_size,
                    show_progress_bar=show_progress,
                )

                if df_results.empty:
                    warning_md = f"**⚠️ Warning:** {warning_raw}" if warning_raw else ""
                    warning_message = "No common chapters found or results are empty. " + (warning_raw or "")
" + (warning_raw or "") metrics_preview_df_res = pd.DataFrame({"Message": [warning_message]}) warning_update_res = gr.update(value=warning_md or warning_message, visible=True) # No structural analysis in this app else: # Generate visualizations if progress is not None: try: progress(0.8, desc="Generating visualizations...") except Exception as e: logger.warning(f"Progress update error (non-critical): {e}") # heatmap_titles is already defined in the outer scope of main_interface heatmaps_data = generate_visualizations( df_results, descriptive_titles=heatmap_titles ) # Generate word count chart if progress is not None: try: progress(0.9, desc="Creating word count chart...") except Exception as e: logger.warning(f"Progress update error (non-critical): {e}") word_count_fig_res = generate_word_count_chart(word_counts_df_data) # Generate vocabulary containment chart vocab_containment_fig_res = generate_vocab_containment_chart(vocab_containment_df_data) # Store state data for potential future use state_text_data_res = text_data state_df_results_res = df_results logger.info("Analysis complete, storing state data") # Save results to CSV if progress is not None: try: progress(0.95, desc="Saving results...") except Exception as e: logger.warning(f"Progress update error (non-critical): {e}") csv_path_res = "results.csv" df_results.to_csv(csv_path_res, index=False) # Prepare final output warning_md = f"**⚠️ Warning:** {warning_raw}" if warning_raw else "" metrics_preview_df_res = df_results.head(10) jaccard_heatmap_res = heatmaps_data.get("Jaccard Similarity (%)") lcs_heatmap_res = heatmaps_data.get("Normalized LCS") fuzzy_heatmap_res = heatmaps_data.get("Fuzzy Similarity") semantic_heatmap_res = heatmaps_data.get("Semantic Similarity") warning_update_res = gr.update( visible=bool(warning_raw), value=warning_md ) except Exception as e: logger.error(f"Error in run_pipeline: {e}", exc_info=True) # Ensure DataFrame for metrics preview on error metrics_preview_df_res = pd.DataFrame({"Error": [str(e)]}) warning_update_res = gr.update(value=f"Error: {str(e)}", visible=True) return ( csv_path_res, metrics_preview_df_res, word_count_fig_res, vocab_containment_fig_res, jaccard_heatmap_res, lcs_heatmap_res, fuzzy_heatmap_res, semantic_heatmap_res, warning_update_res, state_text_data_res, state_df_results_res, ) # Function to interpret results using LLM def interpret_results(csv_path, progress=gr.Progress()): try: if not csv_path or not Path(csv_path).exists(): return "Please run the analysis first to generate results." 
        # Function to interpret results using the LLM
        def interpret_results(csv_path, progress=gr.Progress()):
            try:
                if not csv_path or not Path(csv_path).exists():
                    return "Please run the analysis first to generate results."

                # Read the CSV file
                df_results = pd.read_csv(csv_path)

                # Show detailed progress messages with percentages
                progress(0, desc="Preparing data for analysis...")
                progress(0.1, desc="Analyzing similarity patterns...")
                progress(0.2, desc="Connecting to Mistral 7B via OpenRouter...")

                # Get interpretation from the LLM (via the OpenRouter API)
                progress(0.3, desc="Generating scholarly interpretation (this may take 20-40 seconds)...")
                llm_service = LLMService()
                interpretation = llm_service.analyze_similarity(df_results)

                # Cosmetic completion steps so the progress bar reaches 100%
                progress(0.9, desc="Formatting results...")
                progress(0.95, desc="Applying scholarly formatting...")
                progress(1.0, desc="Analysis complete!")

                # Add a timestamp to the interpretation
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
                interpretation = f"{interpretation}\n\nAnalysis generated on {timestamp}"

                return interpretation
            except Exception as e:
                logger.error(f"Error in interpret_results: {e}", exc_info=True)
                return f"Error interpreting results: {str(e)}"

        def run_pipeline_preset(files, preset, progress=gr.Progress()):
            """Wrapper that converts a preset selection to pipeline parameters."""
            # Determine settings based on the preset
            if "Quick" in preset:
                # Quick: Jaccard only
                enable_semantic = "No"
                enable_fuzzy = "No"
            elif "Deep" in preset:
                # Deep: All metrics including semantic
                enable_semantic = "Yes"
                enable_fuzzy = "Yes"
            else:
                # Standard: Jaccard + LCS + Fuzzy (no semantic)
                enable_semantic = "No"
                enable_fuzzy = "Yes"

            # Use sensible defaults for preset mode
            fuzzy_method = "ngram - Syllable pairs (recommended)"
            lcs_normalization = "avg - Balanced comparison (default)"
            model_name = "buddhist-nlp/buddhist-sentence-similarity"
            tokenization_mode = "word - Whole words (recommended)"
            stopwords_option = "Standard (Common particles only)"
            normalize_particles = False
            batch_size = 8
            show_progress = False

            return run_pipeline(
                files,
                enable_semantic,
                enable_fuzzy,
                fuzzy_method,
                lcs_normalization,
                model_name,
                tokenization_mode,
                stopwords_option,
                normalize_particles,
                batch_size,
                show_progress,
                progress,
            )

        # Output components shared by both buttons
        pipeline_outputs = [
            csv_output,
            metrics_preview,
            word_count_plot,
            vocab_containment_plot,
            heatmap_tabs["Jaccard Similarity (%)"],
            heatmap_tabs["Normalized LCS"],
            heatmap_tabs["Fuzzy Similarity"],
            heatmap_tabs["Semantic Similarity"],
            warning_box,
            state_text_data,
            state_df_results,
        ]

        # Quick Start button uses presets
        process_btn_quick.click(
            fn=run_pipeline_preset,
            inputs=[file_input, analysis_preset],
            outputs=pipeline_outputs,
        )

        # Custom button uses all the detailed settings
        process_btn_custom.click(
            fn=run_pipeline,
            inputs=[
                file_input,
                semantic_toggle_radio,
                fuzzy_toggle_radio,
                fuzzy_method_dropdown,
                lcs_normalization_dropdown,
                model_dropdown,
                tokenization_mode_dropdown,
                stopwords_dropdown,
                particle_normalization_checkbox,
                batch_size_slider,
                progress_bar_checkbox,
            ],
            outputs=pipeline_outputs,
        )

        # Structural analysis functionality removed - see dedicated collation app

        # Connect the interpret button
        interpret_btn.click(
            fn=interpret_results,
            inputs=[csv_output],
            outputs=interpretation_output,
        )

    return demo


if __name__ == "__main__":
    demo = main_interface()
    demo.launch()
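

# --- Editorial appendix: semantic similarity sketch (not used by the app) ---
# The "Semantic Similarity" tooltip describes embedding-based comparison of
# meaning. A minimal sketch with the sentence-transformers library is shown
# below, using the default model from the AI Model dropdown; the pipeline's
# actual implementation (segmentation, batching, device handling) lives in
# pipeline.process and may differ. The function name is hypothetical.
def _semantic_similarity_sketch(text_a: str, text_b: str) -> float:
    """Cosine similarity of sentence embeddings for two passages, in [0, 1] for similar texts."""
    # Import inside the function so this sketch has no cost unless called.
    from sentence_transformers import SentenceTransformer, util

    # Downloads the model on first use.
    model = SentenceTransformer("buddhist-nlp/buddhist-sentence-similarity")
    embeddings = model.encode([text_a, text_b], convert_to_tensor=True)
    return float(util.cos_sim(embeddings[0], embeddings[1]))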