import gradio as gr
from pathlib import Path
from pipeline.process import process_texts
from pipeline.visualize import generate_visualizations, generate_word_count_chart, generate_vocab_containment_chart
from pipeline.llm_service import LLMService
from pipeline.progressive_ui import ProgressiveUI, create_progressive_callback
import logging
import pandas as pd
from datetime import datetime
from dotenv import load_dotenv
from theme import tibetan_theme

# Load environment variables from .env file
load_dotenv()
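# Configure basic logging so pipeline info/warning messages are visible when the
# app runs standalone (assumption: no external logging configuration is applied).
logging.basicConfig(level=logging.INFO)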
logger = logging.getLogger(__name__)
def main_interface():
    """Build and return the Gradio Blocks interface for the Tibetan Text Metrics app."""
    # Theme and CSS are applied here for Gradio 5.x compatibility.
    # For Gradio 6.x, these will move to launch() - see the migration guide.
with gr.Blocks(
theme=tibetan_theme,
css=tibetan_theme.get_css_string(),
title="Tibetan Text Metrics Web App"
) as demo:
gr.Markdown(
"""# Tibetan Text Metrics
Compare Tibetan texts to discover how similar they are. This tool helps scholars identify shared passages, textual variations, and relationships between different versions of Tibetan manuscripts. Part of the TTM project.
""",
elem_classes="gr-markdown",
)
with gr.Row(elem_id="steps-row"):
with gr.Column(scale=1, elem_classes="step-column"):
with gr.Group(elem_classes="step-box"):
gr.Markdown(
"""
## Step 1: Upload Your Texts
Upload two or more Tibetan text files (.txt format). If your texts have chapters, separate them with the ༈ marker so the tool can compare chapter-by-chapter.
""",
elem_classes="gr-markdown",
)
file_input = gr.File(
label="Choose your Tibetan text files",
file_types=[".txt"],
file_count="multiple",
)
gr.Markdown(
"Tip: Files should be under 1MB for best performance. Use UTF-8 encoded .txt files.",
elem_classes="gr-markdown"
)
with gr.Column(scale=1, elem_classes="step-column"):
with gr.Group(elem_classes="step-box"):
gr.Markdown(
"""## Step 2: Choose Analysis Type
Pick a preset for quick results, or use Custom for full control.
""",
elem_classes="gr-markdown",
)
with gr.Tabs():
# ===== QUICK START TAB =====
with gr.Tab("Quick Start", id="quick_tab"):
analysis_preset = gr.Radio(
label="What kind of analysis do you need?",
choices=[
"Standard — Vocabulary + Sequences + Fuzzy matching",
"Deep — All metrics including AI meaning analysis",
"Quick — Vocabulary overlap only (fastest)"
],
value="Standard — Vocabulary + Sequences + Fuzzy matching",
info="Standard is recommended for most users. Deep analysis takes longer but finds texts with similar meaning even when words differ."
)
gr.Markdown("""
**What each preset includes:**
| Preset | Jaccard | LCS | Fuzzy | Semantic AI |
|--------|---------|-----|-------|-------------|
| Standard | ✓ | ✓ | ✓ | — |
| Deep | ✓ | ✓ | ✓ | ✓ |
| Quick | ✓ | — | — | — |
""", elem_classes="preset-table")
process_btn_quick = gr.Button(
"Compare My Texts", elem_id="run-btn-quick", variant="primary"
)
# ===== CUSTOM TAB =====
with gr.Tab("Custom", id="custom_tab"):
gr.Markdown("**Fine-tune each metric and option:**", elem_classes="custom-header")
with gr.Accordion("Lexical Metrics", open=True):
gr.Markdown("*Compare the actual words used in texts*")
tokenization_mode_dropdown = gr.Dropdown(
label="How to split text?",
choices=[
"word - Whole words (recommended)",
"syllable - Individual syllables (finer detail)"
],
value="word - Whole words (recommended)",
info="'Word' keeps multi-syllable words together — recommended for Jaccard."
)
stopwords_dropdown = gr.Dropdown(
label="Filter common words?",
choices=[
"None (No filtering)",
"Standard (Common particles only)",
"Aggressive (All function words)"
],
value="Standard (Common particles only)",
info="Remove common particles (གི, ལ, ནི) before comparing."
)
particle_normalization_checkbox = gr.Checkbox(
label="Normalize grammatical particles?",
value=False,
info="Treat variants as equivalent (གི/ཀྱི/གྱི → གི). Useful for different scribal conventions."
)
with gr.Accordion("Sequence Matching (LCS)", open=True):
gr.Markdown("*Find shared passages in the same order*")
                                gr.Checkbox(
                                    label="Enable sequence matching",
                                    value=True,
                                    interactive=False,  # LCS is always computed as a core metric, so this toggle is informational only
                                    info="Finds the longest sequence of words appearing in both texts."
                                )
lcs_normalization_dropdown = gr.Dropdown(
label="How to handle different text lengths?",
choices=[
"avg - Balanced comparison (default)",
"min - Detect if one text contains the other",
"max - Stricter, penalizes length differences"
],
value="avg - Balanced comparison (default)",
info="'min' is useful for finding quotes or excerpts."
)
with gr.Accordion("Fuzzy Matching", open=True):
gr.Markdown("*Detect similar but not identical text*")
fuzzy_toggle_radio = gr.Radio(
label="Find approximate matches?",
choices=["Yes", "No"],
value="Yes",
info="Useful for spelling variations and scribal differences."
)
fuzzy_method_dropdown = gr.Dropdown(
label="Matching method",
choices=[
"ngram - Syllable pairs (recommended)",
"syllable_edit - Count syllable changes",
"weighted_jaccard - Word frequency comparison"
],
value="ngram - Syllable pairs (recommended)",
info="All options work at the Tibetan syllable level."
)
with gr.Accordion("Semantic Analysis", open=False):
gr.Markdown("*Compare meaning using AI (slower)*")
semantic_toggle_radio = gr.Radio(
label="Analyze meaning similarity?",
choices=["Yes", "No"],
value="No",
info="Finds texts that say similar things in different words."
)
model_dropdown = gr.Dropdown(
choices=[
"buddhist-nlp/buddhist-sentence-similarity",
"buddhist-nlp/bod-eng-similarity",
"sentence-transformers/LaBSE",
"BAAI/bge-m3"
],
label="AI Model",
value="buddhist-nlp/buddhist-sentence-similarity",
info="'buddhist-sentence-similarity' works best for Buddhist texts."
)
batch_size_slider = gr.Slider(
minimum=1,
maximum=64,
value=8,
step=1,
label="Processing batch size",
info="Higher = faster but uses more memory."
)
progress_bar_checkbox = gr.Checkbox(
label="Show detailed progress",
value=False,
info="See step-by-step progress during analysis."
)
process_btn_custom = gr.Button(
"Compare My Texts (Custom)", elem_id="run-btn-custom", variant="primary"
)
# Note: Both process_btn_quick and process_btn_custom are wired below
gr.Markdown(
"""## Results
""",
elem_classes="gr-markdown",
)
        # The heatmap_titles and metric_tooltips dictionaries used by the
        # visualization tabs are defined further below.
csv_output = gr.File(label="📥 Download Full Results (CSV spreadsheet)")
metrics_preview = gr.Dataframe(
label="Results Summary — Compare chapters across your texts", interactive=False, visible=True
)
# States for data persistence
state_text_data = gr.State()
state_df_results = gr.State()
# LLM Interpretation components
with gr.Row():
with gr.Column():
gr.Markdown(
"## Get Expert Insights\n*Let AI help you understand what the numbers mean and what patterns they reveal about your texts.*",
elem_classes="gr-markdown"
)
# Add the interpret button
with gr.Row():
interpret_btn = gr.Button(
"📊 Explain My Results",
variant="primary",
elem_id="interpret-btn"
)
# Create a placeholder message with proper formatting and structure
initial_message = """
## Understanding Your Results
*After running the analysis, click "Explain My Results" to get a plain-language interpretation of what the similarity scores mean for your texts.*
"""
interpretation_output = gr.Markdown(
value=initial_message,
elem_id="llm-analysis"
)
# Heatmap tabs for each metric
heatmap_titles = {
"Jaccard Similarity (%)": "Shows how much vocabulary the texts share. Higher = more words in common.",
"Normalized LCS": "Shows shared phrases in the same order. Higher = more passages appear in both texts.",
"Fuzzy Similarity": "Finds similar text even with spelling differences. Higher = more alike.",
"Semantic Similarity": "Compares actual meaning using AI. Higher = texts say similar things.",
"Word Counts": "How long is each section? Helps you understand text structure.",
"Vocabulary Containment": "What % of one text's vocabulary appears in the other?",
}
metric_tooltips = {
"Jaccard Similarity (%)": """
### Vocabulary Overlap (Jaccard Similarity)
**What it measures:** How many unique words appear in both texts.
**How to read it:** A score of 70% means 70% of all unique words found in either text appear in both. Higher scores = more shared vocabulary.
**What it tells you:**
- High scores (>70%): Texts use very similar vocabulary — possibly the same source or direct copying
- Medium scores (40-70%): Texts share significant vocabulary — likely related topics or traditions
- Low scores (<40%): Texts use different words — different sources or heavily edited versions
**Good to know:** This metric ignores word order and how often words repeat. It only asks "does this word appear in both texts?"
**Tips:**
- Use the "Filter common words" option to focus on meaningful content words rather than grammatical particles.
- **Word mode is recommended** for Jaccard. Syllable mode may inflate scores because common syllables (like ས, ར, ན) appear in many different words.
""",
"Fuzzy Similarity": """
### Approximate Matching (Fuzzy Similarity)
**What it measures:** How similar texts are, even when they're not exactly the same.
**How to read it:** Scores from 0 to 1. Higher = more similar. A score of 0.85 means the texts are 85% alike.
**What it tells you:**
- High scores (>0.8): Very similar texts with minor differences (spelling, small edits)
- Medium scores (0.5-0.8): Noticeably different but clearly related
- Low scores (<0.5): Substantially different texts
**Why it matters for Tibetan texts:**
- Catches spelling variations between manuscripts
- Finds scribal differences and regional conventions
- Identifies passages that were slightly modified
**Recommended methods:**
- **Syllable pairs (ngram)**: Best for Tibetan — compares pairs of syllables
- **Count syllable changes**: Good for finding minor edits
- **Word frequency**: Useful when certain words repeat often
""",
"Normalized LCS": """
### Shared Sequences (Longest Common Subsequence)
**What it measures:** The longest chain of words that appears in both texts *in the same order*.
**How to read it:** Higher scores mean longer shared passages. A score of 0.6 means 60% of the text follows the same word sequence.
**Example:** If Text A says "the quick brown fox" and Text B says "the lazy brown dog", the shared sequence is "the brown" — words that appear in both, in the same order.
**What it tells you:**
- High scores (>0.6): Texts share substantial passages — likely direct copying or common source
- Medium scores (0.3-0.6): Some shared phrasing — possibly related traditions
- Low scores (<0.3): Different word ordering — independent compositions or heavy editing
**Why this is different from vocabulary overlap:**
- Vocabulary overlap asks: "Do they use the same words?"
- Sequence matching asks: "Do they say things in the same order?"
Two texts might share many words (high Jaccard) but arrange them differently (low LCS), suggesting they discuss similar topics but were composed independently.
""",
"Semantic Similarity": """
### Meaning Similarity (Semantic Analysis)
**What it measures:** Whether texts convey similar *meaning*, even if they use different words.
**How to read it:** Scores from 0 to 1. Higher = more similar meaning. A score of 0.8 means the texts express very similar ideas.
**What it tells you:**
- High scores (>0.75): Texts say similar things, even if worded differently
- Medium scores (0.5-0.75): Related topics or themes
- Low scores (<0.5): Different subject matter
**How it works:** An AI model (trained on Buddhist texts) reads both passages and judges how similar their meaning is. This catches similarities that word-matching would miss.
**When to use it:**
- Finding paraphrased passages
- Identifying texts that discuss the same concepts differently
- Comparing translations or commentaries
**Note:** This takes longer to compute but provides insights the other metrics can't.
""",
"Word Counts": """
### Text Length by Section
**What it shows:** How many words are in each chapter or section of your texts.
**How to read it:** Taller bars = longer sections. Compare bars to see which parts of your texts are longer or shorter.
**What it tells you:**
- Similar bar heights across texts suggest similar structure
- Very different lengths might explain why similarity scores vary
- Helps identify which sections to examine more closely
**Tip:** If one text has much longer chapters, it might contain additional material not in the other version.
""",
"Vocabulary Containment": """
### Vocabulary Containment (Directional)
**What it shows:** What percentage of one text's unique vocabulary appears in the other text.
**How to read it:**
- "Text A → Text B" means: "What % of Text A's vocabulary is found in Text B?"
- 90% means 90% of the unique words in the source text also appear in the target text
**What it tells you:**
- If Text A → Text B is 95% but Text B → Text A is 60%, then Text B contains almost all of Text A's vocabulary plus additional words
- This suggests Text B might be an expansion or commentary on Text A
- Asymmetric containment often indicates a base text + commentary relationship
**Useful for:**
- Identifying which text is the "base" (shorter vocabulary fully contained in longer text)
- Understanding directionality of textual relationships
- Distinguishing between shared sources vs. one text derived from another
**Tip:** Unlike Jaccard (which is symmetric), containment is directional — it tells you which text's vocabulary is "inside" the other.
""",
"Structural Analysis": """
### How Texts Relate to Each Other
**What it shows:** An overview of how your text sections connect and relate across documents.
**What it tells you:**
- Which sections are most similar to each other
- Possible patterns of copying or shared sources
- How texts might have evolved or been edited over time
**Useful for:**
- Understanding textual transmission history
- Identifying which version might be older or more original
- Finding sections that were added, removed, or modified
**Note:** This analysis combines all the other metrics to give you the big picture.
"""
}
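        # Reference sketches (illustrative only, not wired into the app): the
        # vocabulary and sequence metrics described in the tooltips above reduce to
        # simple arithmetic over token sets and lengths. The actual implementations
        # live in the pipeline package; these helper names are hypothetical.
        def _jaccard_sketch(a: set, b: set) -> float:
            # Symmetric vocabulary overlap: shared tokens / all tokens in either text.
            return len(a & b) / len(a | b) if (a | b) else 0.0
        def _containment_sketch(src: set, tgt: set) -> float:
            # Directional containment: fraction of src's vocabulary also found in tgt.
            return len(src & tgt) / len(src) if src else 0.0
        def _normalized_lcs_sketch(lcs_len: int, len_a: int, len_b: int, mode: str = "avg") -> float:
            # The normalization modes offered in the LCS dropdown: "avg" balances both
            # texts, "min" highlights containment (quotes, excerpts), "max" penalizes
            # length differences.
            denom = {"avg": (len_a + len_b) / 2, "min": min(len_a, len_b), "max": max(len_a, len_b)}[mode]
            return lcs_len / denom if denom else 0.0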
heatmap_tabs = {}
gr.Markdown("## Visual Comparison", elem_classes="gr-markdown")
with gr.Tabs(elem_id="heatmap-tab-group"):
# Process all metrics
metrics_to_display = heatmap_titles
for metric_key, descriptive_title in metrics_to_display.items():
with gr.Tab(metric_key):
                    # Map each metric to its info-accordion CSS class; all known
                    # metrics share the same accordion title.
                    known_css_classes = {
                        "Jaccard Similarity (%)": "metric-info-accordion jaccard-info",
                        "Normalized LCS": "metric-info-accordion lcs-info",
                        "Fuzzy Similarity": "metric-info-accordion fuzzy-info",
                        "Semantic Similarity": "metric-info-accordion semantic-info",
                        "Word Counts": "metric-info-accordion wordcount-info",
                        "Vocabulary Containment": "metric-info-accordion vocabcontain-info",
                    }
                    if metric_key in known_css_classes:
                        css_class = known_css_classes[metric_key]
                        accordion_title = "ℹ️ What does this mean?"
                    else:
                        css_class = "metric-info-accordion"
                        accordion_title = f"ℹ️ About {metric_key}"
# Create the accordion with appropriate content
                    with gr.Accordion(accordion_title, open=False, elem_classes=css_class):
                        if metric_key in metric_tooltips:
                            gr.Markdown(value=metric_tooltips[metric_key], elem_classes="metric-description")
                        else:
                            gr.Markdown(value=f"### {metric_key}\nDescription not found.")
# Add the appropriate plot
if metric_key == "Word Counts":
word_count_plot = gr.Plot(label="Word Counts per Segment", show_label=False, scale=1, elem_classes="metric-description")
elif metric_key == "Vocabulary Containment":
vocab_containment_plot = gr.Plot(label="Vocabulary Containment per Chapter", show_label=False, scale=1, elem_classes="metric-description")
else:
heatmap_tabs[metric_key] = gr.Plot(label=f"Heatmap: {metric_key}", show_label=False, elem_classes="metric-heatmap")
        # Structural analysis tab removed; see the dedicated collation app.
warning_box = gr.Markdown(visible=False)
# Create a container for metric progress indicators
with gr.Row(visible=False) as progress_container:
# Progress indicators will be created dynamically by ProgressiveUI
gr.Markdown("Metric progress will appear here during analysis")
        def run_pipeline(
            files, enable_semantic, enable_fuzzy, fuzzy_method, lcs_normalization,
            model_name, tokenization_mode, stopwords_option, normalize_particles,
            batch_size, show_progress, progress=gr.Progress(),
        ):
"""Processes uploaded files, computes metrics, generates visualizations, and prepares outputs for the UI.
Args:
files: A list of file objects uploaded by the user.
enable_semantic: Whether to compute semantic similarity.
enable_fuzzy: Whether to compute fuzzy string similarity.
                fuzzy_method: The fuzzy matching method to use.
                lcs_normalization: How to normalize LCS scores by segment length (avg, min, or max).
                model_name: Name of the embedding model to use.
tokenization_mode: How to tokenize text (syllable or word).
stopwords_option: Stopword filtering level (None, Standard, or Aggressive).
normalize_particles: Whether to normalize grammatical particles.
batch_size: Batch size for embedding generation.
show_progress: Whether to show progress bars during embedding.
progress: Gradio progress indicator.
Returns:
tuple: Results for UI components including metrics, visualizations, and state.
"""
# Initialize return values with defaults
csv_path_res = None
metrics_preview_df_res = pd.DataFrame()
word_count_fig_res = None
vocab_containment_fig_res = None
jaccard_heatmap_res = None
lcs_heatmap_res = None
fuzzy_heatmap_res = None
semantic_heatmap_res = None
warning_update_res = gr.update(visible=False)
state_text_data_res = None
state_df_results_res = None
# Create a ProgressiveUI instance for handling progressive updates
progressive_ui = ProgressiveUI(
metrics_preview=metrics_preview,
word_count_plot=word_count_plot,
jaccard_heatmap=heatmap_tabs["Jaccard Similarity (%)"],
lcs_heatmap=heatmap_tabs["Normalized LCS"],
fuzzy_heatmap=heatmap_tabs["Fuzzy Similarity"],
semantic_heatmap=heatmap_tabs["Semantic Similarity"],
warning_box=warning_box,
progress_container=progress_container,
heatmap_titles=heatmap_titles
)
            # Note: component.update() was removed in Gradio 4+, so calling
            # progress_container.update(visible=True) here would fail at runtime.
            # Showing the container would require returning gr.update(visible=True)
            # for it from the event handler (i.e. adding it to the outputs list).
# Create a progressive callback function
progressive_callback = create_progressive_callback(progressive_ui)
            # All metrics are pairwise comparisons, so require at least two files
            if not files or len(files) < 2:
                return (
                    None,
                    pd.DataFrame({"Message": ["Please upload at least two text files to analyze."]}),
                    None,  # word_count_plot
                    None,  # vocab_containment_plot
                    None,  # jaccard_heatmap
                    None,  # lcs_heatmap
                    None,  # fuzzy_heatmap
                    None,  # semantic_heatmap
                    gr.update(value="Please upload at least two text files to analyze.", visible=True),
                    None,  # state_text_data
                    None,  # state_df_results
                )
# Check file size limits (10MB per file)
for file in files:
file_size_mb = Path(file.name).stat().st_size / (1024 * 1024)
if file_size_mb > 10:
return (
None,
pd.DataFrame({"Error": [f"File '{Path(file.name).name}' exceeds the 10MB size limit (size: {file_size_mb:.2f}MB)."]}),
None, # word_count_plot
None, # vocab_containment_plot
None, # jaccard_heatmap
None, # lcs_heatmap
None, # fuzzy_heatmap
None, # semantic_heatmap
gr.update(value=f"Error: File '{Path(file.name).name}' exceeds the 10MB size limit.", visible=True),
None, # state_text_data
None # state_df_results
)
try:
if progress is not None:
try:
progress(0.1, desc="Preparing files...")
except Exception as e:
logger.warning(f"Progress update error (non-critical): {e}")
# Get filenames and read file contents
filenames = [
Path(file.name).name for file in files
] # Use Path().name to get just the filename
text_data = {}
# Read files with progress updates
for i, file in enumerate(files):
file_path = Path(file.name)
filename = file_path.name
if progress is not None:
try:
progress(0.1 + (0.1 * (i / len(files))), desc=f"Reading file: {filename}")
except Exception as e:
logger.warning(f"Progress update error (non-critical): {e}")
try:
text_data[filename] = file_path.read_text(encoding="utf-8-sig")
except UnicodeDecodeError:
# Try with different encodings if UTF-8 fails
try:
text_data[filename] = file_path.read_text(encoding="utf-16")
except UnicodeDecodeError:
return (
None,
pd.DataFrame({"Error": [f"Could not decode file '{filename}'. Please ensure it contains valid Tibetan text in UTF-8 or UTF-16 encoding."]}),
None, # word_count_plot
None, # vocab_containment_plot
None, # jaccard_heatmap
None, # lcs_heatmap
None, # fuzzy_heatmap
None, # semantic_heatmap
gr.update(value=f"Error: Could not decode file '{filename}'.", visible=True),
None, # state_text_data
None # state_df_results
)
# Configure semantic similarity and fuzzy matching
enable_semantic_bool = enable_semantic == "Yes"
enable_fuzzy_bool = enable_fuzzy == "Yes"
# Extract the fuzzy method from the dropdown value
fuzzy_method_value = fuzzy_method.split(' - ')[0] if fuzzy_method else 'ngram'
# Extract the LCS normalization from the dropdown value
lcs_normalization_value = lcs_normalization.split(' - ')[0] if lcs_normalization else 'avg'
            # Extract the tokenization mode from the dropdown value, falling back to
            # the recommended "word" mode if the value is missing
            tokenization_mode_value = tokenization_mode.split(' - ')[0] if tokenization_mode else 'word'
if progress is not None:
try:
progress(0.2, desc="Loading model..." if enable_semantic_bool else "Processing text...")
except Exception as e:
logger.warning(f"Progress update error (non-critical): {e}")
# Process texts with selected model
# Convert stopword option to appropriate parameters
use_stopwords = stopwords_option != "None (No filtering)"
use_lite_stopwords = stopwords_option == "Standard (Common particles only)"
# For Hugging Face models, the UI value is the correct model ID
internal_model_id = model_name
df_results, word_counts_df_data, vocab_containment_df_data, warning_raw = process_texts(
text_data=text_data,
filenames=filenames,
enable_semantic=enable_semantic_bool,
enable_fuzzy=enable_fuzzy_bool,
fuzzy_method=fuzzy_method_value,
lcs_normalization=lcs_normalization_value,
model_name=internal_model_id,
use_stopwords=use_stopwords,
use_lite_stopwords=use_lite_stopwords,
normalize_particles=normalize_particles,
tokenization_mode=tokenization_mode_value,
progress_callback=progress,
progressive_callback=progressive_callback,
batch_size=batch_size,
show_progress_bar=show_progress
)
if df_results.empty:
warning_md = f"**⚠️ Warning:** {warning_raw}" if warning_raw else ""
warning_message = "No common chapters found or results are empty. " + (warning_raw or "")
metrics_preview_df_res = pd.DataFrame({"Message": [warning_message]})
warning_update_res = gr.update(value=warning_md or warning_message, visible=True)
# No structural analysis in this app
else:
# Generate visualizations
if progress is not None:
try:
progress(0.8, desc="Generating visualizations...")
except Exception as e:
logger.warning(f"Progress update error (non-critical): {e}")
# heatmap_titles is already defined in the outer scope of main_interface
heatmaps_data = generate_visualizations(
df_results, descriptive_titles=heatmap_titles
)
# Generate word count chart
if progress is not None:
try:
progress(0.9, desc="Creating word count chart...")
except Exception as e:
logger.warning(f"Progress update error (non-critical): {e}")
word_count_fig_res = generate_word_count_chart(word_counts_df_data)
# Generate vocabulary containment chart
vocab_containment_fig_res = generate_vocab_containment_chart(vocab_containment_df_data)
# Store state data for potential future use
state_text_data_res = text_data
state_df_results_res = df_results
logger.info("Analysis complete, storing state data")
# Save results to CSV
if progress is not None:
try:
progress(0.95, desc="Saving results...")
except Exception as e:
logger.warning(f"Progress update error (non-critical): {e}")
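                # NOTE: a fixed filename in the working directory is fine for a single
                # local session, but could collide between concurrent users on a shared
                # deployment; a per-session temporary file would be safer.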
csv_path_res = "results.csv"
df_results.to_csv(csv_path_res, index=False)
# Prepare final output
warning_md = f"**⚠️ Warning:** {warning_raw}" if warning_raw else ""
metrics_preview_df_res = df_results.head(10)
jaccard_heatmap_res = heatmaps_data.get("Jaccard Similarity (%)")
lcs_heatmap_res = heatmaps_data.get("Normalized LCS")
fuzzy_heatmap_res = heatmaps_data.get("Fuzzy Similarity")
semantic_heatmap_res = heatmaps_data.get("Semantic Similarity")
warning_update_res = gr.update(
visible=bool(warning_raw), value=warning_md
)
except Exception as e:
logger.error(f"Error in run_pipeline: {e}", exc_info=True)
# Ensure DataFrame for metrics preview on error
metrics_preview_df_res = pd.DataFrame({"Error": [str(e)]})
warning_update_res = gr.update(value=f"Error: {str(e)}", visible=True)
return (
csv_path_res,
metrics_preview_df_res,
word_count_fig_res,
vocab_containment_fig_res,
jaccard_heatmap_res,
lcs_heatmap_res,
fuzzy_heatmap_res,
semantic_heatmap_res,
warning_update_res,
state_text_data_res,
state_df_results_res,
)
# Function to interpret results using LLM
def interpret_results(csv_path, progress=gr.Progress()):
try:
if not csv_path or not Path(csv_path).exists():
return "Please run the analysis first to generate results."
# Read the CSV file
df_results = pd.read_csv(csv_path)
# Show detailed progress messages with percentages
progress(0, desc="Preparing data for analysis...")
progress(0.1, desc="Analyzing similarity patterns...")
progress(0.2, desc="Connecting to Mistral 7B via OpenRouter...")
# Get interpretation from LLM (using OpenRouter API)
progress(0.3, desc="Generating scholarly interpretation (this may take 20-40 seconds)...")
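            # Assumption: LLMService reads its OpenRouter API key from the environment
            # (populated by load_dotenv() at import time); see pipeline.llm_service.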
llm_service = LLMService()
interpretation = llm_service.analyze_similarity(df_results)
# Simulate completion steps
progress(0.9, desc="Formatting results...")
progress(0.95, desc="Applying scholarly formatting...")
# Completed
progress(1.0, desc="Analysis complete!")
# Add a timestamp to the interpretation
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
interpretation = f"{interpretation}\n\nAnalysis generated on {timestamp}"
return interpretation
except Exception as e:
logger.error(f"Error in interpret_results: {e}", exc_info=True)
return f"Error interpreting results: {str(e)}"
def run_pipeline_preset(files, preset, progress=gr.Progress()):
"""Wrapper that converts preset selection to pipeline parameters."""
# Determine settings based on preset
if "Quick" in preset:
# Quick: Jaccard only
enable_semantic = "No"
enable_fuzzy = "No"
elif "Deep" in preset:
# Deep: All metrics including semantic
enable_semantic = "Yes"
enable_fuzzy = "Yes"
else:
# Standard: Jaccard + LCS + Fuzzy (no semantic)
enable_semantic = "No"
enable_fuzzy = "Yes"
# Use sensible defaults for preset mode
fuzzy_method = "ngram - Syllable pairs (recommended)"
lcs_normalization = "avg - Balanced comparison (default)"
model_name = "buddhist-nlp/buddhist-sentence-similarity"
tokenization_mode = "word - Whole words (recommended)"
stopwords_option = "Standard (Common particles only)"
normalize_particles = False
batch_size = 8
show_progress = False
return run_pipeline(
files, enable_semantic, enable_fuzzy, fuzzy_method,
lcs_normalization, model_name, tokenization_mode,
stopwords_option, normalize_particles, batch_size,
show_progress, progress
)
# Output components for both buttons
pipeline_outputs = [
csv_output,
metrics_preview,
word_count_plot,
vocab_containment_plot,
heatmap_tabs["Jaccard Similarity (%)"],
heatmap_tabs["Normalized LCS"],
heatmap_tabs["Fuzzy Similarity"],
heatmap_tabs["Semantic Similarity"],
warning_box,
state_text_data,
state_df_results,
]
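        # NOTE: the order of this list must match the tuple returned by run_pipeline
        # (and by run_pipeline_preset, which delegates to it).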
# Quick Start button uses presets
process_btn_quick.click(
fn=run_pipeline_preset,
inputs=[file_input, analysis_preset],
outputs=pipeline_outputs
)
# Custom button uses all the detailed settings
process_btn_custom.click(
fn=run_pipeline,
inputs=[
file_input,
semantic_toggle_radio,
fuzzy_toggle_radio,
fuzzy_method_dropdown,
lcs_normalization_dropdown,
model_dropdown,
tokenization_mode_dropdown,
stopwords_dropdown,
particle_normalization_checkbox,
batch_size_slider,
progress_bar_checkbox
],
outputs=pipeline_outputs
)
# Structural analysis functionality removed - see dedicated collation app
# Connect the interpret button
interpret_btn.click(
fn=interpret_results,
inputs=[csv_output],
outputs=interpretation_output
)
return demo
if __name__ == "__main__":
demo = main_interface()
demo.launch()
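    # For container or remote deployments you may need to bind explicitly, e.g.
    # demo.launch(server_name="0.0.0.0", server_port=7860); the defaults are fine
    # for local use.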