import re
import fitz  # PyMuPDF
import pandas as pd
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
import nltk
import gradio as gr

# ----------------------------
# 📦 Setup
# ----------------------------
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))

# ----------------------------
# 📘 PDF Text Extraction
# ----------------------------
def extract_text_from_pdf(pdf_file):
    """Extract text from uploaded PDF file"""
    text = ""
    # gr.File may hand over a file path (newer Gradio) or a file-like object
    # with a .name attribute (older Gradio); resolve both to a path for fitz.
    path = pdf_file if isinstance(pdf_file, str) else getattr(pdf_file, "name", pdf_file)
    with fitz.open(path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text.strip()

# ----------------------------
# 🧹 Text Cleaning
# ----------------------------
def clean_text(text):
    """Lowercase, strip non-letters, and drop stopwords and very short words"""
    text = re.sub(r"[^a-zA-Z ]", " ", text)
    words = [w.lower() for w in text.split() if w.lower() not in STOPWORDS and len(w) > 2]
    return words
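
# Illustrative example of the cleaning step (the sentence is made up):
#   clean_text("The Cat sat on the mat!")  ->  ['cat', 'sat', 'mat']
# Stopwords and words of two letters or fewer are dropped, the rest lowercased.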

# ----------------------------
# 🤖 Topic Modeling Function
# ----------------------------
def transformer_topic_modeling(sentences, auto_topics=True, max_k=8, fixed_k=5):
    """Cluster sentences into topics using transformer embeddings"""
    # Cache the embedding model on the function so it is loaded only once,
    # not on every Gradio request.
    if not hasattr(transformer_topic_modeling, "_model"):
        transformer_topic_modeling._model = SentenceTransformer('flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot')
    embeddings = transformer_topic_modeling._model.encode(sentences, show_progress_bar=False)

    # --- Auto-select topic number ---
    if auto_topics:
        if len(sentences) < 3:
            num_topics = 1
        else:
            scores = []
            # Try k = 2 .. max_k (capped at len(sentences) - 1 so the silhouette
            # score is defined) and keep the k with the best score.
            for k in range(2, min(max_k, len(sentences) - 1) + 1):
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings)
                try:
                    score = silhouette_score(embeddings, kmeans.labels_)
                    scores.append((k, score))
                except ValueError:
                    # Silhouette is undefined if the clustering degenerates to a single label.
                    continue
            num_topics = max(scores, key=lambda x: x[1])[0] if scores else 2
    else:
        num_topics = fixed_k

    # --- Clustering ---
    kmeans = KMeans(n_clusters=num_topics, random_state=42, n_init=10)
    kmeans.fit(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": kmeans.labels_})

    # --- Build topic summaries ---
    topic_data = []
    for topic_id in range(num_topics):
        topic_sentences = df[df["Topic"] == topic_id]["Sentence"].tolist()
        words = []
        for s in topic_sentences:
            words.extend(clean_text(s))
        word_freq = Counter(words)
        top_words = [w for w, _ in word_freq.most_common(3)]
        title = " & ".join(top_words).capitalize() if top_words else "Miscellaneous"
        examples = topic_sentences[:3]
        topic_data.append((f"Topic {topic_id + 1}: {title}", "\n".join(examples)))

    return topic_data, num_topics
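
# Minimal sketch of calling the topic modeler directly, outside Gradio
# (the sentences are made up for illustration; actual titles depend on the
# embeddings and clustering):
#   sents = [
#       "Solar panels convert sunlight into electricity and cut energy bills",
#       "Wind turbines generate renewable power from moving air",
#       "The national football team won the championship final last night",
#       "Fans celebrated the victory in the streets of the capital",
#   ]
#   topics, k = transformer_topic_modeling(sents, auto_topics=True, max_k=4)
#   # topics -> list of (title, example sentences) pairs, k -> chosen cluster count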

# ----------------------------
# 🚀 Gradio Interface Logic
# ----------------------------
def analyze_input(pdf_file, essay_text):
    pdf_text = ""
    if pdf_file:
        pdf_text = extract_text_from_pdf(pdf_file)

    full_text = (pdf_text + "\n" + (essay_text or "")).strip()
    if not full_text:
        return "❌ Please upload a PDF or write an essay."

    sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
    if len(sentences) < 2:
        return "⚠️ Not enough text for topic modeling."

    topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)

    # --- Display output ---
    output_text = f"βœ… **Detected {num_topics} Topics:**\n\n"
    for title, examples in topic_data:
        output_text += f"### {title}\n{examples}\n\n"

    return output_text
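
# Shape of the Markdown returned above (illustrative only; topic count and
# titles depend on the input text):
#   ✅ **Detected 2 Topics:**
#
#   ### Topic 1: Solar & energy & power
#   Solar panels convert sunlight into electricity and cut energy bills
#   ...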

# ----------------------------
# 🎨 Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="πŸ“‚ Upload PDF (optional)"),
        gr.Textbox(lines=10, placeholder="✍️ Write or paste your essay here...", label="Essay Text")
    ],
    outputs=gr.Markdown(label="🧠 Detected Topics"),
    title="PDF + Essay Topic Discovery (Transformer-Based)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
)

if __name__ == "__main__":
    demo.launch()