Spaces:

jfataphd
/

OncoDigger

Running

App Files Community

jfataphd committed on Apr 18, 2023

Commit

6ce67d2

1 Parent(s): 6337933

Update app.py

Browse files

Files changed (1) hide show

app.py +848 -272

app.py CHANGED Viewed

@@ -1,8 +1,10 @@
 import streamlit as st
 import time
 import json
 from gensim.models import Word2Vec
 import pandas as pd
 import matplotlib.pyplot as plt
 import squarify
 import numpy as np
@@ -12,12 +14,13 @@ import random
 import plotly.express as px
 st.set_page_config(
-    page_title="FATA4 Science",
                 page_icon=":microscope:",
                 layout="wide", #centered
                 initial_sidebar_state="auto",
                 menu_items={
-                    'About': "FATA4 Science is a Natural Language Processing (NLP) that ...."
                 }
                 )
@@ -44,38 +47,70 @@ st.markdown("""
     </style>
     """, unsafe_allow_html=True)
-opt=st.sidebar.radio("Select a PubMed Corpus", options=('Clotting corpus', 'Neuroblastoma corpus'))
-if opt == "Clotting corpus":
-    model_used = ("pubmed_model_clotting")
-    num_abstracts = 45493
-    database_name = "Clotting"
-if opt == "Neuroblastoma corpus":
-    model_used = ("pubmed_model_neuroblastoma")
-    num_abstracts = 29032
-    database_name = "Neuroblastoma"
-# if opt == "Breast Cancer corpus":
-#     model_used = ("pubmed_model_breast_cancer")
-#     num_abstracts = 290320
-#     database_name = "Breast_cancer"
-# if opt == "Mammary gland corpus":
-#     model_used = ("pubmed_model_mammary_gland")
-#     num_abstracts = 79032
-#     database_name = "Mammary_gland"
-st.header(":red[*F*]ast :red[*A*]cting :red[*T*]ext :red[*A*]nalysis (:red[*FATA*]) 4 Science")
-st.subheader("Uncovering knowledge through Natural Language Processing (NLP)")
 st.markdown("---")
 st.header(f":blue[{database_name} Pubmed corpus.]")
 text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
 query = text_input_value
 query = query.lower()
-query = re.sub("[,.?!&*;: ]", "", query)
-matches = [" "]
-if any([x in query for x in matches]):
-    st.write("Please only enter one term or a term without spaces")
-# query = input ("Enter your keyword(s):")
 if query:
     bar = st.progress(0)
     time.sleep(.05)
@@ -86,313 +121,839 @@ if query:
         time.sleep(.1)
     # try:
-    model = Word2Vec.load(model_used)  # you can continue training with the loaded model!
     words = list(model.wv.key_to_index)
     X = model.wv[model.wv.key_to_index]
     model2 = model.wv[query]
     df = pd.DataFrame(X)
     # except:
     #     st.error("Term occurrence is too low - please try another term")
     #     st.stop()
     st.markdown("---")
-    # def findRelationships(query, df):
     table = model.wv.most_similar_cosmul(query, topn=10000)
     table = (pd.DataFrame(table))
     table.index.name = 'Rank'
     table.columns = ['Word', 'SIMILARITY']
-    # print()
-    # print("Similarity to " + str(query))
     pd.set_option('display.max_rows', None)
     table2 = table.copy()
-    # print(table.head(50))
-    # table.head(10).to_csv("clotting_sim1.csv", index=True)
-    # short_table = table.head(50)
-    # print(table)
-    # Create the slider with increments of 5 up to 100
     st.markdown(
-        f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
-        f"<span style='color:red; font-style: italic;'>words</span> contextually "
-        f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
-        f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
         unsafe_allow_html=True)
-    value_word = st.slider("Words", 0, 100, step=5)
-    if value_word > 0:
-        # st.subheader(f"Top {value} genes closely related to {query}: "
-        #              f"Click on the Pubmed and NCBI links for more gene information")
-        st.markdown(
-            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
-            f"</span>words similar to "
-            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Wikipaedia links for more word information</span></p></b>",
-            unsafe_allow_html=True)
-        # calculate the sizes of the squares in the treemap
-        short_table = table2.head(value_word).round(2)
-        short_table.index += 1
-        short_table.index = (1 / short_table.index)*10
-        sizes = short_table.index.tolist()
-        short_table.set_index('Word', inplace=True)
-        # label = short_table.index.tolist()
-        # print(short_table.index)
-        table2["SIMILARITY"] = 'Similarity Score ' + table2.head(10)["SIMILARITY"].round(2).astype(str)
-        rank_num = list(short_table.index.tolist())
-        # avg_size = sum(sizes) / len(short_table.index)
-        df = short_table
-        try:
-            # Define the `text` column for labels and `href` column for links
-            df['text'] = short_table.index
-            df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
-                  '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
-            df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
-            df.loc[:,'database'] = database_name
-            # print(sizes)
-            # '{0} in {1}'.format(unicode(self.author, 'utf-8'), unicode(self.publication, 'utf-8'))
-            # Create the treemap using `px.treemap`
-            fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
                          hover_name=(table2.head(value_word)['SIMILARITY']))
-            fig.update(layout_coloraxis_showscale=False)
-            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
-            fig.update_annotations(visible=False)
-            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
-                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
-                          texttemplate="</b><br><span "
-                                       "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
-                                       "<a href='%{customdata[0]}'>PubMed"
-                                       "</a><br><a href='%{customdata[3]}'>Wikipedia"
-                                       "</span></a>")
-            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
-            # st.pyplot(fig2)
-            st.plotly_chart(fig, use_container_width=True)
-            # st.caption(
-            #     "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
-            # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
-            csv = table2.head(value_word).to_csv().encode('utf-8')
-            st.download_button(label=f"download top {value_word} words (csv)", data=csv, file_name=f'{database_name}_words.csv',
-                           mime='text/csv')
-        except:
-            st.warning(
-            f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
-    # st.write(short_table)
     #
     # print()
     # print("Human genes similar to " + str(query))
     df1 = table.copy()
-    df2 = pd.read_csv('Human_Genes.csv')
-    m = df1.Word.isin(df2.symbol)
-    df1 = df1[m].loc[:,:]
-    df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
-    df1["Human Gene"] = df1["Human Gene"].str.upper()
     # print(df1.head(50))
     # print()
     # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
     # time.sleep(2)
     # Create the slider with increments of 5 up to 100
-    st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
-                f"<span style='color:red; font-style: italic;'>genes</span> contextually "
-                f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
-                f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
-                    unsafe_allow_html=True)
-    value_gene = st.slider("Gene", 0, 100, step=5)
-    if value_gene > 0:
-        # st.subheader(f"Top {value} genes closely related to {query}: "
-        #              f"Click on the Pubmed and NCBI links for more gene information")
-        st.markdown(
-            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
-            f"</span>genes similar to "
-            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and NCBI links for more gene information</span></p></b>",
-            unsafe_allow_html=True)
-        df10 = df1.head(value_gene).copy()
-        df10.index = (1 / df10.index)*10000
-        sizes = df10.index.tolist()
-        df10.set_index('Human Gene', inplace=True)
-        df3 = df1.copy()
-        df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str)
-        df3.reset_index(inplace=True)
-        df3 = df3.rename(columns={'Human Gene': 'symbol2'})
-        # Use df.query to get a subset of df1 based on ids in df2
-        subset = df3.head(value_gene).query('symbol2 in @df2.symbol2')
-        # Use merge to join the two DataFrames on id
-        result = pd.merge(subset, df2, on='symbol2')
-        # Show the result
-        # print(result)
-        # label = df10.index.tolist()
-        # df2 = df10
-        # print(df2)
-        try:
-            # Define the `text` column for labels and `href` column for links
-            df10['text'] = df10.index
-            df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
-                  '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']]
-            df10['href2'] = [f'https://www.ncbi.nlm.nih.gov/gene/?term=' + c for c in df10['text']]
-            df10['name'] = [c for c in result['Approved name']]
-            assert isinstance(df10, object)
-            df10.loc[:,'database'] = database_name
-            # print(df['name'])
-            # Create the treemap using `px.treemap`
-            fig = px.treemap(df10, path=[df10['text']], values=sizes,
-                     custom_data=['href', 'name', 'database', 'href2', 'text'], hover_name=(df3.head(value_gene)['SIMILARITY']))
-            fig.update(layout_coloraxis_showscale=False)
-            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
-            fig.update_annotations(visible=False)
-            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
-                      hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
-                      texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}</span></b><br><span "
-                                   "style='font-family: Arial; font-size: 15px;'>%{customdata[1]}<br>"
-                                   "<a href='%{customdata[0]}'>PubMed"
-                                   "</a><br><a href='%{customdata[3]}'>NCBI"
-                                   "</span></a>")
-            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
-            # # display the treemap in Streamlit
-            # with treemap2:
-            # st.pyplot(fig2)
-            st.plotly_chart(fig, use_container_width=True)
-            st.caption("Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
-            st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
-            csv = df1.head(value_gene).to_csv().encode('utf-8')
-            st.download_button(label=f"download top {value_gene} genes (csv)", data=csv, file_name=f'{database_name}_genes.csv',
-                       mime='text/csv')
-        except:
-            st.warning(
-            f"This selection exceeds the number of similar genes related to {query} within the {database_name} corpus, please choose a lower number")
-    st.markdown("---")
-    # st.write(short_table)
-    #
     # print()
     # print("Human genes similar to " + str(query))
     df1 = table.copy()
-    df2 = pd.read_csv('protein.csv')
-    m = df1.Word.isin(df2.protein)
     df1 = df1[m]
-    df1.rename(columns={'Word': 'Protein'}, inplace=True)
-    # print(df1)
     df_len = len(df1)
-    # df1["Protein"] = df1["Protein"].str.upper()
     # print(df1.head(50))
     # print()
     # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
     # time.sleep(2)
     # Create the slider with increments of 5 up to 100
     st.markdown(
-        f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap with the slider below to visualize "
-        f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
-        f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
-        f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
         unsafe_allow_html=True)
-    value_protein = st.slider("Protein", 0, 100, step=5)
-    # print(value_protein)
-    if value_protein > 0:
-        # st.subheader(f"Top {value} genes closely related to {query}: "
-        #              f"Click on the Pubmed and NCBI links for more gene information")
-        st.markdown(
-            f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_protein} "
-            f"</span>proteins similar to "
-            f"<span style='color:red; font-style: italic;'>{query}:</span> Click on the squares to expand and the Pubmed and Wikipedia links for more protein information</span></p></b>",
-            unsafe_allow_html=True)
-        df11 = df1.head(value_protein).copy()
-        df11.index = (1 / df11.index) * 10000
-        sizes = df11.index.tolist()
-        df11.set_index('Protein', inplace=True)
-        df4 = df1.copy()
-        # print(df4.head(10))
-        df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_protein)["SIMILARITY"].round(2).astype(str)
-        df4.reset_index(inplace=True)
-        # df4 = df4.rename(columns={'Protein': 'symbol2'})
-        # print(df4)
-        # # Use df.query to get a subset of df1 based on ids in df2
-        # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
-        # # Use merge to join the two DataFrames on id
-        # result = pd.merge(subset, df2b, on='symbol2')
-        # print(result)
-        if value_protein <= df_len:
-            # Define the `text` column for labels and `href` column for links
-            df11['text'] = df11.index
-            df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
-                       '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
-            df11['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df11['text']]
-            assert isinstance(df11, object)
-            df11['database'] = database_name
-            # df11['name'] = [c for c in result['Approved name']]
-            # Create the treemap using `px.treemap`
-            fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
-                         hover_name=(df4.head(value_protein)['SIMILARITY']))
-            fig.update(layout_coloraxis_showscale=False)
-            fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
-            fig.update_annotations(visible=False)
-            fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                           hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
-                          texttemplate="<b><span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}</span></b><br>"
                                        "<a href='%{customdata[0]}'>PubMed"
-                                       "</a><br><a href='%{customdata[2]}'>Wikipedia"
                                        "</span></a>")
-            fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
-            # # display the treemap in Streamlit
-            # with treemap2:
-            # st.pyplot(fig2)
-            st.plotly_chart(fig, use_container_width=True)
-            st.caption(
-            "Protein designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
-            csv = df1.head(value_protein).to_csv().encode('utf-8')
-            st.download_button(label=f"download top {value_protein} proteins (csv)", data=csv, file_name=f'{database_name}_genes.csv',
-                           mime='text/csv')
-        else:
-            st.warning(f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
     st.subheader("Cancer-related videos")
     if query:
-        idlist=[]
         search_keyword = {query}
         html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer")
         html2 = urllib.request.urlopen("https://www.youtube.com/@CancerCenter/search?query=cancer")
@@ -418,15 +979,30 @@ if query:
         c1, c2, c3 = st.columns(3)
         with c1:
-           st.video("https://www.youtube.com/watch?v=" + video_ids[0])
         with c2:
-           st.video("https://www.youtube.com/watch?v=" + video_ids[1])
         with c3:
-           st.video("https://www.youtube.com/watch?v=" + video_ids[2])
     st.markdown("---")

 import streamlit as st
 import time
+import concurrent.futures
 import json
 from gensim.models import Word2Vec
 import pandas as pd
+import threading
 import matplotlib.pyplot as plt
 import squarify
 import numpy as np
 import plotly.express as px
 st.set_page_config(
+    page_title="Abstractalytics",
                 page_icon=":microscope:",
                 layout="wide", #centered
                 initial_sidebar_state="auto",
                 menu_items={
+                    'About': "Abstractalytics is a Natural Language Processing (NLP) that harnesses Word2Vec to mine"
+                             " insight from pubmed abstracts. Created by Jimmie E. Fata, PhD"
                 }
                 )
     </style>
     """, unsafe_allow_html=True)
+st.header(":red[*Abstractalytics*]")
+st.subheader("*A web app designed to explore :red[*PubMed abstracts*] for deeper understanding and fresh insights, driven "
+             "by Natural Language Processing (NLP) techniques.*")
+def custom_subheader(text, identifier, font_size):
+    st.markdown(f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>", unsafe_allow_html=True)
+custom_subheader("Welcome to our innovative web2vec app designed to unlock the wealth of knowledge and insights hidden "
+             "within PubMed abstracts! To begin, simply select a corpus that interests you. Next, enter a single keyword "
+             "you wish to explore within the corpus. Abstractalytics powerful Natural Language "
+             "Processing (NLP) algorithms will analyze the chosen corpus and present you with a list of top words, "
+             "genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
+             "to your input. This advanced text-mining technique enables you to explore and understand complex "
+             "relationships, uncovering new discoveries and connections in your field of research across a massive "
+             "amount of abstracts. Dive in and enjoy the exploration! More oncology-related corpora comming soon.", "unique-id", 18)
 st.markdown("---")
+#Define the correct password
+# CORRECT_PASSWORD = "123"
+# Define a function to check if the password is correct
+# def authenticate(password):
+#     if password == CORRECT_PASSWORD:
+#         return True
+#     else:
+#         return False
+#
+# # Create a Streamlit input field for the password
+# password = st.text_input("Enter password:", type="password")
+#
+# # If the password is correct, show the app content
+# if authenticate(password):
+opt = st.sidebar.radio("Select a PubMed Corpus",
+                           options=(
+                                    'Breast Cancer corpus', 'Lung Cancer corpus'))
+# if opt == "Clotting corpus":
+#     model_used = ("pubmed_model_clotting")
+#     num_abstracts = 45493
+#     database_name = "Clotting"
+# if opt == "Neuroblastoma corpus":
+#     model_used = ("pubmed_model_neuroblastoma")
+#     num_abstracts = 29032
+#     database_name = "Neuroblastoma"
+if opt == "Breast Cancer corpus":
+    model_used = ("pubmed_model_breast_cancer2")
+    num_abstracts = 290320
+    database_name = "Breast_cancer"
+if opt == "Lung Cancer corpus":
+    model_used = ("lung_cancer_pubmed_model")
+    num_abstracts = 210320
+    database_name = "Lung_cancer"
 st.header(f":blue[{database_name} Pubmed corpus.]")
 text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
 query = text_input_value
 query = query.lower()
+query = re.sub("[,.?!&*;:]", "", query)
+query = re.sub(" ", "-", query)
+# matches = [" "]
+# if any([x in query for x in matches]):
+#     st.write("Please only enter one term or a term without spaces")
+# # query = input ("Enter your keyword(s):")
 if query:
     bar = st.progress(0)
     time.sleep(.05)
         time.sleep(.1)
     # try:
+    model = Word2Vec.load(f"{model_used}")  # you can continue training with the loaded model!
     words = list(model.wv.key_to_index)
     X = model.wv[model.wv.key_to_index]
+    # print(model.wv['bfgf'])
     model2 = model.wv[query]
+    # print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
     df = pd.DataFrame(X)
+    def get_compound_ids(compound_names):
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            compound_ids = list(executor.map(get_compound_id, compound_names))
+        return compound_ids
+    import requests
+    def get_compound_id(compound_name):
+        url = f"http://rest.kegg.jp/find/compound/{compound_name}"
+        response = requests.get(url)
+        if response.status_code == 200:
+            result = response.text.split('\n')
+            if result[0]:
+                compound_id = result[0].split('\t')[0]
+                return compound_id
+        return None
     # except:
     #     st.error("Term occurrence is too low - please try another term")
     #     st.stop()
     st.markdown("---")
     table = model.wv.most_similar_cosmul(query, topn=10000)
     table = (pd.DataFrame(table))
     table.index.name = 'Rank'
     table.columns = ['Word', 'SIMILARITY']
     pd.set_option('display.max_rows', None)
     table2 = table.copy()
+    # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
+    #             f"<span style='color:red; font-style: italic;'>words</span> contextually "
+    #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #     unsafe_allow_html=True)
+    # Set the max number of words to display
+    value_word = min(100, len(table2))
     st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
+        f"</span>words contextually and semantically similar to "
+        f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+        f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information</span></p></b>",
         unsafe_allow_html=True)
+    short_table = table2.head(value_word).round(2)
+    short_table.index += 1
+    short_table.index = (1 / short_table.index) * 10
+    sizes = short_table.index.tolist()
+    short_table.set_index('Word', inplace=True)
+    table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
+    rank_num = list(short_table.index.tolist())
+    df = short_table
+    try:
+        df['text'] = short_table.index
+        df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                      '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
+        df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]
+        df.loc[:, 'database'] = database_name
+        fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
                          hover_name=(table2.head(value_word)['SIMILARITY']))
+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                              texttemplate="<br><span "
+                                           "style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
+                                           "<a href='%{customdata[0]}'>PubMed"
+                                           "</a><br><br><a href='%{customdata[3]}'>Wikipedia"
+                                           "</span></a>")
+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])
+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)
+        # st.caption(
+        #     "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+        # st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
+        csv = table2.head(value_word).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_word} words (csv)", data=csv,
+                               file_name=f'{database_name}_words.csv', mime='text/csv')
+    except:
+        st.warning(
+                f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")
+    # st.markdown("---")
+    # # st.write(short_table)
+    # #
+    #
+    # # print()
+    # # print("Human genes similar to " + str(query))
+    # df1 = table.copy()
+    # df2 = pd.read_csv('Human Genes.csv')
+    # m = df1.Word.isin(df2.symbol)
+    # df1 = df1[m]
+    # df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
+    # df1["Human Gene"] = df1["Human Gene"].str.upper()
+    # # print(df1.head(50))
+    # # print()
+    # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+    # # time.sleep(2)
+    # # Create the slider with increments of 5 up to 100
+    #
+    # # Set the maximum number of genes to display up to 100
+    # value_gene = min(len(df1), 100)
+    #
+    # if value_gene > 0:
+    #     # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Treemap visualization of "
+    #     #             f"<span style='color:red; font-style: italic;'>genes</span> contextually "
+    #     #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #     #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #     #     unsafe_allow_html=True)
+    #
+    #     st.markdown(
+    #         f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
+    #         f"</span>genes contextually and semantically similar to "
+    #         f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. "
+    #         f"Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
+    #         unsafe_allow_html=True)
+    #
+    #     df10 = df1.head(value_gene).copy()
+    #     df10.index = (1 / df10.index) * 100000
+    #     sizes = df10.index.tolist()
+    #     df10.set_index('Human Gene', inplace=True)
+    #
+    #     df3 = df1.copy()
+    #     df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str)
+    #     df3.reset_index(inplace=True)
+    #     df3 = df3.rename(columns={'Human Gene': 'symbol2'})
+    #     # Use df.query to get a subset of df1 based on ids in df2
+    #     subset = df3.head(value_gene).query('symbol2 in @df2.symbol2')
+    #     # Use merge to join the two DataFrames on id
+    #     result = pd.merge(subset, df2, on='symbol2')
+    #     # Show the result
+    #     # print(result)
+    #     # label = df10.index.tolist()
+    #     # df2 = df10
+    #     # print(df2)
+    #     try:
+    #         # Define the `text` column for labels and `href` column for links
+    #         df10['text'] = df10.index
+    #         df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+    #                         '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']]
+    #         df10['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df10['text']]
+    #
+    #         df10['name'] = [c for c in result['Approved name']]
+    #         assert isinstance(df10, object)
+    #         df10.loc[:, 'database'] = database_name
+    #
+    #         # print(df['name'])
+    #
+    #         # Create the treemap using `px.treemap`
+    #         fig = px.treemap(df10, path=[df10['text']], values=sizes,
+    #                          custom_data=['href', 'name', 'database', 'href2', 'text'],
+    #                          hover_name=(df3.head(value_gene)['SIMILARITY']))
+    #
+    #         fig.update(layout_coloraxis_showscale=False)
+    #         fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+    #         fig.update_annotations(visible=False)
+    #         fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+    #                           hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+    #                           texttemplate="<br><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}<br><br>"
+    #                                        "%{customdata[1]}<br><br>"
+    #                                        "<a href='%{customdata[0]}'>PubMed"
+    #                                        "</a><br><br><a href='%{customdata[3]}'>GeneCard"
+    #                                        "</span></a>")
+    #         fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
+    #         # # display the treemap in Streamlit
+    #         # with treemap2:
+    #
+    #         # st.pyplot(fig2)
+    #         st.plotly_chart(fig, use_container_width=True)
+    #
+    #         st.caption(
+    #             "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+    #         st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
+    #         st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
+    #
+    #         csv = df1.head(value_gene).to_csv().encode('utf-8')
+    #         st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
+    #                            file_name=f'{database_name}_genes.csv', mime='text/csv')
+    #
+    #
+    #     except:
+    #         st.warning(f"No similar genes related to {query} within the {database_name} corpus were found.")
+    st.markdown("---")
+    df1 = table.copy()
+    df2 = pd.read_csv('Human Genes.csv')
+    m = df1.Word.isin(df2.symbol)
+    df1 = df1[m]
+    df1.rename(columns={'Word': 'Genes'}, inplace=True)
+    df_len = len(df1)
+    print(len(df1))
+    # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
+    #             f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
+    #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #     unsafe_allow_html=True)
+    # Set the number of proteins to display
+    value_gene = min(df_len, 100)
+    st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
+        f"</span>human genes contextually and semantically similar to "
+        f"<span style='color:red; font-style: italic;'>{query}  </span>within the <span style='color:red; font-style: italic;'>{database_name} </span>corpus. Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
+        unsafe_allow_html=True)
+    df11 = df1.head(value_gene).copy()
+    df11.index = (1 / df11.index) * 10000
+    sizes = df11.index.tolist()
+    df11.set_index('Genes', inplace=True)
+    df4 = df1.copy()
+    # print(df4.head(10))
+    df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_gene)["SIMILARITY"].round(2).astype(str)
+    df4.reset_index(inplace=True)
+    # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # print(df4)
+    # # Use df.query to get a subset of df1 based on ids in df2
+    # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # Use merge to join the two DataFrames on id
+    # result = pd.merge(subset, df2b, on='symbol2')
+    # print(result)
+    if value_gene <= df_len:
+        # Define the `text` column for labels and `href` column for links
+        df11['text'] = df11.index
+        df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
+        df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
+        assert isinstance(df11, object)
+        df11['database'] = database_name
+        # df11['name'] = [c for c in result['Approved name']]
+        # Create the treemap using `px.treemap`
+        fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                             hover_name=(df4.head(value_gene)['SIMILARITY']))
+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                           "<a href='%{customdata[0]}'>PubMed"
+                                           "</a><br><br><a href='%{customdata[2]}'>GeneCard"
+                                           "</span></a>")
+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
+        # # display the treemap in Streamlit
+        # with treemap2:
+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)
+        # st.caption(
+        #         "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
+        # st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
+        st.caption("Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
+        st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
+        st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
+        csv = df1.head(value_gene).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
+                               file_name=f'{database_name}_genes.csv', mime='text/csv')
+    else:
+        st.warning(
+                f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
+    st.markdown("---")
+    # print()
+    # print("Human genes similar to " + str(query))
+    df1 = table.copy()
+    df2 = pd.read_csv('kegg_drug_list_lowercase.csv')
+    m = df1.Word.isin(df2.drugs)
+    df1 = df1[m]
+    df1.rename(columns={'Word': 'Drugs'}, inplace=True)
+    df_len = len(df1)
+    # print(len(df1))
+    # df1["Human Gene"] = df1["Human Gene"].str.upper()
+    # print(df1.head(50))
+    # print()
+    # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+    # time.sleep(2)
+    # Create the slider with increments of 5 up to 100
+    # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+    value_drug = min(df1.shape[0], 100)
+    # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+    #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+    #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #     unsafe_allow_html=True)
+    st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_drug} "
+        f"</span>Drugs contextually and semantically similar to "
+        f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
+        unsafe_allow_html=True)
+    df13 = df1.head(value_drug).copy()
+    df13.index = (1 / df13.index) * 10000
+    sizes = df13.index.tolist()
+    df13.set_index('Drugs', inplace=True)
+    df6 = df1.copy()
+    # print(df4.head(10))
+    df6["SIMILARITY"] = 'Similarity Score ' + df6.head(value_drug)["SIMILARITY"].round(2).astype(str)
+    df6.reset_index(inplace=True)
+    # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # print(df4)
+    # # Use df.query to get a subset of df1 based on ids in df2
+    # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # Use merge to join the two DataFrames on id
+    # result = pd.merge(subset, df2b, on='symbol2')
+    # print(result)
+    if value_drug <= df_len:
+        # Define the `text` column for labels and `href` column for links
+        # Reset the index
+        df13.reset_index(inplace=True)
+        # Replace hyphens with spaces in the 'text' column
+        df13['Drugs'] = df13['Drugs'].str.replace('-', ' ')
+        # Set the 'text' column back as the index
+        df13.set_index('Drugs', inplace=True)
+        df13['text'] = df13.index
+        df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                            '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
+        df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
+        assert isinstance(df13, object)
+        df13['database'] = database_name
+        # df11['name'] = [c for c in result['Approved name']]
+        # Create the treemap using `px.treemap`
+        fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                             hover_name=(df6.head(value_drug)['SIMILARITY']))
+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                              hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                              texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                           "<a href='%{customdata[0]}'>PubMed"
+                                           "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                           "</span></a>")
+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
+        # # display the treemap in Streamlit
+        # with treemap2:
+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)
+        st.caption(
+                "Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")
+        csv = df1.head(value_drug).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
+                               file_name=f'{database_name}_drugs.csv', mime='text/csv')
+    else:
+        st.warning(
+                f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
     #
+    # st.markdown("---")
+    # # print()
+    # # print("Human genes similar to " + str(query))
+    # df1 = table.copy()
+    # df2 = pd.read_csv('diseasesKegg.csv')
+    # m = df1.Word.isin(df2.disease)
+    # df1 = df1[m]
+    # df1.rename(columns={'Word': 'Disease'}, inplace=True)
+    # df_len = len(df1)
+    # # print(len(df1))
+    # # df1["Human Gene"] = df1["Human Gene"].str.upper()
+    # # print(df1.head(50))
+    # # print()
+    # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+    # # time.sleep(2)
+    # # Create the slider with increments of 5 up to 100
+    #
+    # # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+    # value_disease = min(df1.shape[0], 100)
+    #
+    # # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+    # #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+    # #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    # #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    # #     unsafe_allow_html=True)
+    #
+    # st.markdown(
+    #     f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_disease} "
+    #     f"</span>Diseases contextually and semantically similar to "
+    #     f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
+    #     unsafe_allow_html=True)
+    #
+    # df14 = df1.head(value_disease).copy()
+    #
+    # df14.index = (1 / df14.index) * 10000
+    # sizes = df14.index.tolist()
+    #
+    # df14.set_index('Disease', inplace=True)
+    #
+    # df7 = df1.copy()
+    # # print(df4.head(10))
+    # df7["SIMILARITY"] = 'Similarity Score ' + df7.head(value_disease)["SIMILARITY"].round(2).astype(str)
+    # df7.reset_index(inplace=True)
+    # # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # # print(df4)
+    # # # Use df.query to get a subset of df1 based on ids in df2
+    # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # # Use merge to join the two DataFrames on id
+    # # result = pd.merge(subset, df2b, on='symbol2')
+    # # print(result)
+    # if value_disease <= df_len:
+    #     # Define the `text` column for labels and `href` column for links
+    #     # Reset the index
+    #     df14.reset_index(inplace=True)
+    #
+    #     # Replace hyphens with spaces in the 'text' column
+    #     df14['Disease'] = df14['Disease'].str.replace('-', ' ')
+    #
+    #     # Set the 'text' column back as the index
+    #     df14.set_index('Disease', inplace=True)
+    #     df14['text'] = df14.index
+    #     df14['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+    #                     '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df14['text']]
+    #     df14['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df14['text']]
+    #     assert isinstance(df14, object)
+    #     df14['database'] = database_name
+    #
+    #     # df11['name'] = [c for c in result['Approved name']]
+    #
+    #     # Create the treemap using `px.treemap`
+    #     fig = px.treemap(df14, path=[df14['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+    #                      hover_name=(df7.head(value_disease)['SIMILARITY']))
+    #
+    #     fig.update(layout_coloraxis_showscale=False)
+    #     fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+    #     fig.update_annotations(visible=False)
+    #     fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+    #                       hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+    #                       texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+    #                                    "<a href='%{customdata[0]}'>PubMed"
+    #                                    "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+    #                                    "</span></a>")
+    #     fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["PaleGoldenRod"])
+    #     # # display the treemap in Streamlit
+    #     # with treemap2:
+    #
+    #     # st.pyplot(fig2)
+    #     st.plotly_chart(fig, use_container_width=True)
+    #
+    #     st.caption("Disease designation and database provided by KEGG: https://www.genome.jp/kegg/disease/")
+    #
+    #     csv = df1.head(value_disease).to_csv().encode('utf-8')
+    #     st.download_button(label=f"download top {value_disease} diseases (csv)", data=csv,
+    #                        file_name=f'{database_name}_disease.csv', mime='text/csv')
+    #
+    #
+    # else:
+    #     st.warning(
+    #         f"This selection exceeds the number of similar diseases related to {query} within the {database_name} corpus, please choose a lower number")
+    # st.markdown("---")
+    # st.markdown("---")
+    # # print()
+    # # print("Human genes similar to " + str(query))
+    # df1 = table.copy()
+    # df2 = pd.read_csv('pathwaysKegg.csv')
+    # m = df1.Word.isin(df2.pathway)
+    # df1 = df1[m]
+    # df1.rename(columns={'Word': 'Pathway'}, inplace=True)
+    # df_len = len(df1)
+    # # print(len(df1))
+    # # df1["Human Gene"] = df1["Human Gene"].str.upper()
+    # # print(df1.head(50))
+    # # print()
+    # # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
+    # # time.sleep(2)
+    # # Create the slider with increments of 5 up to 100
+    #
+    # # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+    # value_pathway = min(df1.shape[0], 100)
+    #
+    # # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+    # #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+    # #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    # #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    # #     unsafe_allow_html=True)
+    #
+    # st.markdown(
+    #     f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_pathway} "
+    #     f"</span>Pathways contextually and semantically similar to "
+    #     f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
+    #     unsafe_allow_html=True)
+    #
+    # df16 = df1.head(value_pathway).copy()
+    #
+    # df16.index = (1 / df16.index) * 10000
+    # sizes = df16.index.tolist()
+    #
+    # df16.set_index('Pathway', inplace=True)
+    #
+    # df9 = df1.copy()
+    # # print(df4.head(10))
+    # df9["SIMILARITY"] = 'Similarity Score ' + df9.head(value_pathway)["SIMILARITY"].round(2).astype(str)
+    # df9.reset_index(inplace=True)
+    # # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # # print(df4)
+    # # # Use df.query to get a subset of df1 based on ids in df2
+    # # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # # Use merge to join the two DataFrames on id
+    # # result = pd.merge(subset, df2b, on='symbol2')
+    # # print(result)
+    # if value_pathway <= df_len:
+    #     # Define the `text` column for labels and `href` column for links
+    #     # Reset the index
+    #     df16.reset_index(inplace=True)
+    #
+    #     # Replace hyphens with spaces in the 'text' column
+    #     df16['Pathway'] = df16['Pathway'].str.replace('-', ' ')
+    #
+    #     # Set the 'text' column back as the index
+    #     df16.set_index('Pathway', inplace=True)
+    #     df16['text'] = df16.index
+    #     df16['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+    #                     '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df16['text']]
+    #     df16['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df16['text']]
+    #     assert isinstance(df16, object)
+    #     df16['database'] = database_name
+    #
+    #     # df11['name'] = [c for c in result['Approved name']]
+    #
+    #     # Create the treemap using `px.treemap`
+    #     fig = px.treemap(df16, path=[df16['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+    #                      hover_name=(df9.head(value_pathway)['SIMILARITY']))
+    #
+    #     fig.update(layout_coloraxis_showscale=False)
+    #     fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+    #     fig.update_annotations(visible=False)
+    #     fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+    #                       hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+    #                       texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+    #                                    "<a href='%{customdata[0]}'>PubMed"
+    #                                    "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+    #                                    "</span></a>")
+    #     fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["FloralWhite"])
+    #     # # display the treemap in Streamlit
+    #     # with treemap2:
+    #
+    #     # st.pyplot(fig2)
+    #     st.plotly_chart(fig, use_container_width=True)
+    #
+    #     st.caption("Pathway designation and database provided by KEGG: https://www.genome.jp/kegg/pathway.html")
+    #
+    #     csv = df1.head(value_pathway).to_csv().encode('utf-8')
+    #     st.download_button(label=f"download top {value_pathway} pathways (csv)", data=csv,
+    #                        file_name=f'{database_name}_pathways.csv', mime='text/csv')
+    #
+    #
+    # else:
+    #     st.warning(
+    #         f"This selection exceeds the number of similar pathways related to {query} within the {database_name} corpus, please choose a lower number")
+    # st.markdown("---")
+    st.markdown("---")
     # print()
     # print("Human genes similar to " + str(query))
     df1 = table.copy()
+    df2 = pd.read_csv('phytochemicals.csv')
+    m = df1.Word.isin(df2.phyto)
+    df1 = df1[m]
+    df1.rename(columns={'Word': 'Phytochemical'}, inplace=True)
+    df_len = len(df1)
+    # print(len(df1))
+    # df1["Human Gene"] = df1["Human Gene"].str.upper()
     # print(df1.head(50))
     # print()
     # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
     # time.sleep(2)
     # Create the slider with increments of 5 up to 100
+    # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+    value_phyto = min(df1.shape[0], 100)
+    # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+    #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+    #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #     unsafe_allow_html=True)
+    st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_phyto} "
+        f"</span>Phytochemicals contextually and semantically similar to "
+        f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+        f"Click on the squares to expand and also the Pubmed and Wikipedia links for more compound information</span></p></b>",
+        unsafe_allow_html=True)
+    df15 = df1.head(value_phyto).copy()
+    df15.index = (1 / df15.index) * 10000
+    sizes = df15.index.tolist()
+    df15.set_index('Phytochemical', inplace=True)
+    df8 = df1.copy()
+    # print(df4.head(10))
+    df8["SIMILARITY"] = 'Similarity Score ' + df8.head(value_phyto)["SIMILARITY"].round(2).astype(str)
+    df8.reset_index(inplace=True)
+    # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # print(df4)
+    # # Use df.query to get a subset of df1 based on ids in df2
+    # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # Use merge to join the two DataFrames on id
+    # result = pd.merge(subset, df2b, on='symbol2')
+    # print(result)
+    if value_phyto <= df_len:
+        # Define the `text` column for labels and `href` column for links
+        # Reset the index
+        df15.reset_index(inplace=True)
+        # Replace hyphens with spaces in the 'text' column
+        df15['Phytochemical'] = df15['Phytochemical'].str.replace('-', ' ')
+        # Set the 'text' column back as the index
+        df15.set_index('Phytochemical', inplace=True)
+        df15['text'] = df15.index
+        df15['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df15['text']]
+        df15['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df15['text']]
+        assert isinstance(df15, object)
+        df15['database'] = database_name
+        # df11['name'] = [c for c in result['Approved name']]
+        # Create the treemap using `px.treemap`
+        fig = px.treemap(df15, path=[df15['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
+                         hover_name=(df8.head(value_phyto)['SIMILARITY']))
+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
+                          hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
+                                       "<a href='%{customdata[0]}'>PubMed"
+                                       "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                       "</span></a>")
+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightSeaGreen"])
+        # # display the treemap in Streamlit
+        # with treemap2:
+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)
+        st.caption("Phytochemical designation and database provided by PhytoHub: https://phytohub.eu/")
+        csv = df1.head(value_phyto).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_phyto} phytochemicals (csv)", data=csv,
+                           file_name=f'{database_name}_phytochemicals.csv', mime='text/csv')
+    else:
+        st.warning(
+            f"This selection exceeds the number of similar pythochemicals related to {query} within the {database_name} corpus, please choose a lower number")
+    st.markdown("---")
     # print()
     # print("Human genes similar to " + str(query))
     df1 = table.copy()
+    df2 = pd.read_csv('kegg_compounds_lowercase.csv')
+    m = df1.Word.isin(df2.compound)
     df1 = df1[m]
+    df1.rename(columns={'Word': 'Compounds'}, inplace=True)
     df_len = len(df1)
+    # df1["Human Gene"] = df1["Human Gene"].str.upper()
     # print(df1.head(50))
     # print()
     # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
     # time.sleep(2)
     # Create the slider with increments of 5 up to 100
+    # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
+    value_compound = min(df1.shape[0], 100)
+    # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
+    #             f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
+    #             f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
+    #             f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
+    #     unsafe_allow_html=True)
     st.markdown(
+        f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_compound} "
+        f"</span>Compounds contextually and semantically similar to "
+        f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
+        f"Click on the squares to expand and the Pubmed, Wikipedia, and KEGG links for more compound information (may take time to load)</span></p></b>",
         unsafe_allow_html=True)
+    df12 = df1.head(value_compound).copy()
+    df12.index = (1 / df12.index) * 10000
+    sizes = df12.index.tolist()
+    df12.set_index('Compounds', inplace=True)
+    df5 = df1.copy()
+    # print(df4.head(10))
+    df5["SIMILARITY"] = 'Similarity Score ' + df5.head(value_compound)["SIMILARITY"].round(2).astype(str)
+    df5.reset_index(inplace=True)
+    # df4 = df4.rename(columns={'Protein': 'symbol2'})
+    # print(df4)
+    # # Use df.query to get a subset of df1 based on ids in df2
+    # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
+    # # Use merge to join the two DataFrames on id
+    # result = pd.merge(subset, df2b, on='symbol2')
+    # print(result)
+    if value_compound <= df_len:
+        # Define the `text` column for labels and `href` column for links
+        # Reset the index
+        df12.reset_index(inplace=True)
+        # Replace hyphens with spaces in the 'text' column
+        df12['Compounds'] = df12['Compounds'].str.replace('-', ' ')
+        # Set the 'text' column back as the index
+        df12.set_index('Compounds', inplace=True)
+        df12['text'] = df12.index
+        df12['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
+                        '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df12['text']]
+        df12['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df12['text']]
+        df12['href3'] = [f'https://www.genome.jp/entry/{compound_id}' for compound_id in get_compound_ids(df12['text'])]
+        assert isinstance(df12, object)
+        df12['database'] = database_name
+        # df11['name'] = [c for c in result['Approved name']]
+        # Create the treemap using `px.treemap`
+        fig = px.treemap(df12, path=[df12['text']], values=sizes,
+                         custom_data=['href', 'database', 'href2', 'text', 'href3'],
+                         hover_name=(df5.head(value_compound)['SIMILARITY']))
+        fig.update(layout_coloraxis_showscale=False)
+        fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
+        fig.update_annotations(visible=False)
+        fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
                           hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
+                          texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
                                        "<a href='%{customdata[0]}'>PubMed"
+                                       "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
+                                       "</a><br><br><a href='%{customdata[4]}'>KEGG Compound Page"
                                        "</span></a>")
+        fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightYellow"])
+        # # display the treemap in Streamlit
+        # with treemap2:
+        # st.pyplot(fig2)
+        st.plotly_chart(fig, use_container_width=True)
+        st.caption("Compound designation and database provided by KEGG: https://www.kegg.jp/kegg/compound/")
+        csv = df1.head(value_compound).to_csv().encode('utf-8')
+        st.download_button(label=f"download top {value_compound} compounds (csv)", data=csv,
+                           file_name=f'{database_name}_compounds.csv', mime='text/csv')
+    else:
+        st.warning(
+            f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
     st.markdown("---")
+    def save_comment(comment):
+        with open('comments.txt', 'a') as f:
+            f.write(f'{comment}\n')
+    def save_comment_threaded(comment):
+        t = threading.Thread(target=save_comment, args=(comment,))
+        t.start()
+    st.title("Abstractalytics Web App")
+    st.write("We appreciate your feedback!")
+    user_comment = st.text_area("Please send us your anonymous remarks/suggestions about the Abstractalytics Web App: "
+                                "(app will pause while we save your comments)")
+    if st.button("Submit"):
+        if user_comment:
+            save_comment_threaded(user_comment)
+            st.success("Your comment has been saved. Thank you for your feedback!")
+        else:
+            st.warning("Please enter a comment before submitting.")
+    st.markdown("---")
+    # Video section: when a query was entered, embed three YouTube videos in
+    # three side-by-side columns.
     st.subheader("Cancer-related videos")
     if query:
+        # NOTE(review): `{query}` builds a one-element set; neither it nor
+        # `idlist` is used in the lines visible here.
+        idlist = []
         search_keyword = {query}
+        # Both requests use a fixed `query=cancer` search URL. The responses
+        # are not read or closed in the lines visible here.
         html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer")
         html2 = urllib.request.urlopen("https://www.youtube.com/@CancerCenter/search?query=cancer")
+        # NOTE(review): `video_ids` is not assigned in the visible lines -
+        # confirm where it is populated (presumably parsed from the pages
+        # fetched above) and that it always has at least three entries.
         c1, c2, c3 = st.columns(3)
         with c1:
+            st.video("https://www.youtube.com/watch?v=" + video_ids[0])
         with c2:
+            st.video("https://www.youtube.com/watch?v=" + video_ids[1])
         with c3:
+            st.video("https://www.youtube.com/watch?v=" + video_ids[2])
     st.markdown("---")
+# else:
+#     st.error("The password you entered is incorrect.")