Spaces:

jfataphd
/

OncoDigger

Running

App Files Files Community

OncoDigger / app.py

jfataphd

Update app.py

6ce67d2 over 2 years ago

raw

history blame

48.2 kB

	import streamlit as st
	import time
	import concurrent.futures
	import json
	from gensim.models import Word2Vec
	import pandas as pd
	import threading
	import matplotlib.pyplot as plt
	import squarify
	import numpy as np
	import re
	import urllib.request
	import random
	import plotly.express as px

	st.set_page_config(
	page_title="Abstractalytics",
	page_icon=":microscope:",
	layout="wide", #centered
	initial_sidebar_state="auto",
	menu_items={
	'About': "Abstractalytics is a Natural Language Processing (NLP) that harnesses Word2Vec to mine"
	" insight from pubmed abstracts. Created by Jimmie E. Fata, PhD"
	}
	)

	# Define the HTML and CSS styles
	st.markdown("""
	<style>
	[data-testid=stSidebar] {
	background-color: #99CCFF;
	}
	</style>
	""", unsafe_allow_html=True)
	st.markdown("""
	<style>
	body {
	background-color: #CCFFFF;
	# color: #ffffff;
	# font-size: 1px
	}
	.stApp {
	background-color: #CCFFFF;
	# color: #ffffff;
	# font-size: 1px
	}
	</style>
	""", unsafe_allow_html=True)

	st.header(":red[Abstractalytics]")

	st.subheader("A web app designed to explore :red[PubMed abstracts*] for deeper understanding and fresh insights, driven "
	"by Natural Language Processing (NLP) techniques.*")

	def custom_subheader(text, identifier, font_size):
	st.markdown(f"<h3 id='{identifier}' style='font-size: {font_size}px;'>{text}</h3>", unsafe_allow_html=True)

	custom_subheader("Welcome to our innovative web2vec app designed to unlock the wealth of knowledge and insights hidden "
	"within PubMed abstracts! To begin, simply select a corpus that interests you. Next, enter a single keyword "
	"you wish to explore within the corpus. Abstractalytics powerful Natural Language "
	"Processing (NLP) algorithms will analyze the chosen corpus and present you with a list of top words, "
	"genes, drugs, phytochemicals, and compounds that are contextually and semantically related "
	"to your input. This advanced text-mining technique enables you to explore and understand complex "
	"relationships, uncovering new discoveries and connections in your field of research across a massive "
	"amount of abstracts. Dive in and enjoy the exploration! More oncology-related corpora comming soon.", "unique-id", 18)

	st.markdown("---")

	#Define the correct password
	# CORRECT_PASSWORD = "123"

	# Define a function to check if the password is correct
	# def authenticate(password):
	# if password == CORRECT_PASSWORD:
	# return True
	# else:
	# return False
	#
	# # Create a Streamlit input field for the password
	# password = st.text_input("Enter password:", type="password")
	#
	# # If the password is correct, show the app content
	# if authenticate(password):
	opt = st.sidebar.radio("Select a PubMed Corpus",
	options=(
	'Breast Cancer corpus', 'Lung Cancer corpus'))
	# if opt == "Clotting corpus":
	# model_used = ("pubmed_model_clotting")
	# num_abstracts = 45493
	# database_name = "Clotting"
	# if opt == "Neuroblastoma corpus":
	# model_used = ("pubmed_model_neuroblastoma")
	# num_abstracts = 29032
	# database_name = "Neuroblastoma"
	if opt == "Breast Cancer corpus":
	model_used = ("pubmed_model_breast_cancer2")
	num_abstracts = 290320
	database_name = "Breast_cancer"
	if opt == "Lung Cancer corpus":
	model_used = ("lung_cancer_pubmed_model")
	num_abstracts = 210320
	database_name = "Lung_cancer"

	st.header(f":blue[{database_name} Pubmed corpus.]")
	text_input_value = st.text_input(f"Enter one term to search within the {database_name} corpus")
	query = text_input_value
	query = query.lower()
	query = re.sub("[,.?!&*;:]", "", query)
	query = re.sub(" ", "-", query)
	# matches = [" "]
	# if any([x in query for x in matches]):
	# st.write("Please only enter one term or a term without spaces")
	# # query = input ("Enter your keyword(s):")
	if query:
	bar = st.progress(0)
	time.sleep(.05)
	st.caption(f"Searching {num_abstracts} {database_name} PubMed abstracts covering 1990-2022")

	for i in range(10):
	bar.progress((i + 1) * 10)
	time.sleep(.1)

	# try:
	model = Word2Vec.load(f"{model_used}") # you can continue training with the loaded model!
	words = list(model.wv.key_to_index)
	X = model.wv[model.wv.key_to_index]
	# print(model.wv['bfgf'])
	model2 = model.wv[query]
	# print(model.wv.similar_by_word('bfgf', topn=50, restrict_vocab=None))
	df = pd.DataFrame(X)

	def get_compound_ids(compound_names):
	with concurrent.futures.ThreadPoolExecutor() as executor:
	compound_ids = list(executor.map(get_compound_id, compound_names))
	return compound_ids


	import requests


	def get_compound_id(compound_name):
	url = f"http://rest.kegg.jp/find/compound/{compound_name}"
	response = requests.get(url)
	if response.status_code == 200:
	result = response.text.split('\n')
	if result[0]:
	compound_id = result[0].split('\t')[0]
	return compound_id
	return None

	# except:
	# st.error("Term occurrence is too low - please try another term")
	# st.stop()
	st.markdown("---")

	table = model.wv.most_similar_cosmul(query, topn=10000)
	table = (pd.DataFrame(table))
	table.index.name = 'Rank'
	table.columns = ['Word', 'SIMILARITY']

	pd.set_option('display.max_rows', None)
	table2 = table.copy()

	# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
	# f"<span style='color:red; font-style: italic;'>words</span> contextually "
	# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
	# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
	# unsafe_allow_html=True)

	# Set the max number of words to display
	value_word = min(100, len(table2))

	st.markdown(
	f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_word} "
	f"</span>words contextually and semantically similar to "
	f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
	f"Click on the squares to expand and also the PubMed and Wikipedia links for more word information</span></p></b>",
	unsafe_allow_html=True)

	short_table = table2.head(value_word).round(2)
	short_table.index += 1
	short_table.index = (1 / short_table.index) * 10
	sizes = short_table.index.tolist()

	short_table.set_index('Word', inplace=True)
	table2["SIMILARITY"] = 'Similarity Score ' + table2.head(value_word)["SIMILARITY"].round(2).astype(str)
	rank_num = list(short_table.index.tolist())

	df = short_table
	try:
	df['text'] = short_table.index
	df['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
	'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in short_table.index]
	df['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in short_table.index]

	df.loc[:, 'database'] = database_name

	fig = px.treemap(df, path=[short_table.index], values=sizes, custom_data=['href', 'text', 'database', 'href2'],
	hover_name=(table2.head(value_word)['SIMILARITY']))

	fig.update(layout_coloraxis_showscale=False)
	fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
	fig.update_annotations(visible=False)
	fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
	hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
	texttemplate="<br><span "
	"style='font-family: Arial; font-size: 20px;'>%{customdata[1]}<br><br>"
	"<a href='%{customdata[0]}'>PubMed"
	"</a><br><br><a href='%{customdata[3]}'>Wikipedia"
	"</span></a>")
	fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightgreen"])

	# st.pyplot(fig2)
	st.plotly_chart(fig, use_container_width=True)

	# st.caption(
	# "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
	# st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")

	csv = table2.head(value_word).to_csv().encode('utf-8')
	st.download_button(label=f"download top {value_word} words (csv)", data=csv,
	file_name=f'{database_name}_words.csv', mime='text/csv')
	except:
	st.warning(
	f"This selection exceeds the number of similar words related to {query} within the {database_name} corpus, please choose a lower number")

	# st.markdown("---")
	# # st.write(short_table)
	# #
	#
	# # print()
	# # print("Human genes similar to " + str(query))
	# df1 = table.copy()
	# df2 = pd.read_csv('Human Genes.csv')
	# m = df1.Word.isin(df2.symbol)
	# df1 = df1[m]
	# df1.rename(columns={'Word': 'Human Gene'}, inplace=True)
	# df1["Human Gene"] = df1["Human Gene"].str.upper()
	# # print(df1.head(50))
	# # print()
	# # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
	# # time.sleep(2)
	# # Create the slider with increments of 5 up to 100
	#
	# # Set the maximum number of genes to display up to 100
	# value_gene = min(len(df1), 100)
	#
	# if value_gene > 0:
	# # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Treemap visualization of "
	# # f"<span style='color:red; font-style: italic;'>genes</span> contextually "
	# # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
	# # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
	# # unsafe_allow_html=True)
	#
	# st.markdown(
	# f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
	# f"</span>genes contextually and semantically similar to "
	# f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. "
	# f"Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
	# unsafe_allow_html=True)
	#
	# df10 = df1.head(value_gene).copy()
	# df10.index = (1 / df10.index) * 100000
	# sizes = df10.index.tolist()
	# df10.set_index('Human Gene', inplace=True)
	#
	# df3 = df1.copy()
	# df3["SIMILARITY"] = 'Similarity Score ' + df3.head(value_gene)["SIMILARITY"].round(2).astype(str)
	# df3.reset_index(inplace=True)
	# df3 = df3.rename(columns={'Human Gene': 'symbol2'})
	# # Use df.query to get a subset of df1 based on ids in df2
	# subset = df3.head(value_gene).query('symbol2 in @df2.symbol2')
	# # Use merge to join the two DataFrames on id
	# result = pd.merge(subset, df2, on='symbol2')
	# # Show the result
	# # print(result)
	# # label = df10.index.tolist()
	# # df2 = df10
	# # print(df2)
	# try:
	# # Define the `text` column for labels and `href` column for links
	# df10['text'] = df10.index
	# df10['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
	# '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df10['text']]
	# df10['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df10['text']]
	#
	# df10['name'] = [c for c in result['Approved name']]
	# assert isinstance(df10, object)
	# df10.loc[:, 'database'] = database_name
	#
	# # print(df['name'])
	#
	# # Create the treemap using `px.treemap`
	# fig = px.treemap(df10, path=[df10['text']], values=sizes,
	# custom_data=['href', 'name', 'database', 'href2', 'text'],
	# hover_name=(df3.head(value_gene)['SIMILARITY']))
	#
	# fig.update(layout_coloraxis_showscale=False)
	# fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
	# fig.update_annotations(visible=False)
	# fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
	# hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
	# texttemplate="<br><span style='font-family: Arial; font-size: 20px;'>%{customdata[4]}<br><br>"
	# "%{customdata[1]}<br><br>"
	# "<a href='%{customdata[0]}'>PubMed"
	# "</a><br><br><a href='%{customdata[3]}'>GeneCard"
	# "</span></a>")
	# fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["lightblue"])
	# # # display the treemap in Streamlit
	# # with treemap2:
	#
	# # st.pyplot(fig2)
	# st.plotly_chart(fig, use_container_width=True)
	#
	# st.caption(
	# "Gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
	# st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
	# st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
	#
	# csv = df1.head(value_gene).to_csv().encode('utf-8')
	# st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
	# file_name=f'{database_name}_genes.csv', mime='text/csv')
	#
	#
	# except:
	# st.warning(f"No similar genes related to {query} within the {database_name} corpus were found.")

	st.markdown("---")

	df1 = table.copy()
	df2 = pd.read_csv('Human Genes.csv')
	m = df1.Word.isin(df2.symbol)
	df1 = df1[m]
	df1.rename(columns={'Word': 'Genes'}, inplace=True)
	df_len = len(df1)
	print(len(df1))

	# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Populate a treemap to visualize "
	# f"<span style='color:red; font-style: italic;'>proteins</span> contextually "
	# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
	# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
	# unsafe_allow_html=True)

	# Set the number of proteins to display
	value_gene = min(df_len, 100)

	st.markdown(
	f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_gene} "
	f"</span>human genes contextually and semantically similar to "
	f"<span style='color:red; font-style: italic;'>{query} </span>within the <span style='color:red; font-style: italic;'>{database_name} </span>corpus. Click on the squares to expand and also the Pubmed and GeneCard links for more gene information</span></p></b>",
	unsafe_allow_html=True)

	df11 = df1.head(value_gene).copy()

	df11.index = (1 / df11.index) * 10000
	sizes = df11.index.tolist()

	df11.set_index('Genes', inplace=True)

	df4 = df1.copy()
	# print(df4.head(10))
	df4["SIMILARITY"] = 'Similarity Score ' + df4.head(value_gene)["SIMILARITY"].round(2).astype(str)
	df4.reset_index(inplace=True)
	# df4 = df4.rename(columns={'Protein': 'symbol2'})
	# print(df4)
	# # Use df.query to get a subset of df1 based on ids in df2
	# subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
	# # Use merge to join the two DataFrames on id
	# result = pd.merge(subset, df2b, on='symbol2')
	# print(result)
	if value_gene <= df_len:
	# Define the `text` column for labels and `href` column for links
	df11['text'] = df11.index
	df11['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
	'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df11['text']]
	df11['href2'] = [f'https://www.genecards.org/cgi-bin/carddisp.pl?gene=' + c for c in df11['text']]
	assert isinstance(df11, object)
	df11['database'] = database_name

	# df11['name'] = [c for c in result['Approved name']]

	# Create the treemap using `px.treemap`
	fig = px.treemap(df11, path=[df11['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
	hover_name=(df4.head(value_gene)['SIMILARITY']))

	fig.update(layout_coloraxis_showscale=False)
	fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
	fig.update_annotations(visible=False)
	fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
	hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
	texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
	"<a href='%{customdata[0]}'>PubMed"
	"</a><br><br><a href='%{customdata[2]}'>GeneCard"
	"</span></a>")
	fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightPink"])
	# # display the treemap in Streamlit
	# with treemap2:

	# st.pyplot(fig2)
	st.plotly_chart(fig, use_container_width=True)

	# st.caption(
	# "Gene designation and database provided by KEGG homo sapien gene list: https://rest.kegg.jp/list/hsa")
	# st.caption("Gene information provided by GeneCards: https://www.genecards.org//")
	st.caption("Human gene designation and database provided by HUGO Gene Nomenclature Committee (HGNC): https://www.genenames.org/")
	st.caption("Gene designation add in exceptions [p21, p53, her2, her3]")
	st.caption("Gene information provided by GeneCards: https://www.genecards.org//")

	csv = df1.head(value_gene).to_csv().encode('utf-8')
	st.download_button(label=f"download top {value_gene} genes (csv)", data=csv,
	file_name=f'{database_name}_genes.csv', mime='text/csv')


	else:
	st.warning(
	f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
	st.markdown("---")
	# print()
	# print("Human genes similar to " + str(query))
	df1 = table.copy()
	df2 = pd.read_csv('kegg_drug_list_lowercase.csv')
	m = df1.Word.isin(df2.drugs)
	df1 = df1[m]
	df1.rename(columns={'Word': 'Drugs'}, inplace=True)
	df_len = len(df1)
	# print(len(df1))
	# df1["Human Gene"] = df1["Human Gene"].str.upper()
	# print(df1.head(50))
	# print()
	# df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
	# time.sleep(2)
	# Create the slider with increments of 5 up to 100

	# Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
	value_drug = min(df1.shape[0], 100)

	# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
	# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
	# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
	# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
	# unsafe_allow_html=True)

	st.markdown(
	f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_drug} "
	f"</span>Drugs contextually and semantically similar to "
	f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
	unsafe_allow_html=True)

	df13 = df1.head(value_drug).copy()

	df13.index = (1 / df13.index) * 10000
	sizes = df13.index.tolist()

	df13.set_index('Drugs', inplace=True)

	df6 = df1.copy()
	# print(df4.head(10))
	df6["SIMILARITY"] = 'Similarity Score ' + df6.head(value_drug)["SIMILARITY"].round(2).astype(str)
	df6.reset_index(inplace=True)
	# df4 = df4.rename(columns={'Protein': 'symbol2'})
	# print(df4)
	# # Use df.query to get a subset of df1 based on ids in df2
	# subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
	# # Use merge to join the two DataFrames on id
	# result = pd.merge(subset, df2b, on='symbol2')
	# print(result)
	if value_drug <= df_len:
	# Define the `text` column for labels and `href` column for links
	# Reset the index
	df13.reset_index(inplace=True)

	# Replace hyphens with spaces in the 'text' column
	df13['Drugs'] = df13['Drugs'].str.replace('-', ' ')

	# Set the 'text' column back as the index
	df13.set_index('Drugs', inplace=True)
	df13['text'] = df13.index
	df13['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
	'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df13['text']]
	df13['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df13['text']]
	assert isinstance(df13, object)
	df13['database'] = database_name

	# df11['name'] = [c for c in result['Approved name']]

	# Create the treemap using `px.treemap`
	fig = px.treemap(df13, path=[df13['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
	hover_name=(df6.head(value_drug)['SIMILARITY']))

	fig.update(layout_coloraxis_showscale=False)
	fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
	fig.update_annotations(visible=False)
	fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
	hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
	texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
	"<a href='%{customdata[0]}'>PubMed"
	"</a><br><br><a href='%{customdata[2]}'>Wikipedia"
	"</span></a>")
	fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["Thistle"])
	# # display the treemap in Streamlit
	# with treemap2:

	# st.pyplot(fig2)
	st.plotly_chart(fig, use_container_width=True)

	st.caption(
	"Drug designation and database provided by KEGG: https://www.kegg.jp/kegg/drug/")

	csv = df1.head(value_drug).to_csv().encode('utf-8')
	st.download_button(label=f"download top {value_drug} drugs (csv)", data=csv,
	file_name=f'{database_name}_drugs.csv', mime='text/csv')


	else:
	st.warning(
	f"This selection exceeds the number of similar drugs related to {query} within the {database_name} corpus, please choose a lower number")
	st.markdown("---")
	#
	# st.markdown("---")
	# # print()
	# # print("Human genes similar to " + str(query))
	# df1 = table.copy()
	# df2 = pd.read_csv('diseasesKegg.csv')
	# m = df1.Word.isin(df2.disease)
	# df1 = df1[m]
	# df1.rename(columns={'Word': 'Disease'}, inplace=True)
	# df_len = len(df1)
	# # print(len(df1))
	# # df1["Human Gene"] = df1["Human Gene"].str.upper()
	# # print(df1.head(50))
	# # print()
	# # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
	# # time.sleep(2)
	# # Create the slider with increments of 5 up to 100
	#
	# # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
	# value_disease = min(df1.shape[0], 100)
	#
	# # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
	# # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
	# # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
	# # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
	# # unsafe_allow_html=True)
	#
	# st.markdown(
	# f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_disease} "
	# f"</span>Diseases contextually and semantically similar to "
	# f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
	# unsafe_allow_html=True)
	#
	# df14 = df1.head(value_disease).copy()
	#
	# df14.index = (1 / df14.index) * 10000
	# sizes = df14.index.tolist()
	#
	# df14.set_index('Disease', inplace=True)
	#
	# df7 = df1.copy()
	# # print(df4.head(10))
	# df7["SIMILARITY"] = 'Similarity Score ' + df7.head(value_disease)["SIMILARITY"].round(2).astype(str)
	# df7.reset_index(inplace=True)
	# # df4 = df4.rename(columns={'Protein': 'symbol2'})
	# # print(df4)
	# # # Use df.query to get a subset of df1 based on ids in df2
	# # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
	# # # Use merge to join the two DataFrames on id
	# # result = pd.merge(subset, df2b, on='symbol2')
	# # print(result)
	# if value_disease <= df_len:
	# # Define the `text` column for labels and `href` column for links
	# # Reset the index
	# df14.reset_index(inplace=True)
	#
	# # Replace hyphens with spaces in the 'text' column
	# df14['Disease'] = df14['Disease'].str.replace('-', ' ')
	#
	# # Set the 'text' column back as the index
	# df14.set_index('Disease', inplace=True)
	# df14['text'] = df14.index
	# df14['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
	# '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df14['text']]
	# df14['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df14['text']]
	# assert isinstance(df14, object)
	# df14['database'] = database_name
	#
	# # df11['name'] = [c for c in result['Approved name']]
	#
	# # Create the treemap using `px.treemap`
	# fig = px.treemap(df14, path=[df14['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
	# hover_name=(df7.head(value_disease)['SIMILARITY']))
	#
	# fig.update(layout_coloraxis_showscale=False)
	# fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
	# fig.update_annotations(visible=False)
	# fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
	# hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
	# texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
	# "<a href='%{customdata[0]}'>PubMed"
	# "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
	# "</span></a>")
	# fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["PaleGoldenRod"])
	# # # display the treemap in Streamlit
	# # with treemap2:
	#
	# # st.pyplot(fig2)
	# st.plotly_chart(fig, use_container_width=True)
	#
	# st.caption("Disease designation and database provided by KEGG: https://www.genome.jp/kegg/disease/")
	#
	# csv = df1.head(value_disease).to_csv().encode('utf-8')
	# st.download_button(label=f"download top {value_disease} diseases (csv)", data=csv,
	# file_name=f'{database_name}_disease.csv', mime='text/csv')
	#
	#
	# else:
	# st.warning(
	# f"This selection exceeds the number of similar diseases related to {query} within the {database_name} corpus, please choose a lower number")
	# st.markdown("---")

	# st.markdown("---")
	# # print()
	# # print("Human genes similar to " + str(query))
	# df1 = table.copy()
	# df2 = pd.read_csv('pathwaysKegg.csv')
	# m = df1.Word.isin(df2.pathway)
	# df1 = df1[m]
	# df1.rename(columns={'Word': 'Pathway'}, inplace=True)
	# df_len = len(df1)
	# # print(len(df1))
	# # df1["Human Gene"] = df1["Human Gene"].str.upper()
	# # print(df1.head(50))
	# # print()
	# # df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
	# # time.sleep(2)
	# # Create the slider with increments of 5 up to 100
	#
	# # Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
	# value_pathway = min(df1.shape[0], 100)
	#
	# # st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
	# # f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
	# # f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
	# # f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
	# # unsafe_allow_html=True)
	#
	# st.markdown(
	# f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_pathway} "
	# f"</span>Pathways contextually and semantically similar to "
	# f"<span style='color:red; font-style: italic;'>{query}:</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> database. Click on the squares to expand and the Pubmed and Wikipedia links for more compound information</span></p></b>",
	# unsafe_allow_html=True)
	#
	# df16 = df1.head(value_pathway).copy()
	#
	# df16.index = (1 / df16.index) * 10000
	# sizes = df16.index.tolist()
	#
	# df16.set_index('Pathway', inplace=True)
	#
	# df9 = df1.copy()
	# # print(df4.head(10))
	# df9["SIMILARITY"] = 'Similarity Score ' + df9.head(value_pathway)["SIMILARITY"].round(2).astype(str)
	# df9.reset_index(inplace=True)
	# # df4 = df4.rename(columns={'Protein': 'symbol2'})
	# # print(df4)
	# # # Use df.query to get a subset of df1 based on ids in df2
	# # subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
	# # # Use merge to join the two DataFrames on id
	# # result = pd.merge(subset, df2b, on='symbol2')
	# # print(result)
	# if value_pathway <= df_len:
	# # Define the `text` column for labels and `href` column for links
	# # Reset the index
	# df16.reset_index(inplace=True)
	#
	# # Replace hyphens with spaces in the 'text' column
	# df16['Pathway'] = df16['Pathway'].str.replace('-', ' ')
	#
	# # Set the 'text' column back as the index
	# df16.set_index('Pathway', inplace=True)
	# df16['text'] = df16.index
	# df16['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
	# '+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df16['text']]
	# df16['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df16['text']]
	# assert isinstance(df16, object)
	# df16['database'] = database_name
	#
	# # df11['name'] = [c for c in result['Approved name']]
	#
	# # Create the treemap using `px.treemap`
	# fig = px.treemap(df16, path=[df16['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
	# hover_name=(df9.head(value_pathway)['SIMILARITY']))
	#
	# fig.update(layout_coloraxis_showscale=False)
	# fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
	# fig.update_annotations(visible=False)
	# fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
	# hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
	# texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
	# "<a href='%{customdata[0]}'>PubMed"
	# "</a><br><br><a href='%{customdata[2]}'>Wikipedia"
	# "</span></a>")
	# fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["FloralWhite"])
	# # # display the treemap in Streamlit
	# # with treemap2:
	#
	# # st.pyplot(fig2)
	# st.plotly_chart(fig, use_container_width=True)
	#
	# st.caption("Pathway designation and database provided by KEGG: https://www.genome.jp/kegg/pathway.html")
	#
	# csv = df1.head(value_pathway).to_csv().encode('utf-8')
	# st.download_button(label=f"download top {value_pathway} pathways (csv)", data=csv,
	# file_name=f'{database_name}_pathways.csv', mime='text/csv')
	#
	#
	# else:
	# st.warning(
	# f"This selection exceeds the number of similar pathways related to {query} within the {database_name} corpus, please choose a lower number")
	# st.markdown("---")

	st.markdown("---")
	# print()
	# print("Human genes similar to " + str(query))
	df1 = table.copy()
	df2 = pd.read_csv('phytochemicals.csv')
	m = df1.Word.isin(df2.phyto)
	df1 = df1[m]
	df1.rename(columns={'Word': 'Phytochemical'}, inplace=True)
	df_len = len(df1)
	# print(len(df1))
	# df1["Human Gene"] = df1["Human Gene"].str.upper()
	# print(df1.head(50))
	# print()
	# df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
	# time.sleep(2)
	# Create the slider with increments of 5 up to 100

	# Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
	value_phyto = min(df1.shape[0], 100)

	# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
	# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
	# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
	# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
	# unsafe_allow_html=True)

	st.markdown(
	f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_phyto} "
	f"</span>Phytochemicals contextually and semantically similar to "
	f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
	f"Click on the squares to expand and also the Pubmed and Wikipedia links for more compound information</span></p></b>",
	unsafe_allow_html=True)

	df15 = df1.head(value_phyto).copy()

	df15.index = (1 / df15.index) * 10000
	sizes = df15.index.tolist()

	df15.set_index('Phytochemical', inplace=True)

	df8 = df1.copy()
	# print(df4.head(10))
	df8["SIMILARITY"] = 'Similarity Score ' + df8.head(value_phyto)["SIMILARITY"].round(2).astype(str)
	df8.reset_index(inplace=True)
	# df4 = df4.rename(columns={'Protein': 'symbol2'})
	# print(df4)
	# # Use df.query to get a subset of df1 based on ids in df2
	# subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
	# # Use merge to join the two DataFrames on id
	# result = pd.merge(subset, df2b, on='symbol2')
	# print(result)
	if value_phyto <= df_len:
	# Define the `text` column for labels and `href` column for links
	# Reset the index
	df15.reset_index(inplace=True)

	# Replace hyphens with spaces in the 'text' column
	df15['Phytochemical'] = df15['Phytochemical'].str.replace('-', ' ')

	# Set the 'text' column back as the index
	df15.set_index('Phytochemical', inplace=True)
	df15['text'] = df15.index
	df15['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
	'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df15['text']]
	df15['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df15['text']]
	assert isinstance(df15, object)
	df15['database'] = database_name

	# df11['name'] = [c for c in result['Approved name']]

	# Create the treemap using `px.treemap`
	fig = px.treemap(df15, path=[df15['text']], values=sizes, custom_data=['href', 'database', 'href2', 'text'],
	hover_name=(df8.head(value_phyto)['SIMILARITY']))

	fig.update(layout_coloraxis_showscale=False)
	fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
	fig.update_annotations(visible=False)
	fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
	hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
	texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
	"<a href='%{customdata[0]}'>PubMed"
	"</a><br><br><a href='%{customdata[2]}'>Wikipedia"
	"</span></a>")
	fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightSeaGreen"])
	# # display the treemap in Streamlit
	# with treemap2:

	# st.pyplot(fig2)
	st.plotly_chart(fig, use_container_width=True)

	st.caption("Phytochemical designation and database provided by PhytoHub: https://phytohub.eu/")

	csv = df1.head(value_phyto).to_csv().encode('utf-8')
	st.download_button(label=f"download top {value_phyto} phytochemicals (csv)", data=csv,
	file_name=f'{database_name}_phytochemicals.csv', mime='text/csv')


	else:
	st.warning(
	f"This selection exceeds the number of similar pythochemicals related to {query} within the {database_name} corpus, please choose a lower number")
	st.markdown("---")

	# print()
	# print("Human genes similar to " + str(query))
	df1 = table.copy()
	df2 = pd.read_csv('kegg_compounds_lowercase.csv')
	m = df1.Word.isin(df2.compound)
	df1 = df1[m]
	df1.rename(columns={'Word': 'Compounds'}, inplace=True)
	df_len = len(df1)
	# df1["Human Gene"] = df1["Human Gene"].str.upper()
	# print(df1.head(50))
	# print()
	# df1.head(50).to_csv("clotting_sim2.csv", index=True, header=False)
	# time.sleep(2)
	# Create the slider with increments of 5 up to 100

	# Remove the slider and set the value_compound to the minimum of the number of rows in the dataframe and 100
	value_compound = min(df1.shape[0], 100)

	# st.markdown(f"<b><p style='font-family: Arial; font-size: 20px;'>Visualize "
	# f"<span style='color:red; font-style: italic;'>KEGG compounds</span> contextually "
	# f"and semantically similar to <span style='color:red; font-style: italic;'>{query}</span> "
	# f"within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus.</p></b>",
	# unsafe_allow_html=True)

	st.markdown(
	f"<b><p style='font-family: Arial; font-size: 20px; font-style: Bold;'>Top <span style='color:red; font-style: italic;'>{value_compound} "
	f"</span>Compounds contextually and semantically similar to "
	f"<span style='color:red; font-style: italic;'>{query}</span> within the <span style='color:red; font-style: italic;'>{database_name}</span> corpus. "
	f"Click on the squares to expand and the Pubmed, Wikipedia, and KEGG links for more compound information (may take time to load)</span></p></b>",
	unsafe_allow_html=True)

	df12 = df1.head(value_compound).copy()

	df12.index = (1 / df12.index) * 10000
	sizes = df12.index.tolist()

	df12.set_index('Compounds', inplace=True)

	df5 = df1.copy()
	# print(df4.head(10))
	df5["SIMILARITY"] = 'Similarity Score ' + df5.head(value_compound)["SIMILARITY"].round(2).astype(str)
	df5.reset_index(inplace=True)
	# df4 = df4.rename(columns={'Protein': 'symbol2'})
	# print(df4)
	# # Use df.query to get a subset of df1 based on ids in df2
	# subset = df4.head(value_gene).query('symbol2 in @df2b.symbol2')
	# # Use merge to join the two DataFrames on id
	# result = pd.merge(subset, df2b, on='symbol2')
	# print(result)

	if value_compound <= df_len:
	# Define the `text` column for labels and `href` column for links
	# Reset the index
	df12.reset_index(inplace=True)

	# Replace hyphens with spaces in the 'text' column
	df12['Compounds'] = df12['Compounds'].str.replace('-', ' ')

	# Set the 'text' column back as the index
	df12.set_index('Compounds', inplace=True)
	df12['text'] = df12.index
	df12['href'] = [f'https://pubmed.ncbi.nlm.nih.gov/?term={database_name}%5Bmh%5D+NOT+review%5Bpt%5D' \
	'+AND+english%5Bla%5D+AND+hasabstract+AND+1990:2022%5Bdp%5D+AND+' + c for c in df12['text']]
	df12['href2'] = [f'https://en.wikipedia.org/wiki/' + c for c in df12['text']]
	df12['href3'] = [f'https://www.genome.jp/entry/{compound_id}' for compound_id in get_compound_ids(df12['text'])]
	assert isinstance(df12, object)
	df12['database'] = database_name

	# df11['name'] = [c for c in result['Approved name']]

	# Create the treemap using `px.treemap`
	fig = px.treemap(df12, path=[df12['text']], values=sizes,
	custom_data=['href', 'database', 'href2', 'text', 'href3'],
	hover_name=(df5.head(value_compound)['SIMILARITY']))

	fig.update(layout_coloraxis_showscale=False)
	fig.update_layout(autosize=True, paper_bgcolor="#CCFFFF", margin=dict(t=0, b=0, l=0, r=0))
	fig.update_annotations(visible=False)
	fig.update_traces(marker=dict(cornerradius=5), root_color="#CCFFFF", hovertemplate=None,
	hoverlabel_bgcolor="lightblue", hoverlabel_bordercolor="#000000",
	texttemplate="<span style='font-family: Arial; font-size: 20px;'>%{customdata[3]}<br><br>"
	"<a href='%{customdata[0]}'>PubMed"
	"</a><br><br><a href='%{customdata[2]}'>Wikipedia"
	"</a><br><br><a href='%{customdata[4]}'>KEGG Compound Page"
	"</span></a>")

	fig.update_layout(uniformtext=dict(minsize=15), treemapcolorway=["LightYellow"])
	# # display the treemap in Streamlit
	# with treemap2:

	# st.pyplot(fig2)
	st.plotly_chart(fig, use_container_width=True)

	st.caption("Compound designation and database provided by KEGG: https://www.kegg.jp/kegg/compound/")

	csv = df1.head(value_compound).to_csv().encode('utf-8')
	st.download_button(label=f"download top {value_compound} compounds (csv)", data=csv,
	file_name=f'{database_name}_compounds.csv', mime='text/csv')


	else:
	st.warning(
	f"This selection exceeds the number of similar proteins related to {query} within the {database_name} corpus, please choose a lower number")
	st.markdown("---")


	def save_comment(comment):
	with open('comments.txt', 'a') as f:
	f.write(f'{comment}\n')


	def save_comment_threaded(comment):
	t = threading.Thread(target=save_comment, args=(comment,))
	t.start()


	st.title("Abstractalytics Web App")
	st.write("We appreciate your feedback!")

	user_comment = st.text_area("Please send us your anonymous remarks/suggestions about the Abstractalytics Web App: "
	"(app will pause while we save your comments)")

	if st.button("Submit"):
	if user_comment:
	save_comment_threaded(user_comment)
	st.success("Your comment has been saved. Thank you for your feedback!")
	else:
	st.warning("Please enter a comment before submitting.")

	st.markdown("---")

	st.subheader("Cancer-related videos")
	if query:
	idlist = []
	search_keyword = {query}
	html = urllib.request.urlopen("https://www.youtube.com/@NCIgov/search?query=cancer")
	html2 = urllib.request.urlopen("https://www.youtube.com/@CancerCenter/search?query=cancer")
	html3 = urllib.request.urlopen("https://www.youtube.com/@NorthwesternMedicine/search?query=cancer")
	html4 = urllib.request.urlopen("https://www.youtube.com/@TEDEd/search?query=cancer")
	html5 = urllib.request.urlopen("https://www.youtube.com/@CancerResearchUK/search?query=cancer")
	video_ids = re.findall(r"watch\?v=(\S{11})", html.read().decode())
	video_ids2 = re.findall(r"watch\?v=(\S{11})", html2.read().decode())
	video_ids3 = re.findall(r"watch\?v=(\S{11})", html3.read().decode())
	video_ids4 = re.findall(r"watch\?v=(\S{11})", html4.read().decode())
	video_ids5 = re.findall(r"watch\?v=(\S{11})", html5.read().decode())

	for i in video_ids2:
	video_ids.append(i)
	for i in video_ids3:
	video_ids.append(i)
	for i in video_ids4:
	video_ids.append(i)
	for i in video_ids5:
	video_ids.append(i)

	random.shuffle(video_ids)

	c1, c2, c3 = st.columns(3)

	with c1:
	st.video("https://www.youtube.com/watch?v=" + video_ids[0])
	with c2:
	st.video("https://www.youtube.com/watch?v=" + video_ids[1])
	with c3:
	st.video("https://www.youtube.com/watch?v=" + video_ids[2])
	st.markdown("---")

	# else:
	# st.error("The password you entered is incorrect.")