diff --git a/010-Mustjaab/Analysis-of-wait-times.py b/010-Mustjaab/Analysis-of-wait-times.py new file mode 100644 index 0000000..a9f2a2e --- /dev/null +++ b/010-Mustjaab/Analysis-of-wait-times.py @@ -0,0 +1,313 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "pandas==2.2.3", +# "plotly==5.24.1", +# "marimo", +# ] +# /// +import marimo + +__generated_with = "0.9.7-dev1" +app = marimo.App(width="medium") + + +@app.cell +def __(): + import marimo as mo + import pandas as pd + import plotly.express as px + return mo, pd, px + + +@app.cell +def __(mo): + mo.md(r"""

# Analysis of Wait Times for Priority Procedures

""").style( + {"background-color": "crimson"} + ) + return + + +@app.cell +def __(mo): + mo.md( + r""" + ## Sections + + """ + ) + return + + +@app.cell +def __(pd): + Wait_Times = pd.read_csv("Wait_Times_Data.csv") + return (Wait_Times,) + + +@app.cell +def __(mo): + mo.md( + r""" + ## General Overview +

This notebook compares the 50th and 90th percentiles of wait days for various critical procedures in hospitals across Canada. The data were obtained from the Canadian Institute for Health Information (CIHI). Percentiles help us understand where a value in a dataset stands relative to the others: is it on the lower end of the set, on the higher end, or right in the middle? You can learn more about percentiles here.
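
As a quick illustration (a minimal sketch with made-up wait times, not the CIHI data used below), the 50th and 90th percentiles can be computed with pandas:

```python
import pandas as pd

# Hypothetical wait times (in days) for a single procedure -- illustrative only
waits = pd.Series([12, 18, 25, 31, 40, 55, 62, 75, 90, 120])

p50 = waits.quantile(0.50)  # roughly half of the waits are at or below this value
p90 = waits.quantile(0.90)  # roughly 90% of the waits are at or below this value

print(f"50th percentile: {p50} days, 90th percentile: {p90} days")
```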

+ """ + ) + return + + +@app.cell +def __(mo): + _df = mo.sql( + f""" + SELECT * + FROM "Wait_Times" + WHERE Indicator_result !='n/a' + LIMIT 50 + """ + ) + return + + +@app.cell +def __(mo): + mo.md(r"""

## 50th Percentiles

""") + return + + +@app.cell +def __(Bladder_Cancer_df, CABG_df, Lung_Cancer_df, mo): + Operation_Options = { + "Bladder Cancer Surgery": Bladder_Cancer_df, + "CABG": CABG_df, + "Lung Cancer Surgery": Lung_Cancer_df, + } + + Operation_Choice = mo.ui.dropdown( + options=["Bladder Cancer Surgery", "CABG", "Lung Cancer Surgery"], + value="Bladder Cancer Surgery", + ) + return Operation_Choice, Operation_Options + + +@app.cell +def __(Operation_Choice, Operation_Options): + Operation_Bar = Operation_Options[Operation_Choice.value] + return (Operation_Bar,) + + +@app.cell +def __(mo): + Bladder_Cancer_df = mo.sql( + f""" + SELECT Province_territory,Indicator,Metric,Data_year,Unit_of_measurement,Indicator_result + FROM "Wait_Times" + WHERE Province_territory !='Canada' + AND Indicator = 'Bladder Cancer Surgery' + AND Metric = '50th Percentile' + AND Data_year IN ('2013', '2023') + And Unit_of_measurement = 'Days' + AND Indicator_result !='n/a' + ORDER BY Indicator_result ASC + """, output=False + ) + return (Bladder_Cancer_df,) + + +@app.cell +def __(mo): + CABG_df = mo.sql( + f""" + SELECT Province_territory,Indicator,Metric,Data_year,Unit_of_measurement,Indicator_result + FROM "Wait_Times" + WHERE Province_territory !='Canada' + AND Indicator = 'CABG' + AND Metric = '50th Percentile' + AND Data_year IN ('2013', '2023') + And Unit_of_measurement = 'Days' + AND Indicator_result !='n/a' + ORDER BY Indicator_result ASC + """, output=False + ) + return (CABG_df,) + + +@app.cell +def __(mo): + Lung_Cancer_df = mo.sql( + f""" + SELECT Province_territory,Indicator,Metric,Data_year,Unit_of_measurement,Indicator_result + FROM "Wait_Times" + WHERE Province_territory !='Canada' + AND Indicator = 'Lung Cancer Surgery' + AND Metric = '50th Percentile' + AND Data_year IN ('2013', '2023') + And Unit_of_measurement = 'Days' + AND Indicator_result !='n/a' + ORDER BY Indicator_result ASC + """, output=False + ) + return (Lung_Cancer_df,) + + +@app.cell +def __(Operation_Choice): + Operation_Choice + return + + +@app.cell +def __(Operation_Bar, px): + Bladder_Bar = px.bar(Operation_Bar,x='Province_territory',y='Indicator_result', color='Data_year', barmode='group') + Bladder_Bar + return (Bladder_Bar,) + + +@app.cell +def __(mo): + mo.md(r""" Bar chart comparing 50th percentiles of wait times between 2013 and 2023 across provinces.""").style({'background-color':'brown','color':'white'}) + return + + +@app.cell +def __(mo): + mo.md(r"""## Comparing 90th Percentiles""") + return + + +@app.cell +def __(mo): + Bladder_Cancer_90th_Percentile_2013_df = mo.sql( + f""" + SELECT Province_territory,Indicator,Metric,Data_year,Unit_of_measurement,Indicator_result + FROM "Wait_Times" + WHERE Province_territory !='Canada' + AND Indicator = 'Bladder Cancer Surgery' + AND Metric = '90th Percentile' + AND Data_year = '2013' + And Unit_of_measurement = 'Days' + AND Indicator_result !='n/a' + ORDER BY Indicator_result ASC + """, output=False + ) + return (Bladder_Cancer_90th_Percentile_2013_df,) + + +@app.cell +def __(mo): + Bladder_Cancer_90th_Percentile_2023_df = mo.sql( + f""" + SELECT Province_territory,Indicator,Metric,Data_year,Unit_of_measurement,Indicator_result + FROM "Wait_Times" + WHERE Province_territory != 'Canada' + AND Indicator = 'Bladder Cancer Surgery' + AND Metric = '90th Percentile' + AND Data_year IN ('2023') + And Unit_of_measurement = 'Days' + AND Indicator_result !='n/a' + ORDER BY Indicator_result ASC + """, output=False + ) + return (Bladder_Cancer_90th_Percentile_2023_df,) + + +@app.cell +def __(mo): + 
Lung_Cancer_90th_Percentile_2013_df = mo.sql( + f""" + SELECT Province_territory,Indicator,Metric,Data_year,Unit_of_measurement,Indicator_result + FROM "Wait_Times" + WHERE Province_territory != 'Canada' + AND Indicator = 'Lung Cancer Surgery' + AND Metric = '90th Percentile' + AND Data_year IN ('2013') + And Unit_of_measurement = 'Days' + AND Indicator_result !='n/a' + ORDER BY Indicator_result ASC + """, output=False + ) + return (Lung_Cancer_90th_Percentile_2013_df,) + + +@app.cell +def __(mo): + Lung_Cancer_90th_Percentile_2023_df = mo.sql( + f""" + SELECT Province_territory,Indicator,Metric,Data_year,Unit_of_measurement,Indicator_result + FROM "Wait_Times" + WHERE Province_territory != 'Canada' + AND Indicator = 'Bladder Cancer Surgery' + AND Metric = '90th Percentile' + AND Data_year IN ('2023') + And Unit_of_measurement = 'Days' + AND Indicator_result !='n/a' + ORDER BY Indicator_result ASC + """, output=False + ) + return (Lung_Cancer_90th_Percentile_2023_df,) + + +@app.cell +def __( + Bladder_Cancer_90th_Percentile_2013_df, + Bladder_Cancer_90th_Percentile_2023_df, + Lung_Cancer_90th_Percentile_2013_df, + Lung_Cancer_90th_Percentile_2023_df, + px, +): + Bladder_Cancer_2013 = px.pie(Bladder_Cancer_90th_Percentile_2013_df,values='Indicator_result', names='Province_territory') + + Bladder_Cancer_2023 = px.pie(Bladder_Cancer_90th_Percentile_2023_df,values='Indicator_result', names='Province_territory') + + Lung_Cancer_2013 = px.pie(Lung_Cancer_90th_Percentile_2013_df,values='Indicator_result', names='Province_territory') + + Lung_Cancer_2023 = px.pie(Lung_Cancer_90th_Percentile_2023_df,values='Indicator_result', names='Province_territory') + return ( + Bladder_Cancer_2013, + Bladder_Cancer_2023, + Lung_Cancer_2013, + Lung_Cancer_2023, + ) + + +@app.cell +def __(mo): + Ten_Year_Journey = mo.ui.slider(2013,2023,10,value=2023, label = 'Ten Year Slider:') + Ten_Year_Journey + return (Ten_Year_Journey,) + + +@app.cell +def __(Ten_Year_Journey): + Time_Parameter = Ten_Year_Journey.value + return (Time_Parameter,) + + +@app.cell +def __(Bladder_Cancer_2013, Bladder_Cancer_2023): + def Percentile_Time_Machine(Time_Parameter): + if Time_Parameter == 2013: + return Bladder_Cancer_2013 + if Time_Parameter == 2023: + return Bladder_Cancer_2023 + return (Percentile_Time_Machine,) + + +@app.cell +def __(Percentile_Time_Machine, Time_Parameter): + Percentile_Time_Machine(Time_Parameter) + return + + +@app.cell +def __(mo): + mo.md(r""" Pie charts of 90th percentile values for each province over a span of 10 years.""").style({'background-color':'indigo'}) + return + + +if __name__ == "__main__": + app.run() diff --git a/010-Mustjaab/Article_Summarizer.py b/010-Mustjaab/Article_Summarizer.py new file mode 100644 index 0000000..f586805 --- /dev/null +++ b/010-Mustjaab/Article_Summarizer.py @@ -0,0 +1,180 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "marimo", +# "beautifulsoup4==4.12.3", +# "nltk==3.9.1", +# "requests==2.32.3", +# ] +# /// + +import marimo + +__generated_with = "0.9.10" +app = marimo.App() + + +@app.cell +def __(mo): + mo.md(rf"

# Summarize that Article!

") + return + + +@app.cell +def __(): + import marimo as mo + import nltk + import requests + from bs4 import BeautifulSoup + from nltk.sentiment import SentimentIntensityAnalyzer + from nltk.tokenize import word_tokenize, sent_tokenize + from nltk.probability import FreqDist + from nltk.corpus import stopwords + from string import punctuation + import heapq + return ( + BeautifulSoup, + FreqDist, + SentimentIntensityAnalyzer, + heapq, + mo, + nltk, + punctuation, + requests, + sent_tokenize, + stopwords, + word_tokenize, + ) + + +@app.cell +def __(mo): + Article = mo.ui.text(label='Article:', + value="https://www.cbc.ca/news/politics/hackers-threat-national-security-1.6949645").form() + + Points = mo.ui.number(5,10,label='Number of Bullet Points:') + return Article, Points + + +@app.cell +def __(Article, Points, mo): + mo.hstack([ + Article, + Points + ]) + return + + +@app.cell +def __(Article, mo): + mo.stop(Article.value is None, mo.md("Submit an article continue")) + return + + +@app.cell +def __(Article, BeautifulSoup, requests): + Article_url = Article.value + + def CBC_article_reader(url): + response = requests.get(url) + soup = BeautifulSoup(response.content, 'html.parser') + + # Find the HTML elements containing the news article content + article_content = soup.find('div', class_='story') + + # Extract text from the article content + text = article_content.get_text(separator=' ') + return text + return Article_url, CBC_article_reader + + +@app.cell +def __(Article_url, CBC_article_reader): + article_text = CBC_article_reader(Article_url) + return (article_text,) + + +@app.cell +def __( + FreqDist, + Points, + SentimentIntensityAnalyzer, + article_text, + heapq, + punctuation, + sent_tokenize, + stopwords, + word_tokenize, +): + tokens = word_tokenize(article_text.lower()) + + + stop_words = set(stopwords.words('english') + list(punctuation)) + filtered_tokens = [token for token in tokens if token not in stop_words] + + # Step 3: Calculate word frequencies + word_freq = FreqDist(filtered_tokens) + + + tfidf = {} + for word, freq in word_freq.items(): + tfidf[word] = freq * (len(tokens) / word_freq[word]) + + + sia = SentimentIntensityAnalyzer() + sentiment_score = sia.polarity_scores(article_text)['compound'] + + + summary = [] + sentences = sent_tokenize(article_text) + + + sentence_scores = {} + for sentence in sentences: + words = word_tokenize(sentence.lower()) + score = sentiment_score * sum(tfidf[word] for word in words if word in tfidf) + sentence_scores[sentence] = score + + num_sentences_in_summary = Points.value + summary_sentences = heapq.nlargest(num_sentences_in_summary, sentence_scores, key=sentence_scores.get) + + + bulleted_summary = ['- ' + sentence for sentence in summary_sentences] + final_summary = '\n'.join(bulleted_summary) + return ( + bulleted_summary, + filtered_tokens, + final_summary, + freq, + num_sentences_in_summary, + score, + sentence, + sentence_scores, + sentences, + sentiment_score, + sia, + stop_words, + summary, + summary_sentences, + tfidf, + tokens, + word, + word_freq, + words, + ) + + +@app.cell +def __(mo): + mo.md("""

## Final Summary

""") + return + + +@app.cell +def __(final_summary, mo): + mo.md(rf"{final_summary}") + return + + +if __name__ == "__main__": + app.run() diff --git a/010-Mustjaab/C__Users_mustj_Fact_Checking_Model.py b/010-Mustjaab/C__Users_mustj_Fact_Checking_Model.py new file mode 100644 index 0000000..3e8cf88 --- /dev/null +++ b/010-Mustjaab/C__Users_mustj_Fact_Checking_Model.py @@ -0,0 +1,426 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "scikit-learn==1.5.2", +# "pandas==2.2.3", +# "altair==5.4.1", +# "nltk==3.9.1", +# "marimo", +# "plotly==5.24.1", +# "numpy==1.26.4", +# "gensim==4.3.3", +# ] +# /// +import marimo + +__generated_with = "0.9.10" +app = marimo.App(width="medium") + + +@app.cell +def __(): + import marimo as mo + import pandas as pd + import numpy as np + from gensim.models import Word2Vec + from nltk.tokenize import word_tokenize + import nltk + from sklearn.model_selection import train_test_split + from sklearn.svm import SVC + from sklearn.metrics import accuracy_score, classification_report + import plotly.express as px + from sklearn.manifold import TSNE + import altair as alt + return ( + SVC, + TSNE, + Word2Vec, + accuracy_score, + alt, + classification_report, + mo, + nltk, + np, + pd, + px, + train_test_split, + word_tokenize, + ) + + +@app.cell +def __(mo): + mo.md( + r""" + # Resource Fact Checking Model + + ## Table of Contents + + - General Overview + - Model Choice + - Using the Model + - Performance Analysis + """ + ) + return + + +@app.cell +def __(mo): + mo.md( + r""" + ## General Overview + +

The purpose of this notebook is to build a fact-checking model. We can start simple and then add features to strengthen its scalability, reliability, validity, and, most relevantly, its accuracy.

+ """ + ) + return + + +@app.cell +def __(): + Statement_about_Resources = [ + ("Solar energy is a renewable resource.", True), + ("Coal is a renewable resource.", False), + ("Wind power can be depleted.", False), + ("Nuclear energy is considered a non-renewable resource.", True), + ("Hydroelectric power is a form of renewable energy.", True), + ("Natural gas is a clean, renewable resource.", False), + ("Biomass energy comes from renewable organic materials.", True), + ("Geothermal energy is inexhaustible.", True), + ("Tidal energy is a type of renewable energy.", True), + ("Fossil fuels are formed from renewable sources.", False), + ("Wind turbines generate electricity without consuming fuel.", True), + ("Oil reserves will replenish themselves within a human lifetime.", False), + ("Solar panels work efficiently at night.", False), + ("Uranium used in nuclear power plants is a renewable resource.", False), + ("Wave energy harnesses the power of ocean surface motion.", True), + ("Burning coal releases no greenhouse gases.", False), + ("Hydropower relies on the water cycle, which is naturally replenished.", True), + ("Geothermal energy taps into Earth's internal heat.", True), + ("Wind energy production causes significant air pollution.", False), + ("Biomass fuels are carbon-neutral.", True), + ("Solar energy can be harnessed in cloudy weather.", True), + ("Tidal power is predictable and consistent.", True), + ("Nuclear fusion is currently a widely used energy source.", False), + ("Offshore wind farms produce more energy than onshore ones.", True), + ("Fossil fuels will never run out.", False), + ("Photovoltaic cells convert sunlight directly into electricity.", True), + ("Hydroelectric dams have no environmental impact.", False), + ("Geothermal energy is only available in volcanic regions.", False), + ("Wind turbines kill more birds than any other human activity.", False), + ("Biomass energy always reduces greenhouse gas emissions.", False), + ("Solar panels require more energy to produce than they generate in their lifetime.", False), + ("Tidal barrages can affect local ecosystems.", True), + ("Nuclear power plants produce no radioactive waste.", False), + ("Wind energy is only viable in constantly windy areas.", False), + ("Oil shale is a renewable energy source.", False), + ("Concentrated solar power can store energy for nighttime use.", True), + ("Large hydroelectric dams can cause methane emissions.", True), + ("Geothermal power plants can trigger earthquakes.", True), + ("Wind turbines have a lifespan of over 20 years.", True), + ("Biomass fuels compete with food production for land use.", True), + ("Solar energy is not viable in cold climates.", False), + ("Tidal energy generation is widely used globally.", False), + ("Nuclear fission produces no carbon dioxide during operation.", True), + ("Wind energy is more expensive than fossil fuels.", False), + ("Natural gas is the cleanest burning fossil fuel.", True), + ("Solar farms require no water for operation.", True), + ("Pumped-storage hydroelectricity is a form of energy storage.", True), + ("Geothermal energy is available 24/7.", True), + ("Wind turbines cannot operate in very high winds.", True), + ("All biomass sources are environmentally friendly.", False), + ("Thin-film solar cells are less efficient than traditional silicon cells.", True), + ("Tidal energy can be harvested using underwater turbines.", True), + ("Nuclear power plants can be powered down quickly in emergencies.", False), + ("Vertical axis wind turbines are more 
efficient than horizontal axis turbines.", False), + ("Fracking for natural gas is a completely safe process.", False), + ("Passive solar design can reduce heating and cooling costs in buildings.", True), + ("Run-of-river hydroelectricity always requires a large dam.", False), + ("Enhanced geothermal systems can make geothermal energy viable in more locations.", True), + ("Wind energy cannot be stored.", False), + ("Algae-based biofuels are currently widely used in transportation.", False), + ("Perovskite solar cells are a promising new technology.", True), + ("Ocean thermal energy conversion (OTEC) works best in tropical regions.", True), + ("Thorium reactors are widely used in nuclear power generation.", False), + ("Airborne wind energy systems can harness high-altitude winds.", True), + ("Shale gas is a form of renewable energy.", False), + ("Community solar gardens allow multiple users to share solar power.", True), + ("Micro-hydropower systems can power individual homes.", True), + ("Geothermal heat pumps can be used for both heating and cooling.", True), + ("Wind power cannot provide baseload power.", False), + ("Cellulosic ethanol is made from non-food plant materials.", True), + ("Solar thermal collectors can be used for water heating.", True), + ("Tidal fences are less environmentally impactful than tidal barrages.", True), + ("Breeder reactors can produce more fissile material than they consume.", True), + ("Kite power systems are a form of wind energy.", True), + ("Tar sands oil extraction is environmentally friendly.", False), + ("Building-integrated photovoltaics can replace conventional building materials.", True), + ("Small-scale hydropower always disrupts river ecosystems.", False), + ("Hot dry rock geothermal systems require water injection.", True), + ("Wind turbines cannot be recycled at the end of their life.", False), + ("Biogas can be produced from animal waste.", True), + ("Solar roads can generate electricity from streets and parking lots.", True), + ("Wave energy converters can affect marine ecosystems.", True), + ("Pebble bed reactors are a type of nuclear fission reactor.", True), + ("Bladeless wind turbines produce no noise pollution.", True), + ("Coal seam gas is a renewable resource.", False), + ("Floatovoltaics are solar panels designed to float on water.", True), + ("All hydroelectric power requires damming rivers.", False), + ("Geothermal energy can be used directly for heating.", True), + ("Wind energy production causes significant noise pollution in nearby communities.", False), + ("Pyrolysis of biomass produces biochar, which can improve soil quality.", True), + ("Solar updraft towers use greenhouse effect and chimney effect.", True), + ("Tidal streams and ocean currents are the same thing.", False), + ("Molten salt reactors are a type of nuclear fission reactor.", True), + ("Vortex bladeless is a new type of wind energy technology.", True), + ("Lignite is a clean-burning type of coal.", False), + ("Agrivoltaics combines agriculture with solar energy production.", True), + ("Pumped-storage hydroelectricity facilities can only be built in mountainous areas.", False), + ("Ground source heat pumps can provide heating and cooling in all climates.", True), + ("Wind turbines kill more bats than birds.", True), + ("Biodiesel can be produced from used cooking oil.", True), + ("Transparent solar cells can be used in windows.", True), + ("Dynamic tidal power doesn't require a barrage or lagoon.", True), + ("Fast breeder reactors have been widely adopted 
globally.", False), + ("Airborne wind energy systems are commercially available.", False), + ("Oil drilling in the Arctic has no environmental risks.", False), + ("Solar thermal energy can be used for industrial processes.", True), + ("Run-of-river hydroelectricity has less environmental impact than large dams.", True), + ("Magma geothermal energy systems tap into underground magma chambers.", True), + ("Offshore wind turbines are less efficient than onshore turbines.", False), + ("Waste-to-energy plants can reduce landfill use.", True), + ("Luminescent solar concentrators can be used in building windows.", True), + ("Salinity gradient power harnesses energy from where rivers meet the sea.", True), + ("Small modular reactors are currently widely used in nuclear power generation.", False), + ("High-altitude wind power can provide more consistent energy than ground-level wind.", True), + ("Hydraulic fracturing only uses water and sand.", False), + ("Solar water heating systems can work in cold climates.", True), + ("Tidal lagoons have less environmental impact than tidal barrages.", True), + ("Deep geothermal systems can access heat at depths of several kilometers.", True), + ("Wind turbines can increase local temperatures.", True), + ("Biofuels always have a lower carbon footprint than fossil fuels.", False), + ("Photovoltaic noise barriers can generate electricity along highways.", True), + ("Marine current power is the same as tidal stream power.", False), + ("Traveling wave reactors can use depleted uranium as fuel.", True), + ("Kite wind generators can reach higher altitudes than traditional wind turbines.", True), + ("Clean coal technology eliminates all pollutants from coal burning.", False), + ("Solar canals combine water conservation with energy generation.", True), + ("Mini-hydro systems always require construction of a dam.", False), + ("Geothermal energy can be used for greenhouse heating in agriculture.", True), + ("Wind turbines cannot be placed close to urban areas.", False), + ("Torrefied biomass has properties similar to coal.", True), + ("Solar chimneys can generate electricity in arid regions.", True), + ("Osmotic power generates electricity from the difference in salt concentration between seawater and river water.", True), + ("Fusion power plants are currently in commercial operation.", False), + ("Makani power kites are a commercially successful form of wind energy.", False), + ("Deep water oil drilling is risk-free.", False), + ("Solar fabric can generate electricity from clothing.", True), + ("In-stream hydro turbines always obstruct fish migration.", False), + ("Engineered geothermal systems can make geothermal power viable in non-volcanic regions.", True), + ("Wind turbines cannot operate in extreme cold.", False), + ("Plasma gasification is a clean way to process municipal solid waste.", True), + ("Photovoltaic glass can generate electricity while remaining transparent.", True), + ("Tidal kite technology can generate power from low-velocity currents.", True), + ("Sodium-cooled fast reactors are the most common type of nuclear reactor.", False), + ("Crosswind kite power systems can generate more energy than traditional wind turbines.", True), + ("Natural gas extraction never contaminates groundwater.", False), + ("Solar balloons can generate electricity at high altitudes.", True), + ("Micro-hydro systems are suitable for most streams and rivers.", True), + ("Geothermal power plants always cause land subsidence.", False), + ("Wind turbines can be harmful to human 
health.", False), + ("Jatropha is a promising non-food crop for biodiesel production.", True), + ("Solar power satellites can beam energy to Earth from space.", True), + ("Vortex-induced vibrations can be used to generate electricity from slow water currents.", True), + ("Liquid fluoride thorium reactors are widely used in nuclear power generation.", False), + ("High-altitude wind kites are currently a major source of wind power.", False), + ("Offshore oil rigs have no impact on marine ecosystems.", False), + ("Building-integrated wind turbines can be incorporated into skyscrapers.", True), + ("All hydroelectric dams cause significant methane emissions.", False), + ("Shallow geothermal systems can be used for both heating and cooling buildings.", True), + ("Wind turbines significantly reduce property values in nearby areas.", False), + ("Microalgae can be used to produce biofuels without competing with food crops.", True), + ("Solar roadways are currently widely implemented.", False), + ("Underwater compressed air energy storage can be used with offshore wind farms.", True), + ("Nuclear fusion reactors produce long-lived radioactive waste.", False), + ("Vertical sky farms can combine wind energy with agriculture.", True), + ("Mountaintop removal mining is an environmentally friendly way to extract coal.", False), + ("Piezoelectric materials can generate electricity from pedestrian footsteps.", True), + ("All small hydropower projects are environmentally benign.", False), + ("Hot sedimentary aquifer power is a type of geothermal energy.", True), + ("Wind turbines cause significant electromagnetic interference.", False), + ("Biofuels derived from algae require less land than crop-based biofuels.", True), + ("Solar greenhouses can generate electricity while growing plants.", True), + ("Dynamic tidal power systems have been successfully implemented on a large scale.", False), + ("Generation IV nuclear reactors are currently in wide commercial use.", False), + ("Windbelts can generate electricity from wind without using turbines.", True), + ("Hydraulic fracturing never causes induced seismicity.", False), + ("Solar trees can provide both shade and electricity in urban areas.", True), + ("Fish-friendly turbines completely eliminate fish mortality in hydroelectric systems.", False), + ("Geothermal energy extraction always leads to the depletion of geothermal reservoirs.", False), + ("Wind turbine syndrome is a medically recognized condition.", False), + ("Sweet sorghum is a potential feedstock for ethanol production.", True), + ("Space-based solar power is currently a significant source of energy on Earth.", False), + ("Tidal fences can generate electricity without creating reservoirs.", True), + ("Accelerator-driven subcritical reactors are commonly used for power generation.", False), + ("Jet stream wind power is currently harnessed for electricity production.", False), + ("Deep sea oil drilling is completely safe for marine environments.", False), + ("Solar windows can generate electricity without significantly reducing transparency.", True), + ("All run-of-river hydroelectric systems are free from environmental impacts.", False), + ("Ground-source heat pumps require deep drilling in all cases.", False), + ("Wind turbines cause significant decrease in bird populations.", False), + ("Biofuels always produce lower greenhouse gas emissions than fossil fuels.", False), + ("Spray-on solar cells are widely used in commercial solar panels.", False), + ("Archimedes wave swing is a type of wave energy 
converter.", True), + ("Tokamak fusion reactors are currently used for commercial power generation.", False), + ("Kite-powered ships are widely used in commercial shipping.", False) + ] + + Resource_Statements = [statement for statement, _ in Statement_about_Resources] + Verification = [label for _, label in Statement_about_Resources] + return Resource_Statements, Statement_about_Resources, Verification + + +@app.cell +def __(mo): + mo.callout("Be sure to use punkt through (nltk.download('punkt')",kind='warn') + return + + +@app.cell +def __(Resource_Statements, Word2Vec, word_tokenize): + tokenized_statements = [word_tokenize(statement.lower()) for statement in Resource_Statements] + + word2vec_model = Word2Vec(sentences=tokenized_statements, vector_size=100, window=5, min_count=1, workers=4) + return tokenized_statements, word2vec_model + + +@app.cell +def __(np, word2vec_model, word_tokenize): + def document_vector(doc): + words = word_tokenize(doc.lower()) + word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv] + return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(100) + return (document_vector,) + + +@app.cell +def __(Resource_Statements, document_vector, np): + Resource_Statements_vectors = np.array([document_vector(statement) for statement in Resource_Statements]) + return (Resource_Statements_vectors,) + + +@app.cell +def __(Resource_Statements_vectors, Verification, train_test_split): + Resource_Statements_train, Resource_Statements_test, Verification_train, Verification_test = train_test_split(Resource_Statements_vectors, Verification, test_size=0.2, random_state=42) + return ( + Resource_Statements_test, + Resource_Statements_train, + Verification_test, + Verification_train, + ) + + +@app.cell +def __(mo): + mo.md( + r""" + ## Model Choice + +

There are many different classification models that could be used for fact-checking, so why a support vector machine (SVM)? A few reasons:

+ + - SVMs are highly effective at distinguishing between different categories (in this case, verifying whether a statement is true or false) while maintaining efficiency. + + - Since we’re working with a smaller dataset, SVMs are a great choice because they perform well with limited data without sacrificing accuracy. + + - The mathematical foundation of SVMs, particularly how they create clear boundaries between categories, makes it less likely for the model to misclassify whether a statement (e.g., about natural resources) is true or false. + """ + ) + return + + +@app.cell +def __(Resource_Statements_train, SVC, Verification_train): + model = SVC(kernel='rbf', probability=True) + model.fit(Resource_Statements_train, Verification_train) + return (model,) + + +@app.cell +def __(mo): + mo.md(r"""## Using the Model""") + return + + +@app.cell +def __(mo): + Statement = mo.ui.text(placeholder='claim', label = 'Claim about resource:').form() + Statement + return (Statement,) + + +@app.cell +def __(Statement, fact_check): + fact_check(Statement.value) + return + + +@app.cell +def __(mo): + mo.md(r"""## Performance Analysis""") + return + + +@app.cell +def __(Verification_pred, Verification_test, accuracy_score, pd): + Accuracy_Table = pd.DataFrame({ + 'Metric': ['Accuracy Score'], + 'Value':[accuracy_score(Verification_test, Verification_pred)] + }) + Accuracy_Table + return (Accuracy_Table,) + + +@app.cell +def __( + Resource_Statements_test, + Verification_test, + classification_report, + model, +): + Verification_pred = model.predict(Resource_Statements_test) + classification_report(Verification_test, Verification_pred) + return (Verification_pred,) + + +@app.cell +def __(document_vector, model): + def fact_check(statement): + vectorized_statement = document_vector(statement).reshape(1, -1) + prediction = model.predict(vectorized_statement) + probability = model.predict_proba(vectorized_statement)[0] + + if prediction[0]: + return f"The statement is likely true (confidence: {probability[1]:.2f})" + else: + return f"The statement is likely false (confidence: {probability[0]:.2f})" + return (fact_check,) + + +@app.cell +def __(Resource_Statements_vectors, Verification, model, np, pd, px): + Probabilities = model.predict_proba(Resource_Statements_vectors) + + # Create a DataFrame for plotting + Confidence_Data = pd.DataFrame({ + 'confidence': np.max(Probabilities, axis=1), + 'true_label': Verification + }) + + Model_Probability_Histogram = px.histogram(Confidence_Data,x='confidence', title = 'Histogram of Model Confidence') + Model_Probability_Histogram + return Confidence_Data, Model_Probability_Histogram, Probabilities + + +if __name__ == "__main__": + app.run() diff --git a/010-Mustjaab/Canada_Risk_Management.py b/010-Mustjaab/Canada_Risk_Management.py new file mode 100644 index 0000000..8a3671f --- /dev/null +++ b/010-Mustjaab/Canada_Risk_Management.py @@ -0,0 +1,449 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "marimo", +# "numpy==1.26.4", +# "plotly==5.24.1", +# "pandas==2.2.3", +# "duckdb==1.1.2", +# "statsmodels==0.14.4", +# ] +# /// + +import marimo + +__generated_with = "0.9.10" +app = marimo.App() + + +@app.cell +def __(mo): + mo.md("""

# Risk Management Analytics

""") + return + + +@app.cell +def __(): + import marimo as mo + import pandas as pd + import plotly.express as px + import duckdb as db + from statsmodels.stats.proportion import proportions_ztest + import numpy as np + return db, mo, np, pd, proportions_ztest, px + + +@app.cell +def __(pd): + Risk_Arrangement = pd.read_csv('assets/Risk_Arrangement.csv') + return (Risk_Arrangement,) + + +@app.cell +def __(): + Risk_query_2019 = """ + SELECT REF_DATE, Risk_management_arrangements, NAICS, VALUE + FROM Risk_Arrangement + WHERE Risk_management_arrangements IN ( + 'A Business Continuity Plan (BCP)', + 'Frequent updating of operating systems', + 'No risk management arrangements' + ) + AND REF_DATE='2019'; + """ + + Risk_query_2021 = """ + SELECT REF_DATE, Risk_management_arrangements, NAICS, VALUE + FROM Risk_Arrangement + WHERE Risk_management_arrangements IN ( + 'A Business Continuity Plan (BCP)', + 'Frequent updating of operating systems', + 'No risk management arrangements' + ) + AND REF_DATE='2021'; + """ + return Risk_query_2019, Risk_query_2021 + + +@app.cell +def __(Risk_query_2019, Risk_query_2021, db): + Canada_Risk_Landscape_2019 = db.execute(Risk_query_2019).df() + Canada_Risk_Landscape_2021 = db.execute(Risk_query_2021).df() + return Canada_Risk_Landscape_2019, Canada_Risk_Landscape_2021 + + +@app.cell +def __(np, proportions_ztest): + #Risk arrangements + Agricultural_BCP=proportions_ztest( + count=np.array([2.2,1.8]), + nobs=np.array([100,100]), + alternative='smaller' + ) + + Schools_BCP=proportions_ztest( + count=np.array([6.5,12.7]), + nobs=np.array([100,100]), + alternative='smaller' + ) + + Hospitals_BCP=proportions_ztest( + count=np.array([20,18.8]), + nobs=np.array([100,100]), + alternative='smaller' + ) + + Agricultural_OS=proportions_ztest( + count=np.array([13.4,13.0]), + nobs=np.array([100,100]), + alternative='smaller' + ) + + Schools_OS=proportions_ztest( + count=np.array([45.2,39.2]), + nobs=np.array([100,100]), + alternative='smaller' + ) + + Hospitals_OS=proportions_ztest( + count=np.array([43.6,38.1]), + nobs=np.array([100,100]), + alternative='smaller' + ) + + Agricultural_No_RM=proportions_ztest( + count=np.array([42.3,37.2]), + nobs=np.array([100,100]), + alternative='smaller' + ) + + Schools_No_RM=proportions_ztest( + count=np.array([8.9,10.1]), + nobs=np.array([100,100]), + alternative='larger' + ) + + Hospitals_No_RM=proportions_ztest( + count=np.array([15.9,8.1]), + nobs=np.array([100,100]), + alternative='smaller' + ) + return ( + Agricultural_BCP, + Agricultural_No_RM, + Agricultural_OS, + Hospitals_BCP, + Hospitals_No_RM, + Hospitals_OS, + Schools_BCP, + Schools_No_RM, + Schools_OS, + ) + + +@app.cell +def __(np, proportions_ztest): + #Incidents + Agricultural_No_Impact = proportions_ztest( + count = np.array([83.3,92.2]), + nobs = np.array([100,100]), + alternative='larger' + ) + + Agricultural_Stolen_Money = proportions_ztest( + count = np.array([5.9,2.7]), + nobs = np.array([100,100]), + alternative='smaller' + ) + + Agricultural_Stolen_Personal = proportions_ztest( + count = np.array([5.5,2.7]), + nobs = np.array([100,100]), + alternative='smaller' + ) + + Hospitals_No_Impact = proportions_ztest( + count = np.array([63.3,89.1]), + nobs = np.array([100,100]), + alternative='larger' + ) + + Hospitals_Stolen_Money = proportions_ztest( + count = np.array([27.5,4.0]), + nobs = np.array([100,100]), + alternative='smaller' + ) + + Hospitals_Stolen_Personal = proportions_ztest( + count = np.array([16.9,4.1]), + nobs = np.array([100,100]), + 
alternative='smaller' + ) + + Schools_No_Impact = proportions_ztest( + count = np.array([71.2,78.2]), + nobs = np.array([100,100]), + alternative='larger' + ) + + Schools_Stolen_Money = proportions_ztest( + count = np.array([13.3,8.8]), + nobs = np.array([100,100]), + alternative='smaller' + ) + + Schools_Stolen_Personal = proportions_ztest( + count = np.array([5.9,10.8]), + nobs = np.array([100,100]), + alternative='larger' + ) + return ( + Agricultural_No_Impact, + Agricultural_Stolen_Money, + Agricultural_Stolen_Personal, + Hospitals_No_Impact, + Hospitals_Stolen_Money, + Hospitals_Stolen_Personal, + Schools_No_Impact, + Schools_Stolen_Money, + Schools_Stolen_Personal, + ) + + +@app.cell +def __( + Agricultural_BCP, + Agricultural_No_RM, + Agricultural_OS, + Hospitals_BCP, + Hospitals_No_RM, + Hospitals_OS, + Schools_BCP, + Schools_No_RM, + Schools_OS, + pd, +): + Risk_Proportions_Table = pd.DataFrame( + { + 'Risk Arrangement':[ + 'Business Continuity Plan', + 'Frequent Updating of operating systems', + 'No risk management plan in place' + ], + 'Hospitals':[Hospitals_BCP[1], + Hospitals_OS[1], + Hospitals_No_RM[1] + ], + 'Schools':[Schools_BCP[1], + Schools_OS[1], + Schools_No_RM[1] + ], + 'Agricultural':[Agricultural_BCP[1], + Agricultural_OS[1], + Agricultural_No_RM[1] + ] + } + ) + return (Risk_Proportions_Table,) + + +@app.cell +def __( + Agricultural_No_Impact, + Agricultural_Stolen_Money, + Agricultural_Stolen_Personal, + Hospitals_No_Impact, + Hospitals_Stolen_Money, + Hospitals_Stolen_Personal, + Schools_No_Impact, + Schools_Stolen_Money, + Schools_Stolen_Personal, + pd, +): + Incident_Proportions_Table = pd.DataFrame( + { + 'Incident':[ + 'No Impact on business', + 'Stolen money or demand ransom', + 'Stolen personal or financial information', + ], + 'Hospitals':[Hospitals_No_Impact[1], + Hospitals_Stolen_Money[1], + Hospitals_Stolen_Personal[1] + ], + 'Schools':[Schools_No_Impact[1], + Schools_Stolen_Money[1], + Schools_Stolen_Personal[1] + ], + 'Agricultural':[Agricultural_No_Impact[1], + Agricultural_Stolen_Money[1], + Agricultural_Stolen_Personal[1] + ] + } + ) + return (Incident_Proportions_Table,) + + +@app.cell +def __(mo): + Ref_Date = mo.ui.slider(2019,2021,2) + mo.md(rf"Year: {Ref_Date}").style({'border-width':'4px','border-color':'gray'}) + return (Ref_Date,) + + +@app.cell +def __(Ref_Date): + Year = Ref_Date.value + return (Year,) + + +@app.cell +def __( + Canada_Incident_Landscape_2019, + Canada_Incident_Landscape_2021, + Canada_Risk_Landscape_2019, + Canada_Risk_Landscape_2021, + px, +): + Grouped_Risks_2021 = px.histogram( + Canada_Risk_Landscape_2021, + x='Risk_management_arrangements', + y='VALUE', + color='NAICS', + barmode='group' + ) + + Grouped_Risks_2019 = px.histogram( + Canada_Risk_Landscape_2019, + x='Risk_management_arrangements', + y='VALUE', + color='NAICS', + barmode='group' + ) + + Grouped_Incidents_2019 = px.histogram( + Canada_Incident_Landscape_2019, + x='Cyber_security_incidents', + y='VALUE', + color='NAICS', + barmode='group' + ) + + Grouped_Incidents_2021 = px.histogram( + Canada_Incident_Landscape_2021, + x='Cyber_security_incidents', + y='VALUE', + color='NAICS', + barmode='group' + ) + return ( + Grouped_Incidents_2019, + Grouped_Incidents_2021, + Grouped_Risks_2019, + Grouped_Risks_2021, + ) + + +@app.cell +def __( + Grouped_Incidents_2019, + Grouped_Incidents_2021, + Grouped_Risks_2019, + Grouped_Risks_2021, +): + def Grouped_Risks(Year): + if Year == 2019: + return Grouped_Risks_2019 + else: + return Grouped_Risks_2021 + + def 
Grouped_Incidents(Year): + if Year == 2019: + return Grouped_Incidents_2019 + else: + return Grouped_Incidents_2021 + return Grouped_Incidents, Grouped_Risks + + +@app.cell +def __(Grouped_Incidents, Grouped_Risks, Year): + Risks = Grouped_Risks(Year) + Incidents = Grouped_Incidents(Year) + return Incidents, Risks + + +@app.cell +def __(Incidents, Risks, mo): + mo.md(f""" + {mo.hstack([Risks,Incidents])} + """ + ).center() + return + + +@app.cell +def __(Incident_Proportions_Table, Risk_Proportions_Table, mo): + Risk_Proportions_Explorer = mo.ui.data_explorer(Risk_Proportions_Table) + Incident_Proportions_Explorer = mo.ui.data_explorer(Incident_Proportions_Table) + return Incident_Proportions_Explorer, Risk_Proportions_Explorer + + +@app.cell +def __(Incident_Proportions_Explorer, Risk_Proportions_Explorer, mo): + mo.md( + f""" + + {mo.hstack([Risk_Proportions_Explorer,Incident_Proportions_Explorer])} + + """ + ).center() + return + + +@app.cell +def __(mo): + mo.md(" Exploratory panel of pvalues from proportionality tests: left - risk arrangements, and right incidents.").style({'font-size':'16px'}) + return + + +@app.cell +def __(pd): + Cyber_Incidents = pd.read_csv('assets\Incidents.csv') + return (Cyber_Incidents,) + + +@app.cell +def __(): + Incident_query_2019 = """ + SELECT REF_DATE,Cyber_security_incidents, NAICS, VALUE + FROM Cyber_Incidents + WHERE NAICS IN ( + 'Agriculture, forestry, fishing and hunting', + 'Hospitals', + 'Elementary and secondary schools' + ) + AND REF_DATE='2019'; + """ + + Incident_query_2021 = """ + SELECT REF_DATE,Cyber_security_incidents, NAICS, VALUE + FROM Cyber_Incidents + WHERE NAICS IN ( + 'Agriculture, forestry, fishing and hunting', + 'Hospitals', + 'Elementary and secondary schools' + ) + AND REF_DATE='2021'; + """ + return Incident_query_2019, Incident_query_2021 + + +@app.cell +def __(Incident_query_2019, Incident_query_2021, db): + Canada_Incident_Landscape_2019 = db.execute(Incident_query_2019).df() + Canada_Incident_Landscape_2021 = db.execute(Incident_query_2021).df() + return Canada_Incident_Landscape_2019, Canada_Incident_Landscape_2021 + + +if __name__ == "__main__": + app.run() diff --git a/010-Mustjaab/Environmental_Protection_Analytics.py b/010-Mustjaab/Environmental_Protection_Analytics.py new file mode 100644 index 0000000..e3e8114 --- /dev/null +++ b/010-Mustjaab/Environmental_Protection_Analytics.py @@ -0,0 +1,175 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "marimo", +# "duckdb==1.1.2", +# "pandas==2.2.3", +# "plotly==5.24.1", +# "scipy==1.14.1", +# "stats-can==2.9.4", +# "statsmodels==0.14.4", +# ] +# /// + +import marimo + +__generated_with = "0.9.10" +app = marimo.App() + + +@app.cell +def __(mo): + mo.md("""

# Oil and Gas Expenditure Analytics

""") + return + + +@app.cell +def __(): + import marimo as mo + import pandas as pd + import plotly.express as px + import stats_can as sc + import duckdb + from scipy.stats import pearsonr + return duckdb, mo, pd, pearsonr, px, sc + + +@app.cell +def __(sc): + # Bring in data of interest from Statistics Canada (copy/paste the 10-digit ID next to 'Table:') + DF = sc.table_to_df('25-10-0064-01') + return (DF,) + + +@app.cell +def __(DF, mo, pd): + #Prepare table so it can be queried using SQL by replacing any columns that have spaces with underscores + Energy = DF.rename(columns={ + 'Capital expenditures and operating expenses':'Capital_expenditures_and_operating_expenses'}) + Energy_Table = pd.DataFrame(Energy) + Energy_Data = mo.ui.table(Energy_Table) + return Energy, Energy_Data, Energy_Table + + +@app.cell +def __(Energy_Table, duckdb): + #Pull out the values for total capital from extraction and oil sands expenditures + Total_Expenditures = duckdb.sql("SELECT Capital_expenditures_and_operating_expenses,VALUE AS Total_Capital FROM Energy_Table WHERE Capital_expenditures_and_operating_expenses = 'Total capital expenditures'").df() + return (Total_Expenditures,) + + +@app.cell +def __(Energy_Table, duckdb): + #Pull out the values for expenditures on extraction + Extraction_Expenditures = duckdb.sql("SELECT Capital_expenditures_and_operating_expenses, VALUE AS Extraction_Expenditures FROM Energy_Table WHERE Capital_expenditures_and_operating_expenses = 'Oil and gas extraction expenditures'").df() + return (Extraction_Expenditures,) + + +@app.cell +def __(Energy_Table, duckdb): + #Pull out the values for expenditures on the oil sands + Sands_Expenditures = duckdb.sql("SELECT Capital_expenditures_and_operating_expenses, VALUE AS Sands_Expenditures FROM Energy_Table WHERE Capital_expenditures_and_operating_expenses = 'Oil sands expenditures'").df() + return (Sands_Expenditures,) + + +@app.cell +def __(Sands_Expenditures, Total_Expenditures, pd): + #Create a dataframe for the total capital and oil sand expenditures + C_and_S = { + "Total Capital":Total_Expenditures['Total_Capital'], + "Expenditures":Sands_Expenditures['Sands_Expenditures'] + } + Capital_and_Sands = pd.DataFrame(C_and_S) + return C_and_S, Capital_and_Sands + + +@app.cell +def __(Extraction_Expenditures, Total_Expenditures, pd): + #Also create a dataframe for total capital and extraction expenditures + C_and_E = { + "Total Capital":Total_Expenditures['Total_Capital'], + "Expenditures":Extraction_Expenditures['Extraction_Expenditures'] + } + Capital_and_Extraction = pd.DataFrame(C_and_E) + return C_and_E, Capital_and_Extraction + + +@app.cell +def __(Capital_and_Extraction, Capital_and_Sands, mo): + #Prepare options for the drop down so either of the two dataframes can be selected + Expenditure_Options = { + 'Capital and Extraction':Capital_and_Extraction, + 'Capital and Sands': Capital_and_Sands + } + + Expenditure_Choices = mo.ui.dropdown( + options=[ + 'Capital and Extraction', + 'Capital and Sands' + ], value='Capital and Extraction' + ) + Expenditure_Choices + mo.md( + rf"""This is a summary of **{Expenditure_Choices}** + """ + ) + return Expenditure_Choices, Expenditure_Options + + +@app.cell +def __(Expenditure_Choices, Expenditure_Options): + Expenditure_Visualization = Expenditure_Options[Expenditure_Choices.value] + return (Expenditure_Visualization,) + + +@app.cell +def __(Expenditure_Visualization, pearsonr): + Pearson_Test = pearsonr(Expenditure_Visualization['Total 
Capital'],Expenditure_Visualization['Expenditures']) + return (Pearson_Test,) + + +@app.cell +def __(Expenditure_Visualization, Pearson_Test, mo, pd): + #A skeleton statistics summary table that returns exploratory data analytics for whatever option is chosen from the drowndown list. + Summary_Table = { + "Variable": [ + 'Mean Total Capital', + 'Mean Expenditure', + 'Median Total Capital', + 'Median Expenditure', + 'Total Capital Skewness', + 'Expenditure Skewness', + "Correlation pvalue" + ], + "Value": [ + Expenditure_Visualization['Total Capital'].mean(), + Expenditure_Visualization['Expenditures'].mean(), + Expenditure_Visualization['Total Capital'].median(), + Expenditure_Visualization['Expenditures'].median(), + Expenditure_Visualization['Total Capital'].skew(), + Expenditure_Visualization['Expenditures'].skew(), + Pearson_Test.pvalue + ] + + } + Summary_Statistics = pd.DataFrame(Summary_Table) + mo.ui.table(Summary_Statistics) + return Summary_Statistics, Summary_Table + + +@app.cell +def __(Expenditure_Visualization, px): + #Displays the scatter plot for whatever option is chosen + px.scatter(Expenditure_Visualization,y='Total Capital', x ='Expenditures',trendline='ols') + return + + +@app.cell +def __(Expenditure_Visualization, px): + #Displays the boxplot for whatever option is chosen + px.box(Expenditure_Visualization) + return + + +if __name__ == "__main__": + app.run() diff --git a/010-Mustjaab/Exploring_Perplexity.py b/010-Mustjaab/Exploring_Perplexity.py new file mode 100644 index 0000000..a89ad36 --- /dev/null +++ b/010-Mustjaab/Exploring_Perplexity.py @@ -0,0 +1,161 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "marimo", +# "pandas==2.2.3", +# "nltk==3.9.1", +# "textstat==0.7.4", +# ] +# /// +import marimo + +__generated_with = "0.9.1" +app = marimo.App(width="medium") + + +@app.cell +def __(): + import marimo as mo + import pandas as pd + import collections + import math + import nltk + import textstat + return collections, math, mo, nltk, pd, textstat + + +@app.cell(disabled=True) +def __(nltk): + nltk.download('averaged_perceptron_tagger') + return + + +@app.cell +def __(mo): + mo.md(r"""#Exploring Perplexity""") + return + + +@app.cell +def __(mo): + Story_Generator = mo.ui.chat( + mo.ai.llm.openai("gpt-4o"), + prompts=[ + "Write a psychological thriller short story", + "Write a horror short story", + "Write a comedic short story", + ], + show_configuration_controls=True + ) + Story_Generator + return (Story_Generator,) + + +@app.cell +def __(mo): + mo.callout("Cutomize the response you would like through modifying paramaters in the configuration", kind ='info') + return + + +@app.cell +def __(Story_Generator, pd): + Chat_Log = pd.DataFrame(Story_Generator.value) + return (Chat_Log,) + + +@app.cell +def __(Chat_Log, mo): + Story_from_Model_df = mo.sql( + f""" + SELECT * + From Chat_Log + """, output=False + ) + return (Story_from_Model_df,) + + +@app.cell +def __(Story_from_Model_df, collections): + def preprocess(text): + return text.lower().split() + + tokens = preprocess(Story_from_Model_df['content'][1]) + + def build_ngrams(tokens, n): + ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)] + return collections.Counter(ngrams) + + unigrams = build_ngrams(tokens, 1) + bigrams = build_ngrams(tokens, 2) + + def calc_prob(ngram_count, n_minus_1_gram_count): + probabilities = {} + for ngram in ngram_count: + context = ngram[:-1] + probabilities[ngram] = ngram_count[ngram] / n_minus_1_gram_count[context] + return probabilities + return 
bigrams, build_ngrams, calc_prob, preprocess, tokens, unigrams + + +@app.cell +def __(bigrams, calc_prob, unigrams): + probabilities = calc_prob(bigrams, unigrams) + return (probabilities,) + + +@app.cell +def __(math): + def perplexity(probabilities, tokens, n): + N = len(tokens) + log_prob_sum = 0 + for i in range(n-1, N): + ngram = tuple(tokens[i-n+1:i+1]) + prob = probabilities.get(ngram, 1e-10) # Use a small value if probability is zero + log_prob_sum += math.log(prob) + return math.exp(-log_prob_sum / N) + return (perplexity,) + + +@app.cell +def __(mo, perplexity, probabilities, tokens): + perplexity_value = perplexity(probabilities, tokens, 2) + + mo.md(rf"Perplexity: {perplexity_value}") + return (perplexity_value,) + + +@app.cell +def __(nltk, textstat): + def calculate_fluency(text): + tokens = nltk.word_tokenize(text) + tagged = nltk.pos_tag(tokens) + + pos_counts = { + 'nouns': sum(1 for word, pos in tagged if pos.startswith('NN')), + 'verbs': sum(1 for word, pos in tagged if pos.startswith('VB')), + 'adjectives': sum(1 for word, pos in tagged if pos.startswith('JJ')), + 'adverbs': sum(1 for word, pos in tagged if pos.startswith('RB')) + } + + readability_score = textstat.flesch_reading_ease(text) + + fluency_score = (readability_score + sum(pos_counts.values())) / 2 + + return { + "fluency_score": fluency_score, + "readability": readability_score, + "pos_counts": pos_counts + } + return (calculate_fluency,) + + +@app.cell +def __(Story_from_Model_df, calculate_fluency, pd): + fluency_results = calculate_fluency(Story_from_Model_df['content'][1]) + Fluency = pd.DataFrame(fluency_results) + Fluency + return Fluency, fluency_results + + +if __name__ == "__main__": + app.run() diff --git a/010-Mustjaab/Job_Market_Word_Clouds.py b/010-Mustjaab/Job_Market_Word_Clouds.py new file mode 100644 index 0000000..dbfc6d9 --- /dev/null +++ b/010-Mustjaab/Job_Market_Word_Clouds.py @@ -0,0 +1,123 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "nltk==3.9.1", +# "marimo", +# "pandas==2.2.3", +# "matplotlib==3.9.2", +# "wordcloud==1.9.3", +# ] +# /// + +import marimo + +__generated_with = "0.9.10" +app = marimo.App() + + +@app.cell +def __(mo): + mo.md("""

# Word Clouds of Different Markets

""") + return + + +@app.cell +def __(): + import marimo as mo + from wordcloud import WordCloud + import nltk + from nltk.corpus import stopwords + import matplotlib.pyplot as plt + import pandas as pd + return WordCloud, mo, nltk, pd, plt, stopwords + + +@app.cell +def __(pd): + #Currently a local csv file with pretend data on skills reflecting experiences for different stages in regulatory affairs + Markets = pd.read_csv("Skills_for_Markets.csv") + return (Markets,) + + +@app.cell +def __(stopwords): + #Allows the word clouds to built using English vocabularoy + Stop = stopwords.words("english") + return (Stop,) + + +@app.cell +def __(Markets, Stop, WordCloud): + #Prepare word clouds so they can be linked with the slider options + Entry_Skills = Markets['Entry Level'].values + Entry_WC = WordCloud(stopwords = Stop).generate(str(Entry_Skills)) + + Middle_Skills = Markets['Middle Level'].values + Middle_WC = WordCloud(stopwords = Stop).generate(str(Middle_Skills)) + + Senior_Skills = Markets['Senior Level'].values + Senior_WC = WordCloud(stopwords = Stop).generate(str(Senior_Skills)) + return ( + Entry_Skills, + Entry_WC, + Middle_Skills, + Middle_WC, + Senior_Skills, + Senior_WC, + ) + + +@app.cell +def __(mo): + Levels = mo.ui.slider(1,3) + return (Levels,) + + +@app.cell +def __(Levels): + Skill_Level = Levels.value + return (Skill_Level,) + + +@app.cell +def __(Levels, mo): + mo.md(rf"Market: {Levels}") + return + + +@app.cell +def __(Skill_Level): + #"Translates" the numerical slider option into what experience level the word cloud is showing + def Market_Level(Skill_Level): + if Skill_Level == 1: + return ('Entry Level Regulatory Affairs') + if Skill_Level == 2: + return ('Middle Level Regulatory Affairs') + if Skill_Level == 3: + return ('Senior Level Regulatory Affairs') + Experience = Market_Level(Skill_Level) + return Experience, Market_Level + + +@app.cell +def __(Experience, mo): + mo.md(rf"
{Experience}
") + return + + +@app.cell +def __(Entry_WC, Middle_WC, Senior_WC, Skill_Level, plt): + def Market_Word_Cloud(Skill_Level): + if Skill_Level == 1: + return plt.imshow(Entry_WC) + if Skill_Level == 2: + return plt.imshow(Middle_WC) + if Skill_Level == 3: + return plt.imshow(Senior_WC) + + Market_Word_Cloud(Skill_Level) + return (Market_Word_Cloud,) + + +if __name__ == "__main__": + app.run() diff --git a/010-Mustjaab/Monitoring Flow of GHG Emissions.py b/010-Mustjaab/Monitoring Flow of GHG Emissions.py new file mode 100644 index 0000000..2733d0b --- /dev/null +++ b/010-Mustjaab/Monitoring Flow of GHG Emissions.py @@ -0,0 +1,529 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "statsmodels==0.14.4", +# "pandas==2.2.3", +# "scipy==1.14.1", +# "marimo", +# "altair==5.4.1", +# ] +# /// +import marimo + +__generated_with = "0.9.7-dev1" +app = marimo.App() + + +@app.cell +def __(mo): + mo.md("

# Monitoring Flow of GHG Emissions


").style({'background-color':'green'}) + return + + +@app.cell +def __(): + import marimo as mo + import pandas as pd + from scipy.stats import f_oneway + from statsmodels.tsa.stattools import adfuller + return adfuller, f_oneway, mo, pd + + +@app.cell +async def __(): + import micropip + await micropip.install("altair") + import altair as alt + return alt, micropip + + +@app.cell +def __( + Canada_Pesticide_Stack, + Canada_Pharmaceutical_Stack, + Canada_Transportation_Stack, + mo, +): + mo.md( + rf"""" + {mo.hstack([Canada_Pharmaceutical_Stack,Canada_Pesticide_Stack,Canada_Transportation_Stack])} + """ + ).center() + return + + +@app.cell +def __( + Ontario_Pesticide_Stack, + Ontario_Pharmaceutical_Stack, + Ontario_Transportation_Stack, + mo, +): + mo.md( + rf"""" + {mo.hstack([Ontario_Pharmaceutical_Stack,Ontario_Pesticide_Stack,Ontario_Transportation_Stack])} + """ + ).center() + return + + +@app.cell +def __(pd): + Start_Date = '2009' + End_Date = '2022' + + Date_Range = pd.date_range(start=Start_Date, end=End_Date, freq='Y') + return Date_Range, End_Date, Start_Date + + +@app.cell +def __(Date_Range, pd): + Canada_Pharmaceutical = pd.DataFrame( + { + 'REF_DATE': Date_Range, + 'VALUE':[ + 250, + 247, + 221, + 235, + 190, + 258, + 263, + 426, + 297, + 277, + 274, + 271, + 264 + ] + } + ) + + Canada_Pesticide = pd.DataFrame( + { + 'REF_DATE':Date_Range, + 'VALUE':[ + 5539, + 5869, + 6168, + 6168, + 6758, + 6342, + 6722, + 6987, + 6216, + 5881, + 6115, + 6050, + 6071 + ] + } + ) + + Canada_Transportation = pd.DataFrame( + { + 'REF_DATE':Date_Range, + 'VALUE':[ + 1357, + 1381, + 1310, + 1295, + 1199, + 933, + 1276, + 1388, + 1635, + 1617, + 1440, + 1188, + 1228 + ] + } + ) + + Ontario_Pharmaceutical = pd.DataFrame( + { + 'REF_DATE':Date_Range, + 'VALUE':[ + 154, + 153, + 137, + 151, + 121, + 176, + 163, + 326, + 176, + 166, + 177, + 184, + 185 + ] + } + ) + + Ontario_Pesticide = pd.DataFrame( + { + 'REF_DATE':Date_Range, + 'VALUE':[ + 754, + 575, + 671, + 768, + 1120, + 1140, + 950, + 1179, + 1112, + 387, + 399, + 387, + 389 + ] + } + ) + + Ontario_Transportation = pd.DataFrame( + { + 'REF_DATE':Date_Range, + 'VALUE':[ + 526, + 516, + 445, + 461, + 519, + 410, + 636, + 618, + 769, + 712, + 642, + 488, + 485, + ] + } + ) + return ( + Canada_Pesticide, + Canada_Pharmaceutical, + Canada_Transportation, + Ontario_Pesticide, + Ontario_Pharmaceutical, + Ontario_Transportation, + ) + + +@app.cell +def __( + Canada_Pesticide, + Canada_Pharmaceutical, + Canada_Transportation, + Ontario_Pesticide, + Ontario_Pharmaceutical, + Ontario_Transportation, + adfuller, +): + Canada_ADF_Pharmaceutical = adfuller(Canada_Pharmaceutical['VALUE']) + Canada_ADF_Pesticide = adfuller(Canada_Pesticide['VALUE']) + Canada_ADF_Transportation = adfuller(Canada_Transportation['VALUE']) + + Ontario_ADF_Pharmaceutical = adfuller(Ontario_Pharmaceutical['VALUE']) + Ontario_ADF_Pesticide = adfuller(Ontario_Pesticide['VALUE']) + Ontario_ADF_Transportation = adfuller(Ontario_Transportation['VALUE']) + return ( + Canada_ADF_Pesticide, + Canada_ADF_Pharmaceutical, + Canada_ADF_Transportation, + Ontario_ADF_Pesticide, + Ontario_ADF_Pharmaceutical, + Ontario_ADF_Transportation, + ) + + +@app.cell +def __( + Canada_ADF_Pesticide, + Canada_ADF_Pharmaceutical, + Canada_ADF_Transportation, + Canada_Pesticide, + Canada_Pharmaceutical, + Canada_Transportation, + Ontario_ADF_Pesticide, + Ontario_ADF_Pharmaceutical, + Ontario_ADF_Transportation, + Ontario_Pesticide, + Ontario_Pharmaceutical, + Ontario_Transportation, + pd, +): + 
Canada_Pharmaceutical_Summary = pd.DataFrame( + { + 'Statistic':[ + 'Mean', + 'Median', + 'Skewness', + 'ADF pvalue' + ], + 'Value':[ + Canada_Pharmaceutical['VALUE'].mean(), + Canada_Pharmaceutical['VALUE'].median(), + Canada_Pharmaceutical['VALUE'].skew(), + Canada_ADF_Pharmaceutical[1] + ] + } + ) + + Canada_Pesticide_Summary = pd.DataFrame( + { + 'Statistic':[ + 'Mean', + 'Median', + 'Skewness', + 'ADF pvalue' + ], + 'Value':[ + Canada_Pesticide['VALUE'].mean(), + Canada_Pesticide['VALUE'].median(), + Canada_Pesticide['VALUE'].skew(), + Canada_ADF_Pesticide[1] + ] + } + ) + + + Canada_Transportation_Summary = pd.DataFrame( + { + 'Statistic':[ + 'Mean', + 'Median', + 'Skewness', + 'ADF pvalue' + ], + 'Value':[ + Canada_Transportation['VALUE'].mean(), + Canada_Transportation['VALUE'].median(), + Canada_Transportation['VALUE'].skew(), + Canada_ADF_Transportation[1] + ] + } + ) + + Ontario_Pharmaceutical_Summary = pd.DataFrame( + { + 'Statistic':[ + 'Mean', + 'Median', + 'Skewness', + 'ADF pvalue' + ], + 'Value':[ + Ontario_Pharmaceutical['VALUE'].mean(), + Ontario_Pharmaceutical['VALUE'].median(), + Ontario_Pharmaceutical['VALUE'].skew(), + Ontario_ADF_Pharmaceutical[1] + ] + } + ) + + + Ontario_Pharmaceutical_Summary = pd.DataFrame( + { + 'Statistic':[ + 'Mean', + 'Median', + 'Skewness', + 'ADF pvalue' + ], + 'Value':[ + Ontario_Pharmaceutical['VALUE'].mean(), + Ontario_Pharmaceutical['VALUE'].median(), + Ontario_Pharmaceutical['VALUE'].skew(), + Ontario_ADF_Pharmaceutical[1] + ] + } + ) + + Ontario_Pesticide_Summary = pd.DataFrame( + { + 'Statistic':[ + 'Mean', + 'Median', + 'Skewness', + 'ADF pvalue' + ], + 'Value':[ + Ontario_Pesticide['VALUE'].mean(), + Ontario_Pesticide['VALUE'].median(), + Ontario_Pesticide['VALUE'].skew(), + Ontario_ADF_Pesticide[1] + ] + } + ) + + Ontario_Transportation_Summary = pd.DataFrame( + { + 'Statistic':[ + 'Mean', + 'Median', + 'Skewness', + 'ADF pvalue' + ], + 'Value':[ + Ontario_Transportation['VALUE'].mean(), + Ontario_Transportation['VALUE'].median(), + Ontario_Transportation['VALUE'].skew(), + Ontario_ADF_Transportation[1] + ] + } + ) + return ( + Canada_Pesticide_Summary, + Canada_Pharmaceutical_Summary, + Canada_Transportation_Summary, + Ontario_Pesticide_Summary, + Ontario_Pharmaceutical_Summary, + Ontario_Transportation_Summary, + ) + + +@app.cell +def __( + Canada_Pesticide, + Canada_Pharmaceutical, + Canada_Transportation, + Ontario_Pesticide, + Ontario_Pharmaceutical, + Ontario_Transportation, + alt, + mo, +): + Pharmaceutical_Time_Series = mo.ui.altair_chart(alt.Chart(Canada_Pharmaceutical).mark_point().encode( + x='REF_DATE', + y='VALUE' + )) + + Pesticide_Time_Series = mo.ui.altair_chart(alt.Chart(Canada_Pesticide).mark_point().encode( + x='REF_DATE', + y='VALUE' + )) + + Transportation_Time_Series = mo.ui.altair_chart(alt.Chart(Canada_Transportation).mark_point().encode( + x='REF_DATE', + y='VALUE' + )) + + + Ontario_Pharmaceutical_Time_Series = mo.ui.altair_chart( + alt.Chart(Ontario_Pharmaceutical).mark_point().encode( + x='REF_DATE', + y='VALUE' + )) + + Ontario_Pesticide_Time_Series = mo.ui.altair_chart( + alt.Chart(Ontario_Pesticide).mark_point().encode( + x='REF_DATE', + y='VALUE' + ) + ) + + Ontario_Transportation_Time_Series = mo.ui.altair_chart( + alt.Chart(Ontario_Transportation).mark_point().encode( + x='REF_DATE', + y='VALUE' + ) + ) + return ( + Ontario_Pesticide_Time_Series, + Ontario_Pharmaceutical_Time_Series, + Ontario_Transportation_Time_Series, + Pesticide_Time_Series, + Pharmaceutical_Time_Series, + 
Transportation_Time_Series, + ) + + +@app.cell +def __( + Canada_Pesticide_Summary, + Canada_Pharmaceutical_Summary, + Canada_Transportation_Summary, + Ontario_Pesticide_Summary, + Ontario_Pesticide_Time_Series, + Ontario_Pharmaceutical_Summary, + Ontario_Pharmaceutical_Time_Series, + Ontario_Transportation_Summary, + Ontario_Transportation_Time_Series, + Pesticide_Time_Series, + Pharmaceutical_Time_Series, + Transportation_Time_Series, + mo, +): + Canada_Pharmaceutical_Stack = mo.vstack( + [ + mo.md("
Canada Pharmaceuticals
").style({'background-color':'crimson','float':'left'}), + Pharmaceutical_Time_Series, + mo.ui.table(Pharmaceutical_Time_Series.value), + mo.ui.table(Canada_Pharmaceutical_Summary) + ]) + + Canada_Pesticide_Stack = mo.vstack( + [ + mo.md("
Canada Pesticide Manufacturing
").style({'background-color':'crimson','float':'left'}), + Pesticide_Time_Series, + mo.ui.table(Pesticide_Time_Series.value), + mo.ui.table(Canada_Pesticide_Summary) + ]) + + Canada_Transportation_Stack = mo.vstack( + [ + mo.md("
Canada Transportation Engineering
").style({'background-color':'crimson','float':'left'}), + Transportation_Time_Series, + mo.ui.table(Transportation_Time_Series.value), + mo.ui.table(Canada_Transportation_Summary) + ]) + + + Ontario_Pharmaceutical_Stack = mo.vstack( + [ + mo.md("
Ontario Pharmaceuticals
").style({'background-color':'navy','float':'left'}), + Ontario_Pharmaceutical_Time_Series, + mo.ui.table(Ontario_Pharmaceutical_Time_Series.value), + mo.ui.table(Ontario_Pharmaceutical_Summary) + ] + ) + + Ontario_Pesticide_Stack = mo.vstack( + [ + mo.md("
Ontario Pesticide Manufacturing
").style({'background-color':'navy','float':'left'}), + Ontario_Pesticide_Time_Series, + mo.ui.table(Ontario_Pesticide_Time_Series.value), + mo.ui.table(Ontario_Pesticide_Summary) + ] + ) + + Ontario_Transportation_Stack = mo.vstack( + [ + mo.md("
Ontario Transportation Engineering
").style({'background-color':'navy','float':'left'}), + Ontario_Transportation_Time_Series, + mo.ui.table(Ontario_Transportation_Time_Series.value), + mo.ui.table(Ontario_Transportation_Summary) + ] + ) + return ( + Canada_Pesticide_Stack, + Canada_Pharmaceutical_Stack, + Canada_Transportation_Stack, + Ontario_Pesticide_Stack, + Ontario_Pharmaceutical_Stack, + Ontario_Transportation_Stack, + ) + + +@app.cell +def __(mo): + mo.md("""""") + return + + +if __name__ == "__main__": + app.run() diff --git a/010-Mustjaab/Periodic_App.py b/010-Mustjaab/Periodic_App.py new file mode 100644 index 0000000..57ca2af --- /dev/null +++ b/010-Mustjaab/Periodic_App.py @@ -0,0 +1,73 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "marimo", +# "pandas==2.2.3", +# "periodictable==1.7.1", +# ] +# /// + +import marimo + +__generated_with = "0.9.10" +app = marimo.App() + + +@app.cell +def __(mo): + mo.md("""
Periodic Table App
""") + return + + +@app.cell +def __(): + import marimo as mo + # pip install periodictable first before importing library + import periodictable + import pandas as pd + return mo, pd, periodictable + + +@app.cell +def __(mo): + Form = mo.ui.text(label="Atomic Number:").form() + Form + return (Form,) + + +@app.cell +def __(Form): + Element = Form.value + E = int(Element) + return E, Element + + +@app.cell +def __(E, periodictable): + element = periodictable.elements[E] + + Property_Table = { + 'Property': [ + 'Name', + 'Symbol', + 'Mass' + ], + + 'Value': [ + element.name, + element.symbol, + element.mass + ] + } + return Property_Table, element + + +@app.cell +def __(Property_Table, mo, pd): + Dynamic_Table = pd.DataFrame(Property_Table) + mo.ui.table(Dynamic_Table) + return (Dynamic_Table,) + + +if __name__ == "__main__": + app.run() diff --git a/010-Mustjaab/Portfolio.py b/010-Mustjaab/Portfolio.py new file mode 100644 index 0000000..1815e4b --- /dev/null +++ b/010-Mustjaab/Portfolio.py @@ -0,0 +1,119 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "marimo", +# ] +# /// +import marimo + +__generated_with = "0.9.7-dev1" +app = marimo.App() + + +@app.cell +def __(mo): + mo.md( + """ +
Muhammad Mustjaab
Portfolio
+ """ + ).center() + return + + +@app.cell +def __(): + import marimo as mo + return (mo,) + + +@app.cell +def __(mo): + mo.md("""
Projects
""") + return + + +@app.cell +def __(mo): + mo.hstack([ + mo.md( + """ +
Post Approval Study Recommender
+ + Description: + + """ + ).style({"border-width":'2px','border-color':'crimson','overflow':'auto'}), + mo.md( + """ +
Differential Privacy and GC Content
+ Description: + + """).style({'border-width':'2px','border-color':'gold','overflow':'auto'}), + mo.md( + """ +
Warning Letter Classifier
+ Description: + + """).style({'border-width':'2px','border-color':'violet','overflow':'auto'}) + ] + ) + return + + +@app.cell +def __(mo): + mo.md( + """ +
GitHub:
+ https://github.com/Mustjaab +
Education
+ """ + ) + return + + +@app.cell +def __(mo): + mo.carousel( + [ + mo.md( + """ +
BSc|General Sciences|University of Waterloo
+ + """ + ), + mo.md( + """ +
Graduate Certificate|Regulatory Affairs|Humber College
+ + """) + ] + ).style({"border-width":'4px','border-color':'seagreen'}) + return + + +if __name__ == "__main__": + app.run() diff --git a/010-Mustjaab/Post Approval Study Recommender.py b/010-Mustjaab/Post Approval Study Recommender.py new file mode 100644 index 0000000..eb54d6a --- /dev/null +++ b/010-Mustjaab/Post Approval Study Recommender.py @@ -0,0 +1,120 @@ +# /// script +# requires-python = ">=3.12" +# dependencies = [ +# "pandas==2.2.3", +# "scikit-learn==1.5.2", +# "marimo", +# "plotly==5.24.1", +# "pyodide-py==0.26.2", +# ] +# /// + +import marimo + +__generated_with = "0.9.10" +app = marimo.App() + + +@app.cell +def __(mo): + mo.md("""
Post Approval Study Recommender
""") + return + + +@app.cell +def __(): + import marimo as mo + import pandas as pd + import micropip + import pyodide + from sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.metrics.pairwise import linear_kernel + return TfidfVectorizer, linear_kernel, micropip, mo, pd, pyodide + + +@app.cell +async def __(micropip): + await micropip.install("plotly") + import plotly.express as px + return (px,) + + +@app.cell +def __(pd): + PAS_FDA = "https://raw.githubusercontent.com/Mustjaab/PAS-Recommender/main/Post_Approval_Studies.csv" # csv file also exists under assets folder + PAS_FDA = pd.read_csv(PAS_FDA, header=0) + return (PAS_FDA,) + + +@app.cell +def __(mo): + Speciality_Selection = mo.ui.dropdown( + options=['Cardiovascular', 'Clinical Chemistry', 'Neurology', 'Ophthalmic', + 'General & Plastic Surgery', 'Orthopedic', 'General Hospital', + 'Toxicology', 'Obstetrics/Gynecology', 'Radiology', + 'Ear Nose & Throat', 'Pathology', 'Anesthesiology', + 'Gastroenterology/Urology'], + value='Cardiovascular', + label='Medical Specialty' + ) + + Top_Results = mo.ui.slider(2,11,1,label='Top Results:') + return Speciality_Selection, Top_Results + + +@app.cell +def __(Speciality_Selection, Top_Results): + top_results = Top_Results.value + Medical_Specailty = Speciality_Selection.value + return Medical_Specailty, top_results + + +@app.cell +def __(Speciality_Selection, Top_Results, mo): + mo.hstack([Speciality_Selection,Top_Results]).center() + return + + +@app.cell +def __(PAS_FDA, TfidfVectorizer): + tfidf = TfidfVectorizer(stop_words='english') + tfidf_matrix = tfidf.fit_transform(PAS_FDA['Study_Name']) + return tfidf, tfidf_matrix + + +@app.cell +def __(linear_kernel, tfidf_matrix): + cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix) + return (cosine_sim,) + + +@app.cell +def __(PAS_FDA, cosine_sim): + def get_recommendations(Medical_Specialty,top_results,cosine_sim=cosine_sim): + idx = PAS_FDA.index[PAS_FDA['Medical_Specialty'] == + Medical_Specialty].tolist()[0] + sim_scores = list(enumerate(cosine_sim[idx])) + sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) + sim_scores = sim_scores[1:top_results] + Study_indices = [i[0] for i in sim_scores] + return PAS_FDA['Study_Name'].iloc[Study_indices] + return (get_recommendations,) + + +@app.cell +def __(Medical_Specailty, get_recommendations, mo, pd, top_results): + Study_Recommendation = get_recommendations(Medical_Specailty,top_results) + Study_Recommendation = pd.DataFrame(Study_Recommendation) + Study_Recommendation = Study_Recommendation.rename(columns={'Study_Name':'Recommended Study'}) + mo.ui.table(Study_Recommendation) + return (Study_Recommendation,) + + +@app.cell +def __(mo): + mo.md("""