# app.py

# -*- coding: utf-8 -*-
"""Final_merge_eval.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1EoQRzgehY8MDS7WooKh0-ECwEkIdd-V5
"""
import streamlit as st
# Install PySpark and Spark NLP

import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType
from transformers import pipeline
import Levenshtein


@st.cache(allow_output_mutation=True)
def load_spark_nlp_model():
    """Start Spark NLP and build every heavyweight resource used by the app.

    The function is wrapped in ``st.cache(allow_output_mutation=True)``.

    Returns:
        A 4-tuple ``(spark, nlp_pipeline, token_classifier, universe_of_names)``:
        the session returned by ``sparknlp.start()``, the (unfitted) Spark ML
        ``Pipeline`` ending in a ``NerConverter``, the Hugging Face
        token-classification pipeline loaded from the local checkpoint, and
        the list produced by ``fuzzyMatchingPreprocessing()``.
    """
    # The session is started first, before any annotator is constructed.
    session = sparknlp.start()

    to_document = (
        DocumentAssembler()
        .setInputCol('text')
        .setOutputCol('document')
    )
    to_sentences = (
        SentenceDetector()
        .setInputCols(['document'])
        .setOutputCol('sentence')
    )
    to_tokens = (
        Tokenizer()
        .setInputCols(['sentence'])
        .setOutputCol('token')
    )
    bert_ner = (
        BertForTokenClassification
        .pretrained("bert_token_classifier_hi_en_ner", "hi")
        .setInputCols(["sentence", 'token'])
        .setOutputCol("ner")
    )
    to_chunks = (
        NerConverter()
        .setInputCols(["sentence", "token", "ner"])
        .setOutputCol("ner_chunk")
    )

    stages = [to_document, to_sentences, to_tokens, bert_ner, to_chunks]
    spark_ner_pipeline = Pipeline(stages=stages)

    # Second NER model: a Hugging Face checkpoint stored on the mounted drive.
    hf_classifier = pipeline(
        "token-classification",
        model="/content/drive/MyDrive/NER/checkpoint-3135",
        aggregation_strategy="simple",
    )

    place_names = fuzzyMatchingPreprocessing()

    return session, spark_ner_pipeline, hf_classifier, place_names

# text_list =["""वॉरेन एडवर्ड बफेट (Warren Buffet) (अगस्त 30 (August 30), 1930 को ओमाहा (Omaha), नेब्रास्का (Nebraska) में पैदा हुए) एक अमेरिकी निवेशक (investor), व्यवसायी और परोपकारी (philanthropist) व्यक्तित्व हैं।"""]

# df = spark.createDataFrame(text_list, StringType()).toDF("text")
# result = nlp_pipeline.fit(df).transform(df)

# text_list =["Are Jaipur and Ajmer the same state?"]
# text_list = ["AMong jaipur, Ajmer and Baroda whih of these have a higher population than Surat"]
# text_list = ["Which has higher average temperature in June, AHmedabad or Gandhinagar"]
# text_list = ["Where is zebara? is it in amdavad?"]
# text_list = ["The new england journal of medicine is the best medical journal in the world"]
# text_list = ["Can I visit new york, mars, sun and delhi on the same day?"]
# text_list = ["Can I visit new york mars sun and delhi on the same day?"]
# text_list = ["where can i find lakes near Ahmedabad"]
# text_list = ["where can I find lakes near Amdavad"]
# text_list = ["is there water in delli?"]
# text_list = ["Is it a good time to visit Prince edward island?"]
# text_list = ["which has higher average temperature in june, ahmedabad or gandhinagar"]
# text_list = ["Which Has Higher Average Temperature In June, Ahmdvad Or Gandhingr"]
# text_list = ["temperature at amdavad is high"]
# text_list = ["The zoo is located in Abc"]
# text_list = ["The zoo is located in Abc"]
# text_list = ["अहमदाबाद का तापमान मध्य प्रदेश से भी ज्यादा है"]
# text_list = ["अमदाबद का तापमान मध्यदेश से भी ज्यादा है"]
# text_list = ["Name of my daughter is India"]
# text_list = ["I hate Gujarat but i love faafada"]
# text_list = ["Anjeer is my favourite"]
# text_list = ["Farah Went To Kushk"]
# text_list = ["Temperature At Abc Is Higher Than Temperature at xyz"]
# text_list = ["Venus has a travel planned to mars"]
# text_list = ["The dal lake is in sri nagar"]
# text_list = ["What happened in Tamil Nadu"]

import codecs,string
def is_hindi(character):
    """Return True if *character* contains at least one Devanagari code point.

    Args:
        character: The string to inspect (may be a single character, a word,
            or empty).

    Returns:
        ``True`` when any code point lies in the Devanagari block
        (U+0900..U+097F), ``False`` otherwise, including for an empty string.
    """
    # Scan every code point instead of looking only at the maximum one:
    # ``max()`` raises on an empty string, and a single code point above
    # U+097F (e.g. a zero-width joiner) would hide the Devanagari letters.
    return any(u'\u0900' <= ch <= u'\u097f' for ch in character)

def findword(text, s, e):
    """Widen the span ``[s, e)`` of *text* to whole-word boundaries.

    A word boundary is the start or end of *text*, or one of the characters
    space, comma, full stop, or question mark.

    Args:
        text: The string being searched.
        s: Start index of the initial span.
        e: End index (exclusive) of the initial span.

    Returns:
        The substring of *text* covering the widened span.
    """
    boundaries = (' ', ',', '.', '?')

    # Walk left until the previous character is a boundary (or we hit index 0).
    start = s
    while start > 0 and text[start - 1] not in boundaries:
        start -= 1

    # Walk right until the current character is a boundary (or we hit the end).
    stop = e
    while stop < len(text) and text[stop] not in boundaries:
        stop += 1

    return text[start:stop]

def combinedOutput(extracted_list, output, text_list):
    """Merge the two NER outputs into one ``{word: verdict}`` dictionary.

    Args:
        extracted_list: Rows indexable as ``row[0]`` (chunk text) and
            ``row[1]`` (entity label).
        output: Dicts carrying ``'start'``, ``'end'`` and ``'score'`` keys;
            the offsets index into the lower-cased ``text_list[0]``.
        text_list: List whose first element is the analysed text.

    Returns:
        A dict mapping lower-cased words/chunks to a verdict string:
        "Certain", "Looks Like", "Most Likely", or an explanatory sentence
        when the first model labelled the word as something other than PLACE.
    """
    # Label per lower-cased chunk from the first model.
    labels_by_chunk = {}
    for row in extracted_list:
        labels_by_chunk[row[0].lower()] = row[1]

    # Score per whole word from the second model.
    scores_by_word = {}
    for hit in output:
        whole_word = findword(text_list[0].lower(), hit['start'], hit['end'])
        scores_by_word[whole_word] = hit['score']

    verdicts = {}
    for place in scores_by_word:
        if is_hindi(place):
            # Words containing Devanagari are not judged in this pass.
            continue
        if place not in labels_by_chunk:
            verdicts[place] = "Looks Like"
            continue
        label = labels_by_chunk[place]
        if label == 'PLACE':
            verdicts[place] = "Certain"
        else:
            verdicts[place] = f'{place} occurs in the context of {label} but its name may resemble the name of a place'

    # Chunks only the first model called PLACE; a multi-word chunk supersedes
    # any verdict already recorded for its individual words.
    for place, label in labels_by_chunk.items():
        if label != 'PLACE' or place in verdicts:
            continue
        verdicts[place] = "Most Likely"
        pieces = place.split(' ')
        if len(pieces) > 1:
            for piece in pieces:
                verdicts.pop(piece, None)

    return verdicts

def getNER(text):
    """Run both NER models on *text* and return the merged place verdicts.

    Relies on the module-level names ``spark``, ``nlp_pipeline`` and
    ``token_classifier`` that the script entry point binds from
    ``load_spark_nlp_model()``.

    Args:
        text: The sentence entered by the user.

    Returns:
        The dictionary produced by ``combinedOutput``.
    """
    # str.capitalize() upper-cases the first character and lower-cases the rest.
    text = text.capitalize()
    text_list = [text]
    df = spark.createDataFrame(text_list, StringType()).toDF("text")
    result = nlp_pipeline.fit(df).transform(df)

    # Flatten the ner_chunk annotations into (chunk, ner_label) rows.
    extracted_result = result.select(F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)).alias("cols")) \
        .select(F.expr("cols['0']").alias("chunk"),
                F.expr("cols['1']['entity']").alias("ner_label"))

    extracted_list = extracted_result.collect()

    # Reuse the classifier already built by load_spark_nlp_model() instead of
    # reloading the checkpoint from disk on every request.
    output = token_classifier(text_list[0].lower())
    return combinedOutput(extracted_list, output, text_list)

# The function to find similarity between correctAnswer and userAnswer, based on levenshtein distance
def validate_answer_levenshtein(correct_answer, user_answer, threshold=80):
    """Return the case-insensitive Levenshtein similarity as a percentage.

    Args:
        correct_answer: Reference string.
        user_answer: String to compare against the reference.
        threshold: Unused; kept so existing callers remain valid.

    Returns:
        ``100 * (1 - distance / longer_length)`` rounded to two decimals.
        Two empty strings are treated as identical and yield ``100.0``.
    """
    user_norm = user_answer.lower()
    correct_norm = correct_answer.lower()

    # Measure the strings actually compared (lower-casing can change length).
    longest = max(len(user_norm), len(correct_norm))
    if longest == 0:
        # Both inputs are empty: avoid dividing by zero.
        return 100.0

    distance = Levenshtein.distance(user_norm, correct_norm)
    similarity = 1 - (distance / longest)
    # normalising from [0,1] to [0,100]
    return round(100 * similarity, 2)

def fuzzyMatching(fuzzy_li, universe_of_names, top_n=3):
    """Find the closest known place names for each word in *fuzzy_li*.

    Args:
        fuzzy_li: Iterable of words to look up.
        universe_of_names: Sequence of ``(name, type)`` tuples to search.
        top_n: How many matches to keep per word (defaults to 3).

    Returns:
        One list per input word, each holding up to *top_n* ``(name, type)``
        tuples ordered from most to least similar. When two candidates score
        the same, the one appearing later in *universe_of_names* ranks first.
    """
    import heapq

    global_nearest_match_li = []

    for word in fuzzy_li:
        # (score, position) is the ranking key; position breaks ties in favour
        # of later entries and guarantees the key is unique.
        scored = (
            (validate_answer_levenshtein(word, checking_word), position, checking_word, type_)
            for position, (checking_word, type_) in enumerate(universe_of_names)
        )
        best = heapq.nlargest(top_n, scored, key=lambda item: (item[0], item[1]))
        global_nearest_match_li.append(
            [(checking_word, type_) for _, _, checking_word, type_ in best]
        )

    return global_nearest_match_li

def fuzzyMatchingPreprocessing(data_dir="/content/drive/MyDrive/NER"):
    """Load the known place names used for fuzzy matching.

    Reads ``countries.csv``, ``states.csv`` and ``cities.csv`` from
    *data_dir*; each file must have a ``name`` column.

    Args:
        data_dir: Directory containing the three CSV files.

    Returns:
        A list of ``(lower-cased name, type)`` tuples, countries first, then
        states, then cities, where type is "country", "state" or "city".
        Rows with a missing name are skipped.
    """
    sources = (
        ("countries.csv", "country"),
        ("states.csv", "state"),
        ("cities.csv", "city"),
    )

    universe_of_names = []
    for filename, kind in sources:
        # Iterate the column directly rather than row-by-row with iloc, and
        # drop missing names, which have no .lower() and used to crash here.
        names = pd.read_csv(f"{data_dir}/{filename}")["name"].dropna()
        universe_of_names.extend((str(name).lower(), kind) for name in names)

    return universe_of_names


def fuzzyMatchingComplete (fuzzy_li, universe_of_names):
    """Return the fuzzy matches for *fuzzy_li*; delegates to ``fuzzyMatching``.

    Args:
        fuzzy_li: Iterable of words to look up.
        universe_of_names: Known ``(name, type)`` tuples to search.

    Returns:
        Whatever ``fuzzyMatching`` returns for the same arguments.
    """
    return fuzzyMatching(fuzzy_li, universe_of_names)

def add_data(word, entity, universe_of_names):
    """Add ``(word, entity)`` to *universe_of_names* unless already present.

    Both values are lower-cased before the lookup. The list is modified in
    place and a status line is written to the page with ``st.text``.

    Args:
        word: Place name to add.
        entity: Type of place entered by the user.
        universe_of_names: List of ``(name, type)`` tuples to update.

    Returns:
        ``(universe_of_names, changed)`` where *changed* is True only when a
        new entry was appended.
    """
    entry = (word.lower(), entity.lower())

    if entry in universe_of_names:
        # Space after the entry added so the message reads correctly and
        # matches the wording used by delete_data.
        st.text(f"{entry} is already present in the database")
        changed = False
    else:
        universe_of_names.append(entry)
        st.text("Added to the database.")
        changed = True
    return universe_of_names, changed

def delete_data(word,entity,universe_of_names):
    """Remove ``(word, entity)`` from *universe_of_names* if it is present.

    Both values are lower-cased before the lookup. The list is modified in
    place and a status line is written to the page with ``st.text``.

    Args:
        word: Place name to remove.
        entity: Type of place entered by the user.
        universe_of_names: List of ``(name, type)`` tuples to update.

    Returns:
        ``(universe_of_names, changed)`` where *changed* is True only when an
        entry was removed.
    """
    target = (word.lower(), entity.lower())

    if target in universe_of_names:
        universe_of_names.remove(target)
        st.text("Deleted from the database.")
        return universe_of_names, True

    st.text(f"{target} is not present in the database")
    return universe_of_names, False

def main(universe_of_names):
    """Render the Streamlit page and handle one user interaction.

    Streamlit re-executes the script on every widget interaction, so this
    function draws the page exactly once per run. It must not loop: creating
    the same widgets twice in a single run raises a duplicate-widget error.

    Args:
        universe_of_names: List of ``(name, type)`` tuples used for fuzzy
            matching; request types 2 and 3 modify it in place.
    """
    st.title("Your Streamlit App")
    if 'history' not in st.session_state:
        st.session_state.history = []

    if st.button("ShowHistory"):
        st.text(st.session_state.history)

    if st.button("ClearHistory"):
        st.session_state.history = []

    request_type = st.text_input("Enter a request type: (Entering sentence: 1, add to database: 2, delete to database: 3), (exit: -1)", key="request_type")

    if request_type == '-1':
        st.text("Exiting...")
        return

    if request_type == '1':
        user_input = st.text_input("Enter a sentence (type '-1' to exit): ", key="user_input_1")
        if st.button("Submit"):
            if user_input == '-1':
                st.text("Exiting...")
                return

            # Call the processing function and display the result
            result = getNER(user_input)
            matched_cities = fuzzyMatchingComplete(result.keys(), universe_of_names)

            st.text(f"Result: {result}")
            st.session_state.history.append({user_input: result})
            for entity, matched_places in zip(result.keys(), matched_cities):
                st.text(f"Matching places for {entity} are: {matched_places}")

    elif request_type == '2':
        user_input = st.text_input("Enter the name of the place (type '-1' to exit): ", key="user_input_2")
        user_category = st.text_input("Enter the type of place(city, state, country)  (type '-1' to exit): ", key="user_category_2")
        if st.button("Submit"):
            if user_input == '-1' or user_category == '-1':
                st.text("Exiting...")
                return

            universe_of_names, changed = add_data(user_input, user_category, universe_of_names)
            st.session_state.history.append({("add",user_input, user_category) : "added" if changed else "already present"})

    elif request_type == '3':
        user_input = st.text_input("Enter the name of the place  (type '-1' to exit): ", key="user_input_3")
        user_category = st.text_input("Enter the type of place(city, state, country)  (type '-1' to exit): ", key="user_category_3")
        if st.button("Submit"):
            if user_input == '-1' or user_category == '-1':
                st.text("Exiting...")
                return

            universe_of_names, changed = delete_data(user_input, user_category, universe_of_names)
            st.session_state.history.append({("delete",user_input, user_category) : "deleted" if changed else "not present"})

    else:
        st.text("Please enter a valid input")

if __name__ == "__main__":
    # These names are bound at module level on purpose: getNER reads `spark`
    # and `nlp_pipeline` (and the classifier) as globals.
    (
        spark,
        nlp_pipeline,
        token_classifier,
        universe_of_names,
    ) = load_spark_nlp_model()
    main(universe_of_names)