From ade9ecfaea77396a97f48ceae80bf61d16b181ef Mon Sep 17 00:00:00 2001 From: Dianeod <40861871+Dianeod@users.noreply.github.com> Date: Wed, 10 Mar 2021 14:30:56 +0000 Subject: [PATCH] Refactor of NLP library (#26) * update merge * update function name * fixed findDates function to account for the word of or in between dates, months and years * removed sys argv statement * fix appveyor and travis files * fix copy error * removed getembedpy * cluster refactor * fixed indentations * fixed @ * Update of date_time.q to new format * update email to new format and commenting style * Fix commenting error * review of parser * fix email error * fixed bug * updated comments * update commenting * updated comments * review of parser code * Updates to move utils to .i, removal of duplicate email function definitions * moved callable functions to the end * moved callable functions to the end * Minor consistency update * moved python funcs * review of regex function * Updates to parser functionality * Minor updates to regex string matching refactor * review of sent * fix indentation * fixed length of line to be <80 in regex * review of utils functions * fixed indentation * initial review of nlp_code * moved functions to nlp_code.q * Minor changes to sentiment analysis functionality * renamed files * minor description updates for nlp utilities * reintroduction of embedPy load * updated removeMain and added filelength.t * minor updates to coincide with docs * update to coincide with docs * changed input names * update comments * nlp code review qdocs and headers * updates following comments * adding dictionarys kind and type * two small changes Co-authored-by: Conor McCarthy Co-authored-by: Conor McCarthy Co-authored-by: Conor McCarthy Co-authored-by: Conor McCarthy Co-authored-by: andrewmorrison1 --- code/cluster.q | 372 ++++++++++++++++++----- code/dateTime.q | 184 +++++++++++ code/date_time.q | 65 ---- code/email.q | 259 +++++++++++++--- code/{extract_rtf.p => extractRtf.p} | 2 
+- code/nlpCode.q | 437 +++++++++++++++++++++++++++ code/nlp_code.q | 163 ---------- code/parser.p | 40 +++ code/parser.q | 242 ++++++++++----- code/regex.q | 249 +++++++++++++-- code/sent.q | 277 ++++++++++++----- code/utils.q | 253 ++++++++++++---- init.q | 9 +- nlp.q | 5 + tests/filelength.t | 36 +++ tests/nlptest.t | 27 +- tests/senttest.t | 14 +- 17 files changed, 1995 insertions(+), 639 deletions(-) create mode 100644 code/dateTime.q delete mode 100644 code/date_time.q rename code/{extract_rtf.p => extractRtf.p} (99%) create mode 100644 code/nlpCode.q delete mode 100644 code/nlp_code.q create mode 100644 code/parser.p create mode 100644 tests/filelength.t diff --git a/code/cluster.q b/code/cluster.q index 3f981fb..36f0bff 100644 --- a/code/cluster.q +++ b/code/cluster.q @@ -1,72 +1,308 @@ +// code/cluster.q - Nlp clustering utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Clustering utilites for textual data + \d .nlp -// Run on either docs or keyword dicts -cluster.i.asKeywords:{i.fillEmptyDocs $[-9=type x[0]`keywords;x;x`keywords]} - -// Get cohesiveness of cluster as measured by mean sum of squares error -cluster.MSE:{[docs] - $[0=n:count docs;0n;1=n;1.;0=sum count each docs;0n; - avg d*d:0^compareDocToCentroid[i.takeTop[50]i.fastSum docs]each i.fillEmptyDocs docs]} - -// Bisecting k-means algo (repeatedly splits largest cluster in 2) -cluster.bisectingKMeans:{[docs;k;n] - if[0=n:count docs:cluster.i.asKeywords docs;:()]; - (k-1){[n;docs;clusters] - cluster:clusters idx:i.minIndex cluster.MSE each docs clusters; - (clusters _ idx),cluster@/:cluster.kmeans[docs cluster;2;n] - }[n;docs]/enlist til n} - -// k-means clustering for docs -cluster.kmeans:{[docs;k;n] - n{[docs;clusters] - centroids:(i.takeTop[3]i.fastSum@)each docs clusters; - value group i.maxIndex each centroids compareDocs\:/:docs - }[docs]/(k;0N)#neg[nd]?nd:count docs:cluster.i.asKeywords docs} - -// Match each doc to nearest centroid -cluster.groupByCentroids:{[centroids;docs] - value 
group{[centroids;doc]$[00f;similarities?m;0n] + } + +// @private +// @kind function +// @category nlpClusteringUtility +// @desc Merge any clusters with significant overlap into a single +// cluster +// @param clusters {any[][]} Cluster indices +// @returns {any[][]} Appropriate clusters merged together cluster.i.mergeOverlappingClusters:{[clusters] - similarClusters:{[clusters;counts;idx] - superset:counts=sum each clusters[idx]in/:clusters; - similar:.5<=avg each clusters[idx]in/:clusters; - notSmaller:(count clusters idx)>=count each clusters; - where superset or(similar & notSmaller) - }[clusters;count each clusters]each til count clusters; - merge:1=count each clusters; + where superset or(similar & notSmaller) + } + +// @private +// @kind function +// @category nlpClusteringUtility +// @desc Normalize the columns of a matrix so they sum to 1 +// @param matrix {float[][]} Numeric matrix of values +// @returns {float[][]} The normalized columns +cluster.i.columnNormalize:{[matrix] + 0f^matrix%\:sum matrix + } + +// @private +// @kind function +// @category nlpClusteringUtility +// @desc Graph clustering that works on a similarity matrix +// @param matrix {boolean[][]} NxN adjacency matrix +// @returns {long[][]} Lists of indices in the corpus where each row +// is a cluster +cluster.i.similarityMatrix:{[matrix] + matrix:"f"$matrix; + // Make the matrix stochastic and run MCL until stable + normMatrix:cluster.i.columnNormalize matrix; + attractors:cluster.i.MCL/[normMatrix]; + // Use output of MCL to get the clusters + clusters:where each attractors>0; + // Remove empty clusters and duplicates + distinct clusters where 0<>count each clusters + } -// Extremely fast clustering algo for large datasets (produces small but cohesive clusters) -cluster.radix:{[docs;n] - reduced:{distinct 4#key desc x}each docs:cluster.i.asKeywords docs; - keywords:(where 5<=count each group raze reduced)except`; +// @private +// @kind function +// @category nlpClusteringUtility +// 
@desc SM Van Dongen's MCL clustering algorithm +// @param matrix {float[][]} NxN matrix +// @return {float[][]} MCL algorithm applied to matrix +cluster.i.MCL:{[matrix] + // Expand matrix by raising to the nth power (currently set to 2) + do[2-1;mat:{i.np[`:matmul;x;x]`}matrix]; + mat:cluster.i.columnNormalize mat*mat; + @[;;:;0f] ./:flip(mat;where each(mat>0)&(mat<.00001)) + } + +// @kind function +// @category nlpClustering +// @desc Uses the top ten keywords of each document in order to cluster +// similar documents together +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param k {long} The number of clusters to return +// @returns {long[][]} The documents' indices grouped into clusters +cluster.summarize:{[parsedTab;k] + if[0=count parsedTab;:()]; + docs:i.takeTop[10]each cluster.i.asKeywords parsedTab; + summary:i.fastSum[docs]%count docs; + centroids:(); + do[k; + // Find the document that summarizes the corpus best + // and move that document to the centroid list + centroids,:nearest:i.maxIndex docs[;i.maxIndex summary]; + summary-:docs nearest; + summary:(where summary<0)_ summary + ]; + cluster.groupByCentroids[docs centroids;docs] + } + +// @kind function +// @category nlpClustering +// @desc Use the top 50 keywords of each document to calculate the +// cohesiveness as measured by the mean sum of sqaures +// @param keywords {dictionary[]} A parsed document containing keywords and +// their associated significance scores +// @returns {float} The cohesion of the cluster +cluster.MSE:{[parsedTab] + n:count parsedTab; + if[(0=n)|0=sum count each parsedTab,(::);:0n]; + if[1=n;:1f]; + centroid:i.takeTop[50]i.fastSum parsedTab; + docs:i.fillEmptyDocs parsedTab; + // Don't include the current document in the centroid, or for small clusters + // it just reflects its similarity to itself + dists:0^compareDocToCentroid[centroid]each docs; + avg dists*dists + } + +// @kind function +// @category 
nlpClustering +// @desc The bisecting k-means algorithm which uses k-means to +// repeatedly split the most cohesive clusters into two clusters +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param k {long} The number of clusters to return +// @param iters {long} The number of times to iterate the refining step +// @returns {long[][]} The documents' indices, grouped into clusters +cluster.bisectingKMeans:{[parsedTab;k;iters] + docs:cluster.i.asKeywords parsedTab; + if[0=n:count docs;:()]; + (k-1)cluster.i.bisect[iters;docs]/enlist til n + } + +// @kind function +// @category nlpClustering +// @desc k-means clustering for documents +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param k {long} The number of clusters to return +// @param iters {long} The number of times to iterate the refining step +// @returns {long[][]} The documents' indices, grouped into clusters +cluster.kmeans:{[parsedTab;k;iters] + docs:cluster.i.asKeywords parsedTab; + numDocs:count docs; + iters cluster.i.kmeans[docs]/(k;0N)#neg[numDocs]?numDocs + } + +// @kind function +// @category nlpClustering +// @desc Given a list of centroids and a list of documents, match each +// document to its nearest centroid +// @param centroids {dictionary[]} Centroids as keyword dictionaries +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {long[][]} Lists of document indices where each list is a cluster +// N.B. These don't line up with the number of centroids passed in, +// and the number of lists returned may not equal the number of centroids. +// There can be documents which match no centroids (all of which will end up +// in the same group), and centroids with no matching documents. 
+cluster.groupByCentroids:{[centroids;parsedTab] + // If there are no centroids, everything is in one group + if[not count centroids;:enlist til count parsedTab]; + value group cluster.i.findNearestNeighbor[centroids]each parsedTab + } + +// @kind function +// @category nlpClustering +// @desc Uses the Radix clustering algorithm and bins are taken from +// the top 3 terms of each document +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param k {long} The number of clusters desired, though fewer may +// be returned. This must be fairly high to cover a substantial amount of the +// corpus, as clusters are small +// @returns {long[][]} The documents' indices, grouped into clusters +cluster.radix:{[parsedTab;k] + docs:cluster.i.asKeywords parsedTab; + // Bin on keywords, taking the 3 most significant keywords from each document + // and dropping those that occur less than 3 times + reduced:{distinct 4#key desc x}each docs; + // Remove any keywords that occur less than 5 times + keywords:where (count each group raze reduced) >= 5; + keywords:keywords except `; clusters:{[reduced;keyword]where keyword in/:reduced}[reduced]each keywords; + // Score clusters based on the harmonic mean of their cohesion and log(size) cohesion:i.normalize cluster.MSE each docs clusters; size:i.normalize log count each clusters; score:i.harmonicMean each flip(cohesion;size); - sublist[n]cluster.i.mergeOverlappingClusters/[clusters sublist[2*n]idesc score]} + // Take the n*2 highest scoring clusters, as merging will remove some + // but don't run it on everything, since merging is expensive. 
+ // This may lead to fewer clusters than expected if a lot of merging happens + clusters:clusters sublist[2*k]idesc score; + sublist[k]cluster.i.mergeOverlappingClusters/[clusters] + } -cluster.fastRadix:{[docs;n] - docs:cluster.i.asKeywords docs; - grouped:(group i.maxIndex each docs)_`; +// @kind function +// @category nlpClustering +// @desc Uses the Radix clustering algorithm and bins by the most +// significant term +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param k {long} The number of clusters desired, though fewer may +// be returned. This must be fairly high to cover a substantial amount of the +// corpus, as clusters are small +// @returns {long[][]} The documents' indices, grouped into clusters +cluster.fastRadix:{[parsedTab;k] + docs:cluster.i.asKeywords parsedTab; + // Group documents by their most significant term + grouped:group i.maxIndex each docs; + // Remove the entry for empty documents + grouped:grouped _ `; + // Remove all clusters containing only one element clusters:grouped where 1=mn; + clusters:cluster.i.similarityMatrix similarities>=minimum; + clustersOfOne:1=count each clusters; if[not sample;:clusters where not clustersOfOne]; // Any cluster of 1 documents isn't a cluster, so throw it out outliers:raze clusters where clustersOfOne; @@ -76,33 +312,5 @@ cluster.MCL:{[docs;mn;sample] centroids:avg each keywords clusters; // Move each non-outlier to the nearest centroid nonOutliers:(til count docs)except idx outliers; - nonOutliers cluster.groupByCentroids[centroids;docs nonOutliers]} - -// Graph clustering that works on a similarity matrix -cluster.i.columnNormalize:{[mat]0f^mat%\:sum mat} -cluster.i.similarityMatrix:{[mat] - matrix:"f"$mat; - // SM Van Dongen's MCL clustering algorithm - MCL:{[mat] - // Expand matrix by raising to the nth power (currently set to 2) - do[2-1;mat:{i.np[`:matmul;x;x]`}mat]; - mat:cluster.i.columnNormalize mat*mat; - @[;;:;0f] 
./:flip(mat;where each(mat>0)&(mat<.00001))}; - // Make the matrix stochastic and run MCL until stable - attractors:MCL/[cluster.i.columnNormalize mat]; - // Use output of MCL to get the clusters - clusters:where each attractors>0; - // Remove empty clusters and duplicates - distinct clusters where 0<>count each clusters} - -// Subtracts most representive elements from centroid & iterate until number of clusters reached -cluster.summarize:{[docs;n] - if[0=count docs;:()]; - docs:i.takeTop[10]each cluster.i.asKeywords docs; - summary:i.fastSum[docs]%count docs; - centroids:(); - do[n; - centroids,:nearest:i.maxIndex docs[;i.maxIndex summary]; - summary-:docs nearest; - summary:(where summary<0)_ summary]; - cluster.groupByCentroids[docs centroids;docs]} + nonOutliers cluster.groupByCentroids[centroids;docs nonOutliers] + } diff --git a/code/dateTime.q b/code/dateTime.q new file mode 100644 index 0000000..0b7f580 --- /dev/null +++ b/code/dateTime.q @@ -0,0 +1,184 @@ +// code/dateTime.q - Nlp time utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Utilities for handling dates and times + +\d .nlp + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Pads a string containing a single integer to two digits +// or extracts the last 2 digits from a string +// @param day {string} Contains a date +// @returns {string} Padded date to two digits +tm.i.parseDay:{[day] + -2#"0",day where day in .Q.n + } + +// @private +// @kind dictionary +// @category nlpTimeUtility +// @desc Dictionary mapping the months of the year +// @type dictionary +// to a symbol denoting integer representation +tm.i.months:`jan`feb`mar`apr`may`jun`jul`aug`sep`oct`nov`dec!`$string 1+til 12 + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Convert a long-form or short-form month string to +// a string denoting the month as an integer "feb"/"february" +// become "02" +// @param day {string} A month of the year in English +// @returns {string} A padded integer 
representing the month of the year +tm.i.parseMonth:{[month] + -2#"0",string month^tm.i.months month:lower`$3 sublist month + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Pad a string denoting a year to 4 digits +// if input > 35 this is deemed to be 1900s +// i.e. "20" -> "2020" / "44" -> "1944") +// @param year {string} Contains a year +// @returns {string} Padded year value +tm.i.parseYear:{[year] + -4#$[35<"I"$-2#year;"19";"20"],year + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Convert year string to the entire date +// encapsulating that year +// @param year {string} A year +// @returns {string} Date range from Jan 1 to Dec 31 of +// the specified year +tm.i.convY:{[year] + "D"$year,/:(".01.01";".12.31") + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Convert string containing yearMonth +// to the date range encapsulating that month +// i.e. "test 2020.02" -> 2020.02.01 2020.02.29 +// "2019.02 test" -> 2019.02.01 2019.02.28 +// @param text {string} Text containing yearMonth value +// @returns {string} Date range for the month of the +// provided yearMonth +tm.i.convYearMonth:{[text] + txt:regex.matchAll[;text]each regex.objects`year`month; + matches:ungroup([format:"ym"]txt); + updMatches:matches,'flip`txt`s`e!flip matches`txt; + matches:value select format,last txt by s from updMatches; + format:tm.i.formatYM/[matches`format]; + format:raze@[format;i where 1 "y","m" +// @params ym {string[]} The format for each date objecct +// @returns {string} Formats of YearMonths objects seperated +tm.i.formatYM:{[ym] + @[ym;where not counts;except[;raze ym where counts:1=count each ym]] + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Convert string containing yearMonthDay +// to the date range encapsulating that day +// i.e. 
"test 2020.01.01" -> 2020.01.01 2020.01.01 +// "2010.01.01 test" -> 2010.01.01 2010.01.01 +// @param text {string} Text containing yearMonthDay value +// @returns {string} Date range associated with the +// provided yearMonthDay +tm.i.convYearMonthDay:{[text] + txt:regex.matchAll[;text]each regex.objects`year`month`day; + matches:ungroup([format:"ymd"]txt); + updMatches:matches,'flip`txt`s`e!flip matches`txt; + matches:value select format,last txt by s from updMatches; + format:tm.i.formatYMD/[matches`format]; + format:tm.i.resolveFormat raze@[format;where 1 "y","m","d" +// @params ymd {string[]} The format for each date objecct +// @returns {string} Formats of YearMonthDays objects seperated +tm.i.formatYMD:{[ymd] + @[ymd;i unq;:;"ymd" unq:where 1=count each i:where each "ymd" in/:\:ymd] + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Fill in the blanks in a date format string +// @param format {string} A date format, as some permutation of +// "d", "m", and "y" +// @returns {string} The date format with any blanks filled with their most +// plausible value +tm.i.resolveFormat:{[format] + $[0=n:sum" "=format; + ; + 1=n; + ssr[;" ";first"ymd"except format]; + 2=n; + tm.i.dateFormats; + {"dmy"} + ]format + } + +// @private +// @kind dictionary +// @category nlpTimeUtility +// @desc The format to use, given a single known position +// @type dictionary +tm.i.dateFormats:(!). 
flip( + ("d ";"dmy"); // 10th 02 99 + ("m ";"mdy"); // Feb 10 99 + ("y ";"ymd"); // 1999 02 10 + (" d ";"mdy"); // 02 10th 99 + (" m ";"dmy"); // 10 Feb 99 + (" y ";"dym"); // 10 1999 02 This is never conventionally used + (" d";"ymd"); // 99 02 10th + (" m";"ydm"); // 99 10 Feb This is never conventionally used + (" y";"dmy")) // 10 02 1999 //mdy is the american option + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Turns a regex time string into a q timestamp +// i.e "131030" -> 13:10:30.000 +// "1pm" -> 13:00:00.000 +// @param text {string} A time string +// @returns {timestamp} The q time parsed from an +// appropriate string +tm.i.parseTime:{[text] + numText:vs[" ";text][0]in"1234567890:."; + time:"T"$text where numText; + amPM:regex.i.check[;text]each regex.objects`am`pm; + time+$[amPM[0]&12=`hh$time;-1;amPM[1]&12>`hh$time;1;0]*12:00 + } + +// @private +// @kind function +// @category nlpTimeUtility +// @desc Remove any null values +// @array {number[][]} Array of values +// returns {number[][]} Array with nulls removed +tm.i.rmNull:{[array] + array where not null array[;0] + } diff --git a/code/date_time.q b/code/date_time.q deleted file mode 100644 index 9bfa945..0000000 --- a/code/date_time.q +++ /dev/null @@ -1,65 +0,0 @@ -\d .nlp - -// Pad day string to 2 digits -tm.i.parseDay:{-2#"0",x where x in .Q.n} - -// Convert month string and pad to 2 digits -tm.i.months:`jan`feb`mar`apr`may`jun`jul`aug`sep`oct`nov`dec!`$string 1+til 12 -tm.i.parseMonth:{-2#"0",string x^tm.i.months x:lower`$3 sublist x} - -// Pad year string to 4 digits (>35 deemed 1900s) -tm.i.parseYear:{-4#$[35<"I"$-2#x;"19";"20"],x} - -// Convert year string to date range -tm.i.convY:{"D"$x,/:(".01.01";".12.31")} - -// Convert yearmonth string to date range -tm.i.convYM:{ - matches:ungroup([fmt:"ym"]txt:regex.matchAll[;x]each regex.objects`year`month); - matches:value select fmt,last txt by s from matches,'flip`txt`s`e!flip matches`txt; - fmt:{@[x;where not 
xx;except[;raze x where xx:1=count each x]]}/[matches`fmt]; - fmt:raze@[fmt;i where 1`hh$tm;1;0]*12:00} - - -// Find all times : list of 4-tuples (time; timeText; startIndex; 1+endIndex) -tm.findTimes:{time:(tm.i.parseTime each tmtxt[;0]),'tmtxt:regex.matchAll[regex.objects.time;x]; time where time[;0]<24:01} - -// Find all dates : list of 5-tuples (startDate; endDate; dateText; startIndex; 1+endIndex) -tm.findDates:{[text] - rmInv:{x where not null x[;0]}; - ym:regex.matchAll[regex.objects.yearmonth;text]; - ymd:regex.matchAll[regex.objects.yearmonthday;text]; - dts:rmInv(tm.i.convYMD each ymd[;0]),'ymd; - if[count dts;ym@:where not any ym[;1] within/: dts[; 3 4]]; - dts,:rmInv(tm.i.convYM each ym[;0]),'ym; - dts iasc dts[;3]} - diff --git a/code/email.q b/code/email.q index 490e4e9..79945d0 100644 --- a/code/email.q +++ b/code/email.q @@ -1,59 +1,224 @@ -\d .nlp +// code/email.q - Nlp email utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Utilities for handling emails -//Loading python script to extract rtf text -system"l ",.nlp.path,"/","code/extract_rtf.p"; -i.striprtf:.p.get[`striprtf;<] +\d .nlp -// Read mbox file, convert to table, parse metadata & content -email.getMboxText:{[fp]update text:.nlp.email.i.extractText each payload from email.i.parseMbox fp} +// @private +// @kind function +// @category nlpEmailUtility +// @desc Rich Text Format (RTF) parsing function imported from python +email.i.striprtf:.p.get[`striprtf;<] -email.i.findmime:{all(99=type each y`payload;x~/:y`contentType;0b~'y[`payload]@'`attachment)} -email.i.html2text:{email.i.bs[x;"html.parser"][`:get_text;"\\n"]`} / extract text from html -email.i.extractText:{ - / string is actual text, bytes attachment or non text mime type like inline image, dict look at content element - $[10=type x;x;4=type x;"";99=type x;.z.s x`content; - count i:where email.i.findmime["text/plain"]x;"\n\n"sv{x[y][`payload]`content}[x]each i; - / use beautiful soup to extract text from html - count i:where 
email.i.findmime["text/html"]x ;"\n\n"sv{email.i.html2text x[y][`payload]`content}[x]each i; - / use python script to extract text from rtf - count i:where email.i.findmime["application/rtf"]x ;"\n\n"sv{i.striprtf x[y][`payload]`content}[x]each i; - "\n\n"sv .z.s each x`payload]} +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract information from various message text types +// @params textTyp {string} The format of the message text +// @param msg {string|dictionary} An email message, or email subtree +// @returns {boolean} Whether or not msg fits the text type criteria +email.i.findMime:{[textTyp;msg] + msgDict:99=type each msg`payload; + contentTyp:textTyp~/:msg`contentType; + attachment:0b~'msg[`payload]@'`attachment; + all(msgDict;contentTyp;attachment) + } +// @private +// @kind function +// @category nlpEmailUtility +// @desc Use beautiful soup to extract text from a html file +// @param msg {string} The message payload +// @returns {string} The text from the html +email.i.html2text:{[msg] + email.i.bs[msg;"html.parser"][`:get_text;"\\n"]` + } -// Graph of who emailed whom, inc number of mails -email.getGraph:{[msgs] - 0!`volume xdesc select volume:count i by sender,to from flip`sender`to!flip`$raze email.i.getToFrom each msgs} +// @private +// @kind function +// @category nlpEmailUtility +// @desc Given an email, extract the text of the email +// @param msg {string|dictionary} An email message, or email subtree +// @returns {string} The text of the email, or email subtree +email.i.extractText:{[msg] + // String is actual text, bytes attachment or non text mime type like inline + // image, dict look at content element + msgType:type msg; + if[10=msgType;:msg]; + if[4=msgType;:""]; + if[99=msgType;:.z.s msg`content]; + findMime:email.i.findMime[;msg]; + text:$[count i:where findMime["text/plain"]; + {x[y][`payload]`content}[msg]each i; + count i:where findMime["text/html"]; + {email.i.html2text x[y][`payload]`content}[msg]each i; + 
count i:where findMime["application/rtf"]; + // Use python script to extract text from rtf + {email.i.striprtf x[y][`payload]`content}[msg]each i; + .z.s each msg`payload + ]; + "\n\n"sv text + } -// Get to/from pairs from an email +// @private +// @kind function +// @category nlpEmailUtility +// @desc Get all the to/from pairs from an email +// @param msg {dictionary} An email message, or subtree thereof +// @returns {any[]} To/from pairings of an email email.i.getToFrom:{[msg] - ((msg[`sender;0;1];)each msg[`to;;1]),$[98=type p:msg`payload;raze .z.s each p;()]} + payload:msg`payload; + payload:$[98=type payload;raze .z.s each payload;()]; + edges:(msg[`sender;0;1];)each msg[`to;;1]; + edges,payload + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the sender information from an email +// @param emails {<} The email as an embedPy object +// @returns {string[]} Sender name and email +email.i.getSender:{[emails] + fromInfo:raze emails[`:get_all;<]each("from";"resent-from"); + email.i.getAddr fromInfo where not(::)~'fromInfo + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the receiver information from an email +// @param emails {<} The email as an embedPy object +// @returns {string[]} Reciever name and email +email.i.getTo:{[emails] + toInfo:raze emails[`:get_all;<]each("to";"cc";"resent-to";"resent-cc"); + email.i.getAddr toInfo where not any(::;"")~/:\:toInfo + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the date information from an email +// @param emails {<} The email as an embedPy object +// @returns {timestamp} Date email was sent +email.i.getDate:{[emails] + dates:string 6#email.i.parseDate emails[@;`date]; + "P"$"D"sv".:"sv'3 cut{$[1=count x;"0";""],x}each dates + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the subject information from an email +// @param emails {<} The email as an embedPy object +// 
@returns {string} Subject of the email +email.i.getSubject:{[emails] + subject:emails[@;`subject]; + $[(::)~subject`; + ""; + email.i.makeHdr[email.i.decodeHdr subject][`:__str__][]` + ] + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the content type of an email +// @param emails {<} The email as an embedPy object +// @returns {string} Content type of an email +email.i.getContentType:{[emails] + emails[`:get_content_type][]` + } -// Init python and q functions for reading mbox files -email.i.parseMail:{email.i.parseMbox1 email.i.msgFromString[x]`.} -email.i.parseMbox:{email.i.parseMbox1 each .p.list[<] .p.import[`mailbox;`:mbox]x} -email.i.parseMbox1:{k!email.get.i[k:`sender`to`date`subject`contentType`payload]@\:.p.wrap x} +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract the payload information from an email +// @param emails {<} The email as an embedPy object +// @returns {dictionary|table} Dictionary of `attachment`content or a table +// of payloads +// Content is byte[] for binary data, char[] for text +email.i.getPayload:{[emails] + if[emails[`:is_multipart][]`; + :email.i.parseMbox1 each emails[`:get_payload][]` + ]; + // Raw bytes decoded from base64 encoding, wrapped embedPy + raw:emails[`:get_payload;`decode pykw 1]; + rtf:"application/rtf"~email.i.getContentType emails; + attachment:"attachment"~emails[`:get_content_disposition][]`; + payload:`attachment`content!(0b;raw`); + if[all(rtf;attachment);:payload]; + if[attachment; + payload,`attachment`filename!(1b;email[`:get_filename][]`); + ]; + content:email.i.getContentType emails; + if[not any content~/:("text/html";"text/plain";"message/rfc822");:payload]; + charset:emails[`:get_content_charset][]`; + content:i.str[raw;$[(::)~charset;"us-ascii";charset];"ignore"]`; + `attachment`content!(0b;content) + } +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract meta information from an email +// @params filepath 
{string} The path to the mbox +// @returns {dictionary} Meta information from the email +email.i.parseMbox:{[filepath] + mbox:email.i.mbox filepath; + email.i.parseMbox1 each .p.list[<] mbox + } + +// @private +// @kind function +// @category nlpEmailUtility +// @desc Extract meta information from an email +// @params mbox {<} Emails in mbox format +// @returns {dictionary} Meta information from the email +email.i.parseMbox1:{[mbox] + columns:`sender`to`date`subject`contentType`payload; + msgInfo:`getSender`getTo`getDate`getSubject`getContentType`getPayload; + columns!email.i[msgInfo]@\:.p.wrap mbox + } + +// Python imports email.i.bs:.p.import[`bs4]`:BeautifulSoup -email.i.getaddr:.p.import[`email.utils;`:getaddresses;<] -email.i.parsedate:.p.import[`email.utils;`:parsedate;<] -email.i.decodehdr:.p.import[`email.header;`:decode_header] -email.i.makehdr:.p.import[`email.header;`:make_header] +email.i.getAddr:.p.import[`email.utils;`:getaddresses;<] +email.i.parseDate:.p.import[`email.utils;`:parsedate;<] +email.i.decodeHdr:.p.import[`email.header;`:decode_header] +email.i.makeHdr:.p.import[`email.header;`:make_header] email.i.msgFromString:.p.import[`email]`:message_from_string +email.i.mbox:.p.import[`mailbox]`:mbox + + +// @kind function +// @category nlpEmail +// @desc Convert an mbox file to a table of parsed metadata +// @param filepath {string} The path to the mbox file +// @returns {table} Parsed metadata and content of the mbox file +email.loadEmails:{[filepath] + parseMbox:email.i.parseMbox filepath; + update text:.nlp.email.i.extractText each payload from parseMbox + } + +// @kind function +// @category nlpEmail +// @desc Get the graph of who emailed who, including the number of +// times they emailed +// @param emails {table} The result of .nlp.loadEmails +// @returns {table} Defines to-from pairings of emails +email.getGraph:{[emails] + getToFrom:flip`$raze email.i.getToFrom each emails; + getToFromTab:flip`sender`to!getToFrom; + 0!`volume xdesc select 
volume:count i by sender,to from getToFromTab + } -email.get.i.sender:{email.i.getaddr e where not(::)~'e:raze x[`:get_all;<]each("from";"resent-from")} -email.get.i.to:{email.i.getaddr e where not any(::;"")~/:\:e:raze x[`:get_all;<]each("to";"cc";"resent-to";"resent-cc")} -email.get.i.date:{"P"$"D"sv".:"sv'3 cut{$[1=count x;"0";""],x}each string 6#email.i.parsedate x[@;`date]} -email.get.i.subject:{$[(::)~(s:x[@;`subject])`;"";email.i.makehdr[email.i.decodehdr s][`:__str__][]`]} -email.get.i.contentType:{x[`:get_content_type][]`} -/ return a dict of `attachment`content or a table of payloads, content is byte[] for binary data, char[] for text -email.get.i.payload:{ - if[x[`:is_multipart][]`;:email.i.parseMbox1 each x[`:get_payload][]`]; - raw:x[`:get_payload;`decode pykw 1]; / raw bytes decoded from base64 encoding, wrapped embedPy - if[all("application/rtf"~(x[`:get_content_type][]`);"attachment"~x[`:get_content_disposition][]`);:`attachment`content!(0b;raw`)]; - if["attachment"~x[`:get_content_disposition][]`;:`attachment`content`filename!(1b;raw`;x[`:get_filename][]`)]; - /if text is in rtf, mbox treats it as an attachment - /if[all("application/rtf"~(x[`:get_content_type][]`);"attachment"~x[`:get_content_dispositon][]`);:`attachment`content!(0b;raw`)]; - / e.g. inline images, return raw bytes in payload - if[not any(ct:x[`:get_content_type][]`)~/:("text/html";"text/plain";"message/rfc822");:`attachment`content!(0b;raw`)]; - :`attachment`content!(0b;i.str[raw;$[(::)~s:x[`:get_content_charset][]`;"us-ascii";s];"ignore"]`) - } +// @kind function +// @category nlpEmailUtility +// @desc Extract meta information from an email +// @params filepath {string} The path to where the email is stored +// @returns {dictionary} Meta information from the email +email.parseMail:{[filepath] + email.i.parseMbox1 email.i.msgFromString[filepath]`. 
+ } diff --git a/code/extract_rtf.p b/code/extractRtf.p similarity index 99% rename from code/extract_rtf.p rename to code/extractRtf.p index e0af14a..d154de2 100644 --- a/code/extract_rtf.p +++ b/code/extractRtf.p @@ -5,7 +5,7 @@ import re def striprtf(text): pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) - # control words which specify a "destionation". + # control words which specify a "destination". destinations = frozenset(( 'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid', 'atnparent','atnref','atntime','atrfend','atrfstart','author','background', diff --git a/code/nlpCode.q b/code/nlpCode.q new file mode 100644 index 0000000..4806d5d --- /dev/null +++ b/code/nlpCode.q @@ -0,0 +1,437 @@ +// code/nlpCode.q - NLP code +// Copyright (c) 2021 Kx Systems Inc +// +// Main NLP code base + +\d .nlp + +// Date-Time + +// @kind function +// @category nlp +// @desc Find any times in a string +// @param text {string} A text, potentially containing many times +// @returns {any[]} A list of tuples for each time containing +// (q-time; timeText; startIndex; 1+endIndex) +findTimes:{[text] + timeText:regex.matchAll[regex.objects.time;text]; + parseTime:tm.i.parseTime each timeText[;0]; + time:parseTime,'timeText; + time where time[;0]<24:01 + } + +// @kind function +// @category nlp +// @desc Find all the dates in a document +// @param text {string} A text, potentially containing many dates +// @returns {any[]} A list of tuples for each time containing +// (startDate; endDate; dateText; startIndex; 1+endIndex) +findDates:{[text] + ym:regex.matchAll[regex.objects.yearMonth;text]; + ymd:regex.matchAll[regex.objects.yearMonthDay;text]; + convYMD:tm.i.convYearMonthDay each ymd[;0]; + dates:tm.i.rmNull convYMD,'ymd; + if[count dates;ym@:where not any ym[;1] within/: dates[; 3 4]]; + convYM:tm.i.convYearMonth each ym[;0]; + dates,:tm.i.rmNull convYM,'ym; + dates iasc dates[;3] + } + +// 
Parsing function + +// @kind function +// @category nlp +// @desc Parse URLs into dictionaries containing the +// constituent components +// @param url {string} The URL to decompose into its components +// @returns {dictionary} Contains information about the scheme, domain name +// and other URL information +parseURLs:{[url] + urlKeys:`scheme`domainName`path`parameters`query`fragment; + urlVals:parser.i.parseURLs url; + urlKeys!urlVals + } + +// @kind function +// @category nlp +// @desc Create a new parser +// @param spacyModel {symbol} The spaCy model/language to use. +// This must already be installed. +// @param fieldNames {symbol[]} The fields the parser should return +// @returns {fn} A function to parse text +newParser:{[spacyModel;fieldNames] + options:{distinct x,raze parser.i.depOpts x}/[fieldNames]; + disabled:`ner`tagger`parser except options; + model:parser.i.newSubParser[spacyModel;options;disabled]; + tokenAttrs:parser.i.q2spacy key[parser.i.q2spacy]inter options; + pyParser:parser.i.parseText[model;tokenAttrs;options;]; + stopWords:(`$.p.list[model`:Defaults.stop_words]`),`$"-PRON-"; + parser.i.runParser[pyParser;fieldNames;options;stopWords] + } + +// Sentiment + +// @kind function +// @category nlp +// @desc Calculate the sentiment of a sentence or short message, +// such as a tweet +// @param text {string} The text to score +// @returns {dictionary} The score split up into compound, positive, negative +// and neutral components +sentiment:{[text] + valences:sent.i.lexicon tokens:lower rawTokens:sent.i.tokenize text; + isUpperCase:(rawTokens=upper rawTokens)& rawTokens<>tokens; + upperIndices:where isUpperCase & not all isUpperCase; + valences[upperIndices]+:sent.i.ALLCAPS_INCR*signum valences upperIndices; + valences:sent.i.applyBoosters[tokens;isUpperCase;valences]; + valences:sent.i.negationCheck[tokens;valences]; + valences:sent.i.butCheck[tokens;valences]; + sent.i.scoreValence[0f^valences;text] + } + +// Comparing docs/terms + +// @kind 
function +// @category nlp +// @desc Calculates the affinity between terms in two corpora using +// an algorithm from Rayson, Paul and Roger Garside. +// "Comparing corpora using frequency profiling." +// Proceedings of the workshop on Comparing Corpora. Association for +// Computational Linguistics, 2000 +// @param parsedTab1 {table} A parsed document containing keywords and their +// associated significance scores +// @param parsedTab2 {table} A parsed document containing keywords and their +// associated significance scores +// @returns {dictionary[]} Two dictionaries of terms and their affinities, +// one for parsedTab1 and one for parsedTab2 +compareCorpora:{[parsedTab1;parsedTab2] + if[not min count each (parsedTab1;parsedTab2);:((`$())!();(`$())!())]; + termCountA:i.getTermCount parsedTab1; + termCountB:i.getTermCount parsedTab2; + totalWordCountA:sum termCountA; + totalWordCountB:sum termCountB; + // The expected termCount of each term in each corpus + coef:(termCountA+termCountB)%(totalWordCountA+totalWordCountB); + expectedA:totalWordCountA*coef; + expectedB:totalWordCountB*coef; + // Return the differences between the corpora + dict1:desc termCountA*log termCountA%expectedA; + dict2:desc termCountB*log termCountB%expectedB; + (dict1;dict2) + } + +// @kind function +// @category nlp +// @desc Calculates the cosine similarity of two documents +// @param keyword1 {dictionary} Keywords and their significance scores +// @param keyword2 {dictionary} Keywords and their significance scores +// @returns {float} The cosine similarity of two documents +compareDocs:{[keyword1;keyword2] + keywords:distinct raze key each(keyword1;keyword2); + cosineSimilarity .(keyword1;keyword2)@\:keywords + } + +// @kind function +// @category nlp +// @desc A function for comparing the similarity of two vectors +// @param keywords1 {dictionary} Keywords and their significance scores +// @param keywords2 {dictionary} Keywords and their significance scores +// @returns {float} Similarity score 
between -1f and 1f inclusive, 1 being +// perfectly similar, -1 being perfectly dissimilar +cosineSimilarity:{[keywords1;keywords2] + sqrtSum1:sqrt sum keywords1*keywords1; + sqrtSum2:sqrt sum keywords2*keywords2; + sum[keywords1*keywords2]%(sqrtSum1)*sqrtSum2 + } + +// @kind function +// @category nlp +// @desc Calculate how much each term contributes to the +// cosine similarity +// @param keywords1 {dictionary} Keywords and their significance scores +// @param keywords2 {dictionary} Keywords and their significance scores +// @returns {dictionary} A dictionary of how much of the similarity score each +// token is responsible for +explainSimilarity:{[keywords1;keywords2] + alignedKeys:inter[key keywords1;key keywords2]; + keywords1@:alignedKeys; + keywords2@:alignedKeys; + product:(keywords1%i.magnitude keywords1)*(keywords2%i.magnitude keywords2); + desc alignedKeys!product%sum product + } + +// @kind function +// @category nlp +// @desc Calculates the cosine similarity of a document and a centroid, +// subtracting the document from the centroid. +// This does the subtraction after aligning the keys so that terms not in +// the centroid don't get subtracted. 
+// This assumes that the centroid is the sum, not the avg, of the documents +// in the cluster +// @param centroid {dictionary} The sum of all the keywords significance scores +// @param keywords {dictionary} Keywords and their significance scores +// @returns {float} The cosine similarity of a document and centroid +compareDocToCentroid:{[centroid;keywords] + keywords@:alignedKeys:distinct key[centroid],key keywords; + vec:centroid[alignedKeys]-keywords; + cosineSimilarity[keywords;vec] + } + +// @kind function +// @category nlp +// @desc Find the cosine similarity between one document and all the +// other documents of the corpus +// @param keywords {dictionary} Keywords and their significance scores +// @param idx {number} The index of the feature vector to compare to the rest +// of the corpus +// @returns {float[]} The document's significance to the rest of the corpus +compareDocToCorpus:{[keywords;idx] + compareDocs[keywords idx]each(idx+1)_ keywords + } + +// @kind function +// @category nlp +// @desc Calculate the Jaro-Winkler distance of two strings, +// scored between 0 and 1 +// @param str1 {str|string[]} A string of text +// @param str2 {string|string[]} A string of text +// @returns {float} The Jaro-Winkler of two strings, between 0 and 1 +jaroWinkler:{[str1;str2] + str1:lower str1; + str2:lower str2; + jaroScore:i.jaro[str1;str2]; + jaroScore+$[0.70;term]#results + } + +// @kind function +// @category nlp +// @desc Find tokens that contain the term where each consecutive word +// has an above-average co-occurrence with the term +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param term {symbol} The term to extract phrases around +// @returns {dictionary} Phrases as the keys, and their relevance as the values +extractPhrases:{[parsedTab;term] + term:lower term; + tokens:parsedTab`tokens; + related:findRelatedTerms[parsedTab]term; + // This gets the top words that have an above 
average relevance to the + // query term + relevant:term,sublist[150]where 01)#phrases + } + +// @kind function +// @category nlp +// @desc Given an input which is conceptually a single document, +// such as a book, this will give better results than TF-IDF. +// This algorithm is explained in the paper Carpena, P., et al. +// "Level statistics of words: Finding keywords in literary texts +// and symbolic sequences." +// Physical Review E 79.3 (2009): 035102. +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {dictionary} Where the keys are keywords as symbols, and the values +// are their significance, as floats, with higher values being more +// significant +keywordsContinuous:{[parsedTab] + text:raze parsedTab[`tokens]@'where each not parsedTab`isStop; + groupTxt:group text; + n:count each groupTxt; + // Find the distinct words, ignoring stop words and those with 3 or fewer + // occurrences, or make up less than .002% of the corpus + words:where n>=4|.00002*count text; + // Find the distances between occurrences of the same word + // and use this to generate a 'sigma value' for each word + dist:deltas each words#groupTxt; + n:words#n; + sigma:(dev each dist)%(avg each dist)*sqrt 1-n%count text; + stdSigma:1%sqrt[n]*1+2.8*n xexp -0.865; + chevSigma:((2*n)-1)%2*n+1; + desc(sigma-chevSigma)%stdSigma + } + +// @kind function +// @category nlp +// @desc Find the TF-IDF scores for all terms in all documents +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {dictionary[]} For each document, a dictionary with the tokens as +// keys, and relevance as values +TFIDF:{[parsedTab] + nums:parsedTab[`tokens]like\:"[0-9]*"; + tokens:parsedTab[`tokens]@'where each not parsedTab[`isStop]|nums; + words:distinct each tokens; + // The term frequency of each token within the document + TF:{x!{sum[x in y]%count x}[y]each x}'[words;tokens]; + 
// Calculate the inverse document frequency + IDF:1+log count[tokens]%{sum{x in y}[y]each x}[tokens]each words; + TF*IDF + } + +// Exploratory Analysis + +// @kind function +// @category nlp +// @desc Find runs of tokens whose POS tags are in the set passed in +// @param tagType {symbol} `uniPOS or `pennPOS (Universal or Penn +// Part-of-Speech) +// @param tags {symbol|symbol[]} One or more POS tags +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {list} Two item list containing +// 1. The text of the run as a symbol vector +// 2. The index associated with the first token +findPOSRuns:{[tagType;tags;parsedTab] + matchingTag:parsedTab[tagType]in tags; + start:where 1=deltas matchingTag; + lengths:sum each start cut matchingTag; + idx:start+til each lengths; + runs:`$" "sv/:string each parsedTab[`tokens]start+til each lengths; + flip(runs;idx) + } + +// @kind function +// @category nlp +// @desc Determine the probability of one word following another +// in a sequence of words +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {dictionary} The probability that the secondary word in the +// sequence follows the primary word. 
+biGram:{[parsedTab] + nums:parsedTab[`tokens]like\:"[0-9]*"; + tokens:raze parsedTab[`tokens]@'where each not parsedTab[`isStop]|nums; + occurance:(distinct tokens)!{count where y=x}[tokens]each distinct tokens; + raze i.biGram[tokens;occurance]''[tokens;next tokens] + } + +// @kind function +// @category nlp +// @desc Determine the probability of a `n` tokens appearing together +// in a text +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @param n {long} The number of words to occur together +// @returns {dictionary} The probability of `n` tokens appearing together in +// a text +nGram:{[parsedTab;n] + nums:parsedTab[`tokens]like\:"[0-9]*"; + tokens:raze parsedTab[`tokens]@'where each not parsedTab[`isStop]|nums; + tab:rotate\:[til n]tokens; + nGroup:last[tab]group neg[n-1]_flip(n-1)#tab; + occurance:{(count each group x)%count x}each nGroup; + returnKeys:raze key[occurance],/:'{key x}each value occurance; + returnVals:raze value each value occurance; + returnKeys!returnVals + } + +// Util + +// @kind function +// @category nlp +// @desc Find Regular expressions within texts +// @param text {string[]} The text of a document +// @param expr {symbol} The expression type to be searched for within the text +findRegex:{[text;expr] + nExpr:$[1=count expr;enlist;]; + regexKeys:nExpr expr; + regexVals:nExpr{regex.matchAll[regex.objects[x];y]}[;text]each expr; + regexKeys!regexVals + } + +// @kind function +// @category nlp +// @desc Remove any non-ascii characters from a text +// @param text {string} A string of text +// @returns {string} Non-ascii characters removed from the text +removeNonAscii:{[text] + text where text within (0;127) + } + +// @kind function +// @category nlp +// @desc Remove certain characters from a string of text +// @param text {string} A string of text +// @param char {string[]} Characters or expressions to be removed from the text +// @returns {string} The text without anything 
that contains the defined +// characters +removeCustom:{[text;char] + vecText:" " vs text; + rtrim raze(vecText where{not(max ,'/)x like/:y}[;char]each vecText),'" " + } + +// @kind function +// @category nlp +// @desc Remove and replace certain characters from a string of text +// @param text {string} A string of text +// @param char {string[]} Characters or expressions to be removed from the text +// @param replace {string} The characters which will replace the removed +// characters +removeReplace:{[text;char;replace] + {x:ssr[x;y;z];x}[;;replace]/[text;char] + } + +// @kind function +// @category nlp +// @desc Detect language from text +// @param text {string} A string of text +// @returns {symbol} The language of the text +detectLang:{[text] + `$.p.import[`langdetect][`:detect;<][text] + } + +// @kind function +// @category nlp +// @desc Import all files in a directory recursively +// @param filepath {string} The directories file path +// @returns {table} Filenames, paths and texts contained within the filepath +loadTextFromDir:{[filepath] + path:{raze$[-11=type k:key fp:hsym x;fp;.z.s each` sv'fp,'k]}`$filepath; + ([]fileName:(` vs'path)[;1];path;text:"\n"sv'read0 each path) + } + +// @kind function +// @category nlp +// @desc Get all the sentences for a document +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {string[]} All the sentences from a document +getSentences:{[parsedTab] + (sublist[;parsedTab`text]deltas@)each parsedTab`sentChars + } + diff --git a/code/nlp_code.q b/code/nlp_code.q deleted file mode 100644 index 28f2e83..0000000 --- a/code/nlp_code.q +++ /dev/null @@ -1,163 +0,0 @@ -\d .nlp - -// Date-Time - -// Find all dates : list of 5-tuples (startDate; endDate; dateText; startIndex; 1+endIndex) -findDates:tm.findDates - -// Find all times : list of 4-tuples (time; timeText; startIndex; 1+endIndex) -findTimes:tm.findTimes - -// Email - -// Read mbox file, convert to 
table, parse metadata & content -email.loadEmails:loadEmails:email.getMboxText - -// Graph of who emailed whom, inc number of mails -email.getGraph:{[msgs] - 0!`volume xdesc select volume:count i by sender,to from flip`sender`to!flip`$raze email.i.getToFrom each msgs} - -email.parseMail:email.i.parseMail - -// Sentiment - -// Calculate sentiment of sentence of short message -sentiment:sent.score - -// Comparing docs/terms - -// Give 2 dicts of each term's affinity to each corpus -// Algorithm from Rayson, Paul, and Roger Garside. "Comparing corpora using frequency profiling." -// Proceedings of the workshop on Comparing Corpora. Association for Computational Linguistics, 2000 -compareCorpora:{[corp1;corp2] - if[(not count corp1)|(not count corp2);:((`$())!();(`$())!())]; - getTermCount:{[corp] - i.fastSum{1+log count each group x}each corp[`tokens]@'where each not corp`isStop}; - totalWordCountA:sum termCountA:getTermCount corp1; - totalWordCountB:sum termCountB:getTermCount corp2; - // The expected termCount of each term in each corpus - coef:(termCountA+termCountB)%(totalWordCountA+totalWordCountB); - expectedA:totalWordCountA*coef; - expectedB:totalWordCountB*coef; - // Return the differences between the corpora - (desc termCountA*log termCountA%expectedA;desc termCountB*log termCountB%expectedB)} - -// Calc cosine similarity of two docs -compareDocs:{cosineSimilarity .(x;y)@\:distinct raze key each(x;y)} - -// Compare similarity of 2 vectors -cosineSimilarity:{sum[x*y]%(sqrt sum x*x)*sqrt sum y*y} - -// How much each term contributes to the cosine similarity -explainSimilarity:{[doc1;doc2] - alignedKeys:inter[key doc1;key doc2]; - doc1@:alignedKeys; - doc2@:alignedKeys; - product:(doc2%i.magnitude doc1)*(doc2%i.magnitude doc2); - desc alignedKeys!product%sum product} - -// Cosine similarity of doc and centroid -compareDocToCentroid:{[centroid;doc] - doc@:alignedKeys:distinct key[centroid],key doc; - cosineSimilarity[doc;centroid[alignedKeys]-doc]} - -// Calc 
cosine similarity between doc and entire corpus -compareDocToCorpus:i.compareDocToCorpus - -// Jaro-Winkler distance between 2 strings -jaroWinkler:{i.jaroWinkler[lower x;lower y]} - -// Feature Vectors - -// Generate feature vector (of stemmed tokens) for a term -findRelatedTerms:{[docs;term] - sent:raze docs[`sentIndices]cut'@'[docs[`tokens];where each docs`isStop;:;`]; - sent@:asc distinct raze 0|-1 0 1+\:where(term:lower term)in/:sent; - ccur:` _ count each group raze distinct each sent; - tcur:idx@'group each docs[`tokens]@'idx:where each docs[`tokens]in\:key ccur; - tcur:i.fastSum((count distinct@)each)each docs[`sentIndices]bin'tcur; - ccur%:tcur term; - tcur%:sum count each docs`sentIndices; - desc except[where r>0;term]#r:(ccur-tcur)%sqrt tcur*1-tcur} - -// Find runs containing term where each word has above average co-ocurrance with term -extractPhrases:{[corpus;term] - relevant:term,sublist[150]where 01)#r:count each group r where term in/:r:raze tokens@'runs} - -// On a conceptually single doc (e.g. novel), gives better results than TF-IDF -// This algorithm is explained in the paper -// Carpena, P., et al. "Level statistics of words: Finding keywords in literary texts and symbolic sequences." -// Physical Review E 79.3 (2009): 035102. 
-keywordsContinuous:{[docs] - n:count each gt:group text:raze docs[`tokens]@'where each not docs`isStop; - words:where n>=4|.00002*count text; - dist:deltas each words#gt; - sigma:(dev each dist)%(avg each dist)*sqrt 1-(n:words#n)%count text; - std_sigma:1%sqrt[n]*1+2.8*n xexp -0.865; - chev_sigma:((2*n)-1)%2*n+1; - desc(sigma-chev_sigma)%std_sigma} - -// Find TFIDF scores for all terms in all documents -TFIDF:{[corpus] - tokens:corpus[`tokens]@'where each not corpus[`isStop]|corpus[`tokens]like\:"[0-9]*"; - tab:{x!{sum[x in y]%count x}[y]each x}'[words:distinct each tokens;tokens]; - tab*idf:1+log count[tokens]%{sum{x in y}[y]each x}[tokens]each words} - -TFIDF_tot:{[corpus]desc sum t%'sum each t:TFIDF corpus} - -// Parse Data - -// Create a new parser using a spaCy model (must already be installed) -newParser:parser.newParser - -// Parse urls to dictionaries -parseURLs:{`scheme`domainName`path`parameters`query`fragment!i.parseURLs x} - -// Exploratory Analysis - -// Find runs of tokens whose POS tags are in the set passed in -// Returns pair (text; firstIndex) -findPOSRuns:{[tagType;tags;doc] - start:where 1=deltas matchingTag:doc[tagType]in tags; - ii:start+til each lengths:sum each start cut matchingTag; - runs:`$" "sv/:string each doc[`tokens]start+til each lengths; - flip(runs;ii)} - -// Currently only for 2-gram -bi_gram:{[corpus] - tokens:raze corpus[`tokens]@'where each not corpus[`isStop]|corpus[`tokens]like\:"[0-9]*"; - occ:(distinct tokens)!{count where y=x}[tokens]each distinct tokens; - raze{[x;y;z;n](enlist(z;n))!enlist(count where n=x 1+where z=x)%y[z]}[tokens;occ]''[tokens;next tokens]} - -// Util - -// Find Regular expressions within texts -findRegex:{[text;expr]($[n;enlist;]expr)!$[n:1=count[expr];enlist;]{regex.matchAll[regex.objects[x];y]}[;text]each expr} - -// Remove any ascii characters from a text -ascii:{x where x within (0;127)} - -// Remove certain characters from a string of text -rmv_custom:{rtrim raze(l where{not(max ,'/)x 
like/:y}[;y]each l:" "vs x),'" "} - -// Remove and replace certain characters from a string of text -rmv_main:{{x:ssr[x;y;z];x}[;;z]/[x;y]} - -// Detect language from text -detectLang:{[text]`$.p.import[`langdetect][`:detect;<][text]} - -// Import all files in a dir recursively -loadTextFromDir:{[fp] - path:{[fp]raze$[-11=type k:key fp:hsym fp;fp;.z.s each` sv'fp,'k]}`$fp; - ([]fileName:(` vs'path)[;1];path;text:"\n"sv'read0 each path)} - -// Get all sentences for a doc -getSentences:i.getSentences - -// n-gram -ngram:{[corpus;n] - tokens:raze corpus[`tokens]@'where each not corpus[`isStop]|corpus[`tokens]like\:"[0-9]*"; - raze[key[b],/:'{key x}each value b]!raze value each value b:{(count each group x)%count x - }each last[tab]group neg[n-1]_flip(n-1)#tab:rotate\:[til n]tokens} diff --git a/code/parser.p b/code/parser.p new file mode 100644 index 0000000..ae6a1c7 --- /dev/null +++ b/code/parser.p @@ -0,0 +1,40 @@ +## Python spell check function +p)def spell(doc,model): + lst=[] + for s in doc: + if s._.hunspell_spell==False: + sug=s._.hunspell_suggest + if len(sug)>0: + ([lst.append(n)for n in model((sug)[0])]) + else: + lst.append(s) + else: + lst.append(s) + return lst + +## Python function for running spacy +p)def get_doc_info(parser,tokenAttrs,opts,text): + doc=doc1=parser(text) + if('spell' in opts): + doc1=spell(doc,parser) + res=[[getattr(w,a)for w in doc1]for a in tokenAttrs] + if('sentChars' in opts): # indices of first+last char per sentence + res.append([(s.start_char,s.end_char)for s in doc.sents]) + if('sentIndices' in opts): # index of first token per sentence + res.append([s.start for s in doc.sents]) + res.append([w.is_punct or w.is_bracket or w.is_space for w in doc]) + return res + +## Python functions to detect sentence borders +p)def x_sbd(doc): + if len(doc): + doc[0].is_sent_start=True + for i,token in enumerate(doc[:-1]): + doc[i+1].is_sent_start=token.text in ['。','?','!'] + return doc + +## Python functionality for the generation of a url 
parser +p)from urllib.parse import urlparse +p)import re +p)seReg=re.compile('([a-z0-9]+:)?//') + diff --git a/code/parser.q b/code/parser.q index 56456f9..20aacf8 100644 --- a/code/parser.q +++ b/code/parser.q @@ -1,39 +1,38 @@ -\d .nlp +// code/parser.q - Nlp parser utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Utilities for parsing -p)def spell(doc,model): - lst=[] - for s in doc: - if s._.hunspell_spell==False: - sug=s._.hunspell_suggest - if len(sug)>0: - ([lst.append(n)for n in model((sug)[0])]) - else:lst.append(s) - else: - lst.append(s) - return lst +\d .nlp -// Python functions for running spacy -p)def get_doc_info(parser,tokenAttrs,opts,text): - doc=doc1=parser(text) - if('spell' in opts): - doc1=spell(doc,parser) - res=[[getattr(w,a)for w in doc1]for a in tokenAttrs] - if('sentChars' in opts): # indices of first+last char per sentence - res.append([(s.start_char,s.end_char)for s in doc.sents]) - if('sentIndices' in opts): # index of first token per sentence - res.append([s.start for s in doc.sents]) - res.append([w.is_punct or w.is_bracket or w.is_space for w in doc]) - return res +// @private +// @kind function +// @category nlpParserUtility +// @desc Retrieve python function for running spacy parser.i.parseText:.p.get[`get_doc_info;<]; -parser.i.cleanUTF8:.p.import[`builtins;`:bytes.decode;<][;`errors pykw`ignore]$["x"]@; -p)def x_sbd(doc): - if len(doc): - doc[0].is_sent_start=True - for i,token in enumerate(doc[:-1]): - doc[i+1].is_sent_start=token.text in ['。','?','!'] - return doc -// Dependent options +// @private +// @kind function +// @category nlpParserUtility +// @desc Convert string input to an appropriate +// byte representation suitable for application in Python +// functions, this is particularly useful when dealing with +// languages other than English +// @param data {string} Any input string containing any character +// arrays +// @returns {string} The data parsed such that UTF-8 compliant +// characters can be appropriately 
managed by the NLP models +parser.i.cleanUTF8:{[data] + byteDecode:.p.import[`builtins;`:bytes.decode;<]; + // Convert data to bytes and decode to appropriate string + byteDecode["x"$data;`errors pykw`ignore] + } + +// @private +// @kind dictionary +// @category nlpParserUtility +// @desc Dependent options for input to spacy module +// @type dictionary parser.i.depOpts:(!). flip( (`keywords; `tokens`isStop); (`sentChars; `sentIndices); @@ -43,7 +42,11 @@ parser.i.depOpts:(!). flip( (`lemmas; `tagger); (`isStop; `lemmas)) -// Map from q-style attribute names to spacy +// @private +// @kind dictionary +// @category nlpParserUtility +// @desc Map from q-style attribute names to spacy +// @type dictionary parser.i.q2spacy:(!). flip( (`likeEmail; `like_email); (`likeNumber; `like_num); @@ -55,68 +58,145 @@ parser.i.q2spacy:(!). flip( (`pennPOS; `tag_); (`starts; `idx)) -// Model inputs for spacy 'alpha' models -parser.i.alphalang:(!). flip( +// @private +// @kind dictionary +// @category nlpParserUtility +// @desc Model inputs for spacy 'alpha' models +// @type dictionary +parser.i.alphaLang:(!). flip( (`ja;`Japanese); (`zh;`Chinese)) -// Create new parser -// Valid opts : text keywords likeEmail likeNumber likeURL isStop tokens lemmas uniPOS pennPOS starts sentChars sentIndices spell -parser.newParser:{[lang;opts] - opts:{distinct x,raze parser.i.depOpts x}/[colnames:opts]; - disabled:`ner`tagger`parser except opts; - model:parser.i.newSubParser[lang;opts;disabled]; - tokenAttrs:parser.i.q2spacy key[parser.i.q2spacy]inter opts; - pyParser:parser.i.parseText[model;tokenAttrs;opts;]; - stopwords:(`$.p.list[model`:Defaults.stop_words]`),`$"-PRON-"; - parser.i.runParser[pyParser;colnames;opts;stopwords]} +// @private +// @kind function +// @category nlpParser +// @desc Create a new parser +// @param modelName {symbol} The spaCy model/language to use. +// This must already be installed. 
+// @param options {symbol[]} The fields the parser should return +// @param disabled {symbol[]} The modules to be disabled +// @returns {fn} a parser for the given language +parser.i.newSubParser:{[modelName;options;disabled] + checkLang:parser.i.alphaLang modelName; + lang:$[`~checkLang;`spacy;sv[`]`spacy.lang,modelName]; + model:.p.import[lang][hsym$[`~checkLang;`load;checkLang]]; + model:model . raze[$[`~checkLang;modelName;()];`disable pykw disabled]; + if[`sbd in options; + pipe:$[`~checkLang;model[`:create_pipe;`sentencizer];.p.pyget`x_sbd]; + model[`:add_pipe]pipe; + ]; + if[`spell in options; + spacyTokens:.p.import[`spacy.tokens][`:Token]; + if[not spacyTokens[`:has_extension]["hunspell_spell"]`; + spHun:.p.import[`spacy_hunspell]`:spaCyHunSpell; + platform:`$.p.import[`platform][`:system][]`; + osSys:$[`Darwin~platform;`mac;lower platform]; + hunspell:spHun[model;osSys]; + model[`:add_pipe]hunspell + ] + ]; + model + } -// Returns a parser for the given language -parser.i.newSubParser:{[lang;opts;disabled] - chklng:parser.i.alphalang lang; - model:.p.import[$[`~chklng;`spacy;sv[`]`spacy.lang,lang]][hsym$[`~chklng;`load;chklng] - ]. 
raze[$[`~chklng;lang;()];`disable pykw disabled]; - if[`sbd in opts;model[`:add_pipe]$[`~chklng;model[`:create_pipe;`sentencizer];.p.pyget `x_sbd]]; - if[`spell in opts;if[not .p.import[`spacy.tokens][`:Token][`:has_extension]["hunspell_spell"]`; - sphun:.p.import[`spacy_hunspell]`:spaCyHunSpell;hunspell:sphun[model; - $[`Darwin~syst:`$.p.import[`platform][`:system][]`;`mac;lower syst]];model[`:add_pipe]hunspell]]; - model} +// @private +// @kind function +// @category nlpParserUtility +// @desc Parser operations that must be done in q, or give better +// performance in q +// @param pyParser {fn} A projection to call the spacy parser +// @param fieldNames {symbol[]} The field names the parser should return +// @param options {symbol[]} The fields to compute +// @param stopWords {symbol[]} The stopWords in the text +// @param docs {string|string[]} The text being parsed +// @returns {dictionary|table} The parsed document(s) +parser.i.runParser:{[pyParser;fieldNames;options;stopWords;docs] + tab:parser.i.cleanUTF8 each docs; + parsed:parser.i.unpack[pyParser;options;stopWords]each tab; + if[`keywords in options;parsed[`keywords]:TFIDF parsed]; + fieldNames:($[1=count fieldNames;enlist;]fieldNames) except `spell; + fieldNames#@[parsed;`text;:;tab] + } -// Operations that must be done in q, or give better performance in q -parser.i.runParser:{[pyParser;colnames;opts;stopwords;docs] - t:parser.i.cleanUTF8 each docs; - parsed:parser.i.unpack[pyParser;opts;stopwords]each t; - if[`keywords in opts;parsed[`keywords]:TFIDF parsed]; - (($[1=count colnames;enlist;]colnames) except `spell)#@[parsed;`text;:;t]} - -// Operations that must be done in q, or give better performance in q -parser.i.unpack:{[pyParser;opts;stopwords;text] - names:inter[key[parser.i.q2spacy],`sentChars`sentIndices;opts],`isPunct; +// @private +// @kind function +// @category nlpParserUtility +// @desc This handles operations such as casting/removing punctuation +// that need to be done in q, or for 
performance reasons are better in q +// @param pyParser {fn} A projection to call the spaCy parser +// @param options {symbol[]} The fields to include in the output +// @param stopWords {symbol[]} The stopWords in the text +// @param text {string} The text being parsed +// @returns {dictionary} The parsed document +parser.i.unpack:{[pyParser;options;stopWords;text] + names:inter[key[parser.i.q2spacy],`sentChars`sentIndices;options],`isPunct; doc:names!pyParser text; + // Cast any attributes which should be symbols doc:@[doc;names inter`tokens`lemmas`uniPOS`pennPOS;`$]; + // If there are entities, cast them to symbols if[`entities in names;doc:.[doc;(`entities;::;0 1);`$]] if[`isStop in names; - if[`uniPOS in names;doc[`isStop]|:doc[`uniPOS ]in i.stopUniPOS ]; + if[`uniPOS in names;doc[`isStop]|:doc[`uniPOS]in i.stopUniPOS]; if[`pennPOS in names;doc[`isStop]|:doc[`pennPOS]in i.stopPennPOS]; - if[`lemmas in names;doc[`isStop]|:doc[`lemmas ]in stopwords]; - ]; + if[`lemmas in names;doc[`isStop]|:doc[`lemmas]in stopWords]; + ]; doc:parser.i.removePunct parser.i.adjustIndices[text]doc; - if[`sentIndices in opts; + if[`sentIndices in options; doc[`sentIndices]@:unique:value last each group doc`sentIndices; - if[`sentChars in opts;doc[`sentChars]@:unique] - ]; - @[doc;`;:;::]} + if[`sentChars in options;doc[`sentChars]@:unique] + ]; + @[doc;`;:;::] + } -// Python indexes into strings by char instead of byte, so must be modified to index a q string +// @private +// @kind function +// @category nlpParserUtility +// @desc This converts python indices to q indices in the text +// This has to be done because python indexes into strings by char instead +// of byte, so must be modified to index a q string +// @param text {string} The text being parsed +// @param doc {dictionary} The parsed document +// @returns {dictionary} The document with corrected indices parser.i.adjustIndices:{[text;doc] - adj:cont-til count cont:where ($[1~count text;enlist;]text) within"\200\277"; - 
if[`starts in cols doc;doc[`starts ]+:adj binr 1+doc`starts ]; - if[`sentChars in cols doc;doc[`sentChars]+:adj binr 1+doc`sentChars]; - doc} + if[1~count text;text:enlist text]; + // Any bytes following the first byte in UTF-8 multi-byte characters + // will be in the range 128-191. These are continuation bytes. + continuations: where text within "\200\277"; + // To find a character's index in python, + // the number of previous continuation bytes must be subtracted + adjusted:continuations-til count continuations; + // Add to each index the number of continuation bytes which came before it + // This needs to add 1, as the string "“hello”" gives the + // adjusted values 1 1 7 7. + // If the python index is 1, 1 1 7 7 binr 1 gives back 0, so it needs to + // check the index after the python index + if[`starts in cols doc;doc[`starts]+:adjusted binr 1+doc`starts]; + if[`sentChars in cols doc;doc[`sentChars]+:adjusted binr 1+doc`sentChars]; + doc + } -// Removes punctuation and space tokens and updates indices +// @private +// @kind function +// @category nlpParserUtility +// @desc Removes punctuation and space tokens and updates indices +// @param doc {dictionary} The parsed document +// @returns {dictionary} The parsed document with punctuation removed parser.i.removePunct:{[doc] - doc:@[doc;key[parser.i.q2spacy]inter k:cols doc;@[;where not doc`isPunct]]; + // Extract document attributes + attrs:cols doc; + doc:@[doc;key[parser.i.q2spacy]inter attrs;@[;where not doc`isPunct]]; idx:sums 0,not doc`isPunct; - if[`sentIndices in k;doc:@[doc;`sentIndices;idx]]; - doc _`isPunct} + if[`sentIndices in attrs;doc:@[doc;`sentIndices;idx]]; + doc _`isPunct + } + +// @private +// @kind function +// @category nlpParserUtility +// @desc Parse a URL into its constituent components +// @param url {string} The URL to be decomposed into its components +// @returns {string[]} The components which make up the URL +parser.i.parseURLs:{[url] + pyLambda:"lambda url: urlparse(url if
seReg.match(url) ", + "else 'http://' + url)"; + .p.eval[pyLambda;<]url + } diff --git a/code/regex.q b/code/regex.q index fd287c3..e8469d2 100644 --- a/code/regex.q +++ b/code/regex.q @@ -1,32 +1,223 @@ +// code/regex.q - Nlp regex utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Utilities for regular expressions + \d .nlp -re:.p.import`re -regex.compile:{re[`:compile;x;$[y;re`:IGNORECASE;0]]} -regex.matchAll:.p.eval["lambda p,t:[[x.group(),x.start(),x.end()]for x in p.finditer(t)]";<] -regex.check:{i.bool[x[`:search]y]`} - -regex.patterns.specialChars: "[-[\\]{}()*+?.,\\\\^$|#\\s]" -regex.patterns.money: "[$¥€£¤฿]?\\s*((?]+(?:\\([\\w\\d]+\\)|([^[:punct:]\\s]|/))" -regex.patterns.zipCode: "\\b\\d{5}\\b" -regex.patterns.postalCode: "\\b[a-z]\\d[a-z] ?\\d[a-z]\\d\\b" -regex.patterns.postalOrZipCode: "\\b(\\d{5}|[a-z]\\d[a-z] ?\\d[a-z]\\d)\\b" -regex.patterns.dtsep: "[\\b(of |in )\\b\\t .,-/\\\\]+" -regex.patterns.day: "\\b[0-3]?[0-9](st|nd|rd|th)?\\b" -regex.patterns.month: "\\b([01]?[0-9]|jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)\\b" -regex.patterns.year: "\\b([12][0-9])?[0-9]{2}\\b" -regex.patterns.yearfull: "\\b[12][0-9]{3}\\b" -regex.patterns.am: "(a[.\\s]?m\\.?)" -regex.patterns.pm: "(p[.\\s]?m\\.?)" -regex.patterns.time12: "\\b[012]?[0-9]:[0-5][0-9](h|(:[0-5][0-9])([.:][0-9]{1,9})?)?\\s*(",sv["|";regex.patterns`am`pm],")?\\b" -regex.patterns.time24: "\\b[012][0-9][0-5][0-9]h\\b" -regex.patterns.time: "(",sv["|";regex.patterns`time12`time24],")" -regex.patterns.yearmonthList: "(",sv["|";regex.patterns`year`month ],")" -regex.patterns.yearmonthdayList:"(",sv["|";regex.patterns`year`month`day],")" -regex.patterns.yearmonth: "(",sv[regex.patterns.dtsep;2#enlist regex.patterns.yearmonthList ],")" -regex.patterns.yearmonthday: "(",sv[regex.patterns.dtsep;3#enlist regex.patterns.yearmonthdayList],")" - -regex.objects:regex.compile[;1b]each 1_regex.patterns +// @private +// @kind
function +// @category nlpRegexUtility +// @desc Import the regex module from python +regex.i.re:.p.import`re + +// @private +// @kind function +// @category nlpRegexUtility +// @desc Check if a pattern occurs in the text +// @params patterns {<} A regex pattern as an embedPy object +// @params text {string} A piece of text +// @returns {boolean} Indicate whether or not the pattern is present in the +// text +regex.i.check:{[patterns;text] + i.bool[patterns[`:search]text]` + } + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of special characters +// @type string +regex.i.patterns.specialChars:"[-[\\]{}()*+?.,\\\\^$|#\\s]" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of money characters +// @type string +regex.i.patterns.money:"[$¥€£¤฿]?\\s*((?]+(?:\\([\\w\\d]+\\)|([^[:punct:]\\s]|/))" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of zipcode characters +// @type string +regex.i.patterns.zipCode:"\\b\\d{5}\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of postal code characters +// @type string +regex.i.patterns.postalCode:"\\b[a-z]\\d[a-z] ?\\d[a-z]\\d\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of postal or zip code characters +// @type string +regex.i.patterns.postalOrZipCode:"\\b(\\d{5}|[a-z]\\d[a-z] ?\\d[a-z]\\d)\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of date separator characters +// @type string +regex.i.patterns.dateSeparate:"[\\b(of |in )\\b\\t .,-/\\\\]+" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of date characters +// @type string +regex.i.patterns.day:"\\b[0-3]?[0-9](st|nd|rd|th)?\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of monthly characters +// @type string
+regex.i.patterns.month:"\\b([01]?[0-9]|jan(uary)?|feb(ruary)?|mar(ch)?|", + "apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?", + "|dec(ember)?)\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of yearly characters +// @type string +regex.i.patterns.year:"\\b([12][0-9])?[0-9]{2}\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of year characters in full +// @type string +regex.i.patterns.yearFull:"\\b[12][0-9]{3}\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of am characters +// @type string +regex.i.patterns.am:"(a[.\\s]?m\\.?)" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of pm characters +// @type string +regex.i.patterns.pm:"(p[.\\s]?m\\.?)" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of time (12hr) characters +// @type string +regex.i.patterns.time12:"\\b[012]?[0-9]:[0-5][0-9](h|(:[0-5][0-9])([.:][0-9]", + "{1,9})?)?\\s*(",sv["|";regex.i.patterns`am`pm],")?\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of time (24hr) characters +// @type string +regex.i.patterns.time24:"\\b[012][0-9][0-5][0-9]h\\b" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of all time characters +// @type string +regex.i.patterns.time:"(",sv["|";regex.i.patterns`time12`time24],")" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of year/month characters as a list +// @type string +regex.i.patterns.yearMonthList:"(",sv["|";regex.i.patterns`year`month],")" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of year/month/date characters +// @type string +regex.i.patterns.yearMonthDayList:"(",sv["|"; + regex.i.patterns`year`month`day],")" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// 
@desc A string of year/month characters along with date separators +// @type string +regex.i.patterns.yearMonth:"(",sv[regex.i.patterns.dateSeparate; + 2#enlist regex.i.patterns.yearMonthList],")" + +// @private +// @kind data +// @category nlpRegexUtilityPattern +// @desc A string of year/month/date characters along with date +// separators +// @type string +regex.i.patterns.yearMonthDay:"(",sv[regex.i.patterns.dateSeparate; + 3#enlist regex.i.patterns.yearMonthDayList],")" + +// @kind function +// @category nlpRegex +// @desc Compile a regular expression pattern into a regular +// expression embedPy object which can be used for matching +// @params patterns {string} A regex pattern +// @params ignoreCase {boolean} Whether the case of the string is to be ignored +// @return {<} The compiled regex object +regex.compile:{[patterns;ignoreCase] + case:$[ignoreCase;regex.i.re`:IGNORECASE;0]; + regex.i.re[`:compile;patterns;case] + } + +// @kind function +// @category nlpRegex +// @desc Finds all the matches in a string of text +// @params patterns {<} A regex pattern as an embedPy object +// @params text {string} A piece of text +// @returns {::|string[]} If the pattern is not present in the text a null +// is returned. 
Otherwise, the pattern along with the index where the +// pattern begins and ends is returned +regex.matchAll:.p.eval["lambda p,t:[[x.group(),x.start(),x.end()]", + "for x in p.finditer(t)]";<] +// @kind function +// @category nlpRegex +// @desc Compile all patterns into regular expression objects +// @return {<} The compiled regex object +regex.objects:regex.compile[;1b]each 1_regex.i.patterns diff --git a/code/sent.q b/code/sent.q index 8e977a1..fc92ce8 100644 --- a/code/sent.q +++ b/code/sent.q @@ -1,98 +1,210 @@ +// code/sent.q - Nlp sentiment utilities +// Copyright (c) 2021 Kx Systems Inc +// +// Utilities for sentiment analysis + \d .nlp -// Create regex used for tokenizing +// @private +// @kind function +// @category nlpSentUtility +// @desc Create the regex pattern used for tokenization +// @returns {<} The compiled regex object sent.i.tokenPattern:{ - rightFacingEmoticons:"[<>]?[:;=8][\\-o\\*\\']?[\\)\\]\\(\\[dDpP/\\:\\}\\{@\\|\\\\]"; / n.b. Left-facing rarely used - miscEmoticons:"<3|[0o][._][0o]|]?[:;=8][\\-o\\*\\']?[\\)\\]\\(\\[dDpP/\\:\\}\\{@", + "\\|\\\\]"; / n.b.
Left-facing rarely used + miscEmoticons:"<3|[0o][._][0o]|0; + positive:sum 1+valences where valences>0; negative:sum -1+valences where valences<0; neutral:count where valences=0; // If punctuation affects the sentiment, apply emphasis to dominant sentiment @@ -119,16 +239,5 @@ sent.i.scoreValence:{[valences;text] if[positivetokens; - upperIndices:where isUpperCase & not all isUpperCase; - valences[upperIndices]+:sent.i.ALLCAPS_INCR*signum valences upperIndices; - valences:sent.i.applyBoosters[tokens;isUpperCase;valences]; - valences:sent.i.negationCheck[tokens;valences]; - valences:sent.i.butCheck[tokens;valences]; - sent.i.scoreValence[0f^valences;text]} - + `compound`pos`neg`neu!(compound,abs(positive;negative;neutral)%total) + } diff --git a/code/utils.q b/code/utils.q index 8f5f860..3c5412f 100644 --- a/code/utils.q +++ b/code/utils.q @@ -1,66 +1,201 @@ +// code/utils.q - NLP utilities +// Copyright (c) 2021 Kx Systems Inc +// +// General nlp utility functions + \d .nlp \l p.q -{.p.import[`sys;x][:;`:write;{x y;count y}y]}'[`:stdout`:stderr;1 2]; / redundant in latest embedPy + +// @private +// @kind function +// @category nlpUtility +// @desc Import python functions i.np:.p.import`numpy i.str:.p.import[`builtins]`:str i.bool:.p.import[`builtins]`:bool -// Fast sum list of dicts (3 experimentally determined optimal number iterations) -i.fastSum:{[it;d]sum$[it;.z.s it-1;sum]each(ceiling sqrt count d)cut d}2 - -// Replace empty dicts with (,`)!,0f -i.fillEmptyDocs:{[docs]$[98=type docs;0^docs;@[docs;i;:;count[i:where not count each docs]#enlist(1#`)!1#0f]]} - -// Given monotonic increasing int list, return runs of consecutive numbers -i.findRuns:{(where x<>1+prev x)_ x@:where r|:next r:x=1+prev x} - -// Get all sentences for doc -i.getSentences:{[doc](sublist[;doc`text]deltas@)each doc`sentChars} - -// Index of min element -i.minIndex:{x?min x} - -// Index of max element -i.maxIndex:{x?max x} - -// Calc harmonic mean -i.harmonicMean:{1%avg 1%x} - -// Calc a 
vector's magnitude -i.magnitude:{sqrt sum x*x} - -// Normalize list or dict so the highest value is 1f -i.normalize:{x%max x} - -// Take largest N values -i.takeTop:{[n;x]n sublist desc x} - -// Jaro distance of 2 strings -i.jaro:{[s1;s2] - if[0=l1:count s1;:0f]; - d:1|-1+floor .5*l1|l2:count s2; - k:l[0]+where each s1='sublist\:[flip l:deltas 0|til[l1]+/:(-1 1)*d]s2; - m:count i:$[1=count j:k[0;0]{x,(y except x)0}/1_k;where not null j:enlist[j];where not null j]; - t:.5*sum s1[i]<>s2 asc j i; - avg(m%l1;m%l2;(m-t)%m)} - -// Jaro-Winkler distance of 2 strings -i.jaroWinkler:{$[0.71+prev array)_ array@:inRun + } + +// @private +// @kind function +// @category nlpUtility +// @desc Index of the first occurrence of the minimum +// value of an array +// @param array {number[]} Array of values +// @return {number} The index of the minimum element of the array +i.minIndex:{[array] + array?min array + } + +// @private +// @kind function +// @category nlpUtility +// @desc Index of the first occurrence of the maximum +// value of the array +// @param array {number[]} Array of values +// @return {number} The index of the maximum element of the array +i.maxIndex:{[array] + array?max array + } + +// @private +// @kind function +// @category nlpUtility +// @desc Calculate the harmonic mean +// @param array {number[]} Array of values +// @returns {float} The harmonic mean of the input +i.harmonicMean:{[array] + 1%avg 1%array + } + +// @private +// @kind function +// @category nlpUtility +// @desc Calculate a vector's magnitude +// @param array {number[]} Array of values +// @returns {float} The magnitude of the vector +i.magnitude:{[array] + sqrt sum array*array + } + +// @private +// @kind function +// @category nlpUtility +// @desc Normalize a list or dictionary so the highest value is 1f +// @param vals {float[]|dictionary} A list or dictionary of numbers +// @returns {float[]|dictionary} The input, normalized +i.normalize:{[vals] + vals%max vals + } + +// @private +// @kind 
function +// @category nlpUtility +// @desc Takes the largest N values +// @param n {long} The number of elements to take +// @param vals {any[]} A list of values +// @returns {any[]} The largest N values +i.takeTop:{[n;vals] + n sublist desc vals + } + +// @private +// @kind function +// @category nlpUtility +// @desc Calculate the Jaro similarity score of two strings +// @param str1 {string|string[]} A string of text +// @param str2 {string|string[]} A string of text +// @returns {Float} The similarity score of two strings +i.jaro:{[str1;str2] + lenStr1:count str1; + lenStr2:count str2; + if[0=lenStr1;:0f]; + // The range to search for matching characters + range:1|-1+floor .5*lenStr1|lenStr2; + // The low end of each window + lowWin:deltas 0|til[lenStr1]+/:(-1 1)*range; + k:lowWin[0]+where each str1='sublist\:[flip lowWin]str2; + j:raze k[0;0]{x,(y except x)0}/1_k; + nonNull:where not null j; + n:count nonNull; + // Find the number of transpositions + trans:.5*sum str1[nonNull]<>str2 asc j nonNull; + avg(n%lenStr1;n%lenStr2;(n-trans)%n) + } + +// @private +// @kind function +// @category nlpUtility +// @desc Generating symmetric matrix from triangle (ragged list) +// This is used to save time when generating a matrix where the upper +// triangular component is the mirror of the lower triangular component +// @param raggedList {float[][]} A list of lists of floats representing +// an upper triangular matrix where the diagonal values are all 0. +// eg. 
(2 3 4f; 5 6f; 7f) for a 4x4 matrix +// @returns {float[][]} An n x n two dimensional array +// The input, mirrored across the diagonal, with all diagonal values being 1 +i.matrixFromRaggedList:{[raggedList] + // Pad the list with 0fs to make it an array,and set the diagonal values to + // .5 which become 1 when the matrix is added to its flipped value + matrix:((til count raggedList)#'0.),'.5,'raggedList; + matrix+flip matrix + } + +// @private +// @kind data +// @category nlpUtility +// @desc Parts-of-speech not useful as keywords +// @type symbol[] i.stopUniPOS:asc`ADP`PART`AUX`CONJ`DET`SYM`NUM`PRON`SCONJ -i.stopPennPOS:asc`CC`CD`DT`EX`IN`LS`MD`PDT`POS`PRP`SYM`TO`WDT`WP`WRB`,`$("PRP$";"WP$";"$") /add in ` for symbols - -// Parse urls -p)from urllib.parse import urlparse -p)import re -p)seReg=re.compile('([a-z0-9]+:)?//') -i.parseURLs:.p.eval["lambda url: urlparse(url if seReg.match(url) else 'http://' + url)";<] - -// Calc cosine similarity between doc and entire corpus -i.compareDocToCorpus:{[keywords;idx]compareDocs[keywords idx]each(idx+1)_ keywords} - - +i.stopPennPOS:asc`CC`CD`DT`EX`IN`LS`MD`PDT`POS`PRP`SYM`TO`WDT`WP`WRB`, + `$("PRP$";"WP$";"$") + +// @private +// @kind function +// @category nlpUtility +// @desc Get the count of individual terms in a corpus +// @param parsedTab {table} A parsed document containing keywords and their +// associated significance scores +// @returns {dictionary} The count of terms in the corpus +i.getTermCount:{[parsedTab] + tokens:parsedTab[`tokens]@'where each not parsedTab`isStop; + i.fastSum{1+log count each group x}each tokens + } + +// @kind function +// @category nlpUtility +// @desc Calculate the probability of words appearing in a text +// @param tokens {symbol[]} The tokens in the text +// @param occurance {dictionary} The total times a token appears in the text +// @param token {symbol} A single token +// @param nextToken {symbol} The next token in the list of tokens +// @returns {dictionary} The probability that 
the secondary word in the +// sequence follows the primary word. +i.biGram:{[tokens;occurance;token;nextToken] + returnKeys:enlist(token;nextToken); + countToken:count where nextToken=tokens 1+where token=tokens; + returnVals:countToken%occurance[token]; + returnKeys!enlist returnVals + } diff --git a/init.q b/init.q index 33eb05c..f605221 100644 --- a/init.q +++ b/init.q @@ -1,3 +1,6 @@ +// init.q - Load nlp libraries +// Copyright (c) 2021 Kx Systems Inc + path:{string`nlp^`$@[{"/"sv -1_"/"vs ssr[;"\\";"/"](-3#get .z.s)0};`;""]}` system"l ",path,"/","nlp.q" @@ -6,9 +9,11 @@ system"l ",path,"/","nlp.q" loadfile`:code/utils.q loadfile`:code/regex.q loadfile`:code/sent.q +loadfile`:code/parser.p loadfile`:code/parser.q -loadfile`:code/date_time.q +loadfile`:code/dateTime.q +loadfile`:code/extractRtf.p loadfile`:code/email.q loadfile`:code/cluster.q -loadfile`:code/nlp_code.q +loadfile`:code/nlpCode.q diff --git a/nlp.q b/nlp.q index aba5942..2560ff7 100644 --- a/nlp.q +++ b/nlp.q @@ -1,3 +1,8 @@ +// nlp.q - Setup for nlp namespace +// Copyright (c) 2021 Kx Systems Inc +// +// Define version, path, and loadfile + \d .nlp version:@[{NLPVERSION};0;`development] path:{string`nlp^`$@[{"/"sv -1_"/"vs ssr[;"\\";"/"](-3#get .z.s)0};`;""]}` diff --git a/tests/filelength.t b/tests/filelength.t new file mode 100644 index 0000000..1b0d821 --- /dev/null +++ b/tests/filelength.t @@ -0,0 +1,36 @@ +// Names of the folders containing q scripts whose line lengths are to be tested +folders:enlist"code" + +// Function for retrieval of all q files +getFiles:{ + files:key hsym `$x; + pathStem:x,"/"; + qfiles:files where files like "*.q"; + `$pathStem,/:string qfiles + } + +// List of all the q files within the appropriate folders +files:raze getFiles each folders + +// For an individual file test that the lines of the file do not exceed 80 characters unless +// exempt using '// noqa' at the end of the line +testLineLength:{[file] + fileContent:read0 hsym file; + excessCharacters:80
keywords[0; `transacting] enlist[(`u#`$())!()]~TFIDF([]tokens:enlist `$(); isStop:enlist `boolean$()); keywords:TFIDF enlist corpus 1; 98h~type keywords -keywords_tot:TFIDF_tot corpus -keywords_tot[`erv]~keywords_tot[`published] -keywords_tot[`mpr] > keywords_tot[`attached] p:newParser[`en;`keywords]; corpus:p text; 1f~compareDocs . corpus[`keywords]0 0 @@ -96,10 +93,10 @@ getSentences[first sentenceParser enlist"This is my sentence"]~enlist "This is m (getSentences first sentenceParser enlist "There's no train to Guysborough. Though I know there'll be one in time")~("There's no train to Guysborough."; "Though I know there'll be one in time") truncate:{[precision; x]coefficient: 10 xexp precision;reciprocal[coefficient]*`long$coefficient*x} /jaroWinkler -all(.961~truncate[3] i.jaroWinkler["martha";"marhta"];.840~truncate[3] i.jaroWinkler["dwayne"; "duane"];.813~truncate[3] i.jaroWinkler["dixon";"dicksonx"];.743~truncate[3] i.jaroWinkler["johnson"; "jannsen"];.562~truncate[3] i.jaroWinkler["johnson";"jannsenberg"];.906~truncate[3] i.jaroWinkler["aahahahahahahhaahah"; "ahahahahhahahahahaha"]) -all(0f~i.jaroWinkler["benjamin";enlist"z"];0f~i.jaroWinkler["benjamin";enlist"a"]) -all(0f~i.jaroWinkler["";enlist"a"];0f~i.jaroWinkler["ben";""]) -.75~i.jaroWinkler["abcd"; enlist "b"] +all(.961~truncate[3] jaroWinkler["martha";"marhta"];.840~truncate[3] jaroWinkler["dwayne"; "duane"];.813~truncate[3] jaroWinkler["dixon";"dicksonx"];.743~truncate[3] jaroWinkler["johnson"; "jannsen"];.562~truncate[3] jaroWinkler["johnson";"jannsenberg"];.906~truncate[3] jaroWinkler["aahahahahahahhaahah"; "ahahahahhahahahahaha"]) +all(0f~jaroWinkler["benjamin";enlist"z"];0f~jaroWinkler["benjamin";enlist"a"]) +all(0f~jaroWinkler["";enlist"a"];0f~jaroWinkler["ben";""]) +.75~jaroWinkler["abcd"; enlist "b"] p:newParser[`en; `tokens`isStop]; corpus:p text; (()!())~keywordsContinuous 0#corpus @@ -109,9 +106,9 @@ keywords:keywordsContinuous enlist doc; 99h ~ type keywords keywords:keywordsContinuous 
corpus; {x~desc x} keywords `chairman`chief`group`enron`thanks`mountains -(1 1f,(2%3),(1%3),0.5 0.5 0.5 0.5 0.5 0.5)~value 10#ngram[enlist first corpus;2] -1 1 .5 .5 1 1 1 1 1 1f~value 10#ngram[enlist first corpus;3] -((`enrononline`management`report);(`management`report`june);(`report`june`attached))~key 3#ngram[enlist first corpus;3] +(1 1f,(2%3),(1%3),0.5 0.5 0.5 0.5 0.5 0.5)~value 10#nGram[enlist first corpus;2] +1 1 .5 .5 1 1 1 1 1 1f~value 10#nGram[enlist first corpus;3] +((`enrononline`management`report);(`management`report`june);(`report`june`attached))~key 3#nGram[enlist first corpus;3] emails:email.loadEmails["tests/data/test.mbox"] `sender`to`date`subject`contentType`payload`text~cols emails (last emails`text)~"Your email client does not support HTML mails." @@ -124,16 +121,16 @@ parseURLs["https://www.google.ca:1234/test/index.html;myParam?foo=bar&quux=blort all(parseURLs["google.ca/test/index.html"][`scheme`domainName`path]~("http";"google.ca";"/test/index.html");parseURLs["www.google.co.uk"][`scheme`domainName`path]~("http";"www.google.co.uk";"")) parseURLs["https://网站.中国.com"]~`scheme`domainName`path`parameters`query`fragment!("https";"网站.中国.com";"";"";"";"") (parseURLs each ("https://travel.gc.ca/";"https://www.canada.ca/en/revenue-agency.html"))~([]scheme:("https"; "https");domainName:("travel.gc.ca"; "www.canada.ca");path:(enlist "/";"/en/revenue-agency.html");parameters: (""; "");query:(""; "");fragment:(""; "")) -seq:bi_gram[corpus] +seq:biGram[corpus] seq[`enrononline`management]~1f seq[`management`report]>seq[`report`june] `en~detectLang["This is a sentence"] `de~detectLang["Das ist ein Satz"] `fr~detectLang["C'est une phrase"] -ascii["This is ä senteñcê"]~"This is sentec" +removeNonAscii["This is ä senteñcê"]~"This is sentec" rmv_list :("http*";"*,";"*&*";"*[0-9]*") -rmv_custom["https//:google.com & https//:bing.com are 2 search engines!";rmv_list]~"are search engines!" 
-rmv_main["https//:google.com & https//:bing.com are 2 search engines!";",.:?!/@'\n";""]~"httpsgooglecom & httpsbingcom are 2 search engines" +removeCustom["https//:google.com & https//:bing.com are 2 search engines!";rmv_list]~"are search engines!" +removeReplace["https//:google.com & https//:bing.com are 2 search engines!";",.:?!/@'\n";""]~"httpsgooglecom & httpsbingcom are 2 search engines" loadDir:loadTextFromDir["tests/data/test.mbox"] `fileName`path`text~cols loadDir loadDir[`fileName]~enlist `test.mbox @@ -144,5 +141,5 @@ phonecall:corpus n:where corpus[`text] like "*Telephone Call*" remaining:corpus til[count corpus]except n (`message`murdock`erica`error`jerry;`enron`know`let,`meeting`company)~key each 5#/:compareCorpora[phonecall;remaining] txt:"You can call the number 123 456 7890 or email us on name@email.com in book an appoinment for January,February and March for £30.00" -findRegex[txt;`phoneNumber`emailAddress`yearmonthList`money]~`phoneNumber`emailAddress`yearmonthList`money!(enlist (" 123 456 7890";23;36);enlist("name@email.com";52;66);(("January";93;100);("February";101;109);("March";114;119);("30";125;127);("00";128;130));enlist("\302\24330.00";124;130)) +findRegex[txt;`phoneNumber`emailAddress`yearMonthList`money]~`phoneNumber`emailAddress`yearMonthList`money!(enlist (" 123 456 7890";23;36);enlist("name@email.com";52;66);(("January";93;100);("February";101;109);("March";114;119);("30";125;127);("00";128;130));enlist("\302\24330.00";124;130)) \d . diff --git a/tests/senttest.t b/tests/senttest.t index bb0d84d..bc9ffa2 100644 --- a/tests/senttest.t +++ b/tests/senttest.t @@ -7,19 +7,11 @@ sent.i.amplifyEP[enlist "!"]~.292 sent.i.amplifyQM[""]~0f sent.i.amplifyQM[enlist "?"]~0f 0 0 0.36 0.54 0.96 0.96~sent.i.amplifyQM each ("yes"; "oh?"; "oh? 
really?"; "you don't say???"; "forsooth????"; "????????????") -all (sent.i.findSequence[`a`b`c`d;enlist`c]~enlist 2;sent.i.findSequence[`c`b`c`d; enlist `c] ~ 0 2) -all (sent.i.findSequence[`a`b`c`d`e`f;`c`d]~enlist 2;sent.i.findSequence[`a`b`c`d`e`f`c`d; `c`d] ~ 2 6;sent.i.findSequence[`a`b`c`d`e`f`a`b`c`d`e`g`a`b`c`d; `a`b`c`d] ~ 0 6 12) -sent.i.findSequence[`a`b`c`d;`c]~enlist 2; -sent.i.findSequence[`$();`a`b`c]~`long$() -all(sent.i.findSequence[enlist`a;`a]~enlist 0;sent.i.findSequence[enlist`a;`b]~`long$()) -sent.i.findSequence[`a`b`c`d`e`a;`a]~0 5 -sent.i.findSequence[0 0 4 5 1 2 4 5;4 5]~2 6 -sent.i.findSequence["Facebook,Tim Cook";"oo"]~5 14 sent.i.butCheck[`$(); `float$()] ~ `float$() all(sent.i.butCheck[enlist `good; enlist 2f] ~ enlist 2f;sent.i.butCheck[enlist`but;enlist 0f]~enlist 0f) all(sent.i.butCheck[`that`was`good`but; 0 0 1 0f] ~ 0 0 .5 0f;sent.i.butCheck[`that`was`good`but`it; 0 0 1 0 0f] ~ 0 0 .5 0 0f;sent.i.butCheck[`but`it`was`ok; 0 0 0 1f] ~ 0 0 0 1.5f;sent.i.butCheck[`tasty`but`it`smelled`bad; 2 0 0 -1.5 -2f] ~ 1 0 0 -2.25 -3f) sent.i.butCheck[`it`was`good`and`useful`but`boring`and`gross;0 0 1 0 1.5 0 -1 0 -2]~0 0 .5 0 .75 0 -1.5 0 -3 -compare:{value (floor 1000* sent.score x) % 1000} +compare:{value (floor 1000* sentiment x) % 1000} all(compare[""]~0 0 0 0f;compare["\t\t\r\n\n"]~0 0 0 0f;compare["a b c 1"]~0 0 0 0f) all(compare["bad"]~-.543 0 1 0f;compare["racist"]~-.613 0 1 0f;compare["good"]~.44 1 0 0f;compare["free"] ~.51 1 0 0f;compare["those"]~0 0 0 1f;compare["123"]~0 0 0 1f) all(compare["ugly smile"]~-0.203 0.431 0.568 0;compare["free sadness"]~0.102 0.532 0.467 0) @@ -33,8 +25,8 @@ all(compare["Paul Anka is cool"]~0.318 0.433 0 0.566;compare["Paul Anka is cool, all(compare["Jethro Tull is dorkier"]~-0.274 0 0.411 0.588;compare["But Jethro Tull is dorkier"]~-0.392 0 0.398 0.601) all(compare["Paul Anka is a dork"]~-0.34 0 0.444 0.555;compare["Paul Anka isn't a dork"]~.258 .404 0 0.595) all(compare["Paul Anka is a nerd"]~-0.296 0 
0.423 0.576;compare["Paul Anka is kind of a nerd"]~-0.229 0 0.322 0.677) -all(sent.score["Paul Anka is the GREATEST"][`compound`pos])>sent.score["Paul Anka is thegreatest"][`compound`pos] -(sent.score["PAUL ANKA IS THE GREATEST"])~sent.score["Paul Anka is the greatest"] +all(sentiment["Paul Anka is the GREATEST"][`compound`pos])>sentiment["Paul Anka is thegreatest"][`compound`pos] +(sentiment["PAUL ANKA IS THE GREATEST"])~sentiment["Paul Anka is the greatest"] all(compare["中国 is beautiful"]~0.599 0.661 0 0.338;compare["Best φαλάφελ in Greece"]~0.636 0.583 0 0.416;compare["Paul Anka…king of the dorks"]~-0.129 0 0.23 0.769) compare["Paul Anka's singing is beautiful- especially Black Hole Sun"]~compare["Paul Anka's singing is beautiful especially Black Hole Sun"] \d .