From 987b219a5fefd317f88202820761d496a135ed52 Mon Sep 17 00:00:00 2001
From: kasramvd
Date: Sat, 19 May 2018 13:48:49 +0430
Subject: [PATCH 1/3] create biotrend module

---
 biotrend/biotrend.py | 220 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 220 insertions(+)
 create mode 100644 biotrend/biotrend.py

diff --git a/biotrend/biotrend.py b/biotrend/biotrend.py
new file mode 100644
index 0000000..91a9fad
--- /dev/null
+++ b/biotrend/biotrend.py
@@ -0,0 +1,220 @@
+#!/usr/bin/python
+
+######################################################################
+# tempSeg.py                                                         #
+# Author: Dario Ghersi                                               #
+# Version: 20150407                                                  #
+# Goal: implementation of the temporal segmentation of time          #
+#       series as described in:                                      #
+#       Siy, Chundi, Rosenkrantz, and Subramaniam                    #
+#       Journal of Software maintenance and evolution, 2007; 11      #
+# Usage: tempSeg.py TIME_SERIES_DIR ALPHA P                          #
+######################################################################
+
+import glob
+import sys
+
+######################################################################
+# CONSTANTS                                                          #
+######################################################################
+
+MIN_SIZE = 4 # the minimum size for a token
+
+######################################################################
+# FUNCTIONS                                                          #
+######################################################################
+
+def acquireTimeSeries(timeSeriesDir):
+    """
+    Store the time series as a list of dictionaries
+    """
+    ## set up the variables
+    allFiles = glob.glob(timeSeriesDir + "/*")
+    timeSeries = [{} for _ in range(numPoints)]
+    ## process each time point
+    for i, f in enumerate(allFiles):
+        with open(f) as timePointFile:
+            for line in timePointFile:
+                token, freq = line[:-1].split()
+                # include only tokens above a minimum length
+                if len(token) >= MIN_SIZE:
+                    timeSeries[i][token] = int(freq)
+    return timeSeries
+
+######################################################################
+
+def calcDecomp(T):
+    """
+    Calculate the optimal decomposition.
+    """
+
+    curr = len(T) - 1
+    decomp = []
+    numCl = len(T[0]) # number of segments
+    while curr > 0:
+        decomp.append(curr)
+        curr = T[curr][numCl - 1]
+        numCl -= 1
+
+    decomp.reverse()
+    return decomp
+
+######################################################################
+
+def calcNumPap(allAbstrFileName):
+    """
+    calculate the total number of papers per year
+    """
+
+    numPap = defaultdict(int)
+    with open(allAbstrFileName) as allAbstrFile:
+        for line in allAbstrFile:
+            fields = line[:-1].split()
+            try:
+                numPap[int(fields[0])] += 1
+            except ValueError:
+                # invalid literal for int() with base 10
+                pass
+    numPapSorted = [numPap[year] for year in sorted(numPap.keys())]
+    return numPapSorted
+
+######################################################################
+
+def computeSegDiff(timeSeries, numPap, alpha):
+    """
+    compute the loss for combining two or more segments together
+    """
+
+    numSets = len(timeSeries)
+    totSize = numSets * (numSets - 1) / 2
+    segDiff = [[0 for x in range(totSize)] for x in range(totSize)]
+    ## extract the significant items for each item set
+    signifItems = [getSignifItems2(timeSeries[i:i+1],
+                                   numPap,
+                                   alpha, i, i) for i in range(numSets)]
+    ## process each possible segment
+    for i in range(numSets - 1):
+        for j in range(i + 1, numSets):
+            # extract the significant items
+            #segmentItems = getSignifItems(timeSeries[i:j + 1], alpha)
+            segmentItems = getSignifItems2(timeSeries[i:j + 1], numPap,
+                                           alpha, i, j)
+            segDiff[i][j] = fracDiff(signifItems, segmentItems, i, j)
+
+    return segDiff, signifItems
+
+######################################################################
+
+def fracDiff(signifItems, segmentItems, i, j):
+    """
+    compute the cumulative fractional difference between item sets in
+    a segment i <= x <= j
+    """
+
+    segmentItems = set(segmentItems)
+    fracDiff = sum(len(segmentItems.symmetric_difference(
+                       set(signifItems[x]))
+                   ) for x in range(i, j+1))
+
+    return fracDiff
+
+######################################################################
+
+def getSignifItems(segment, alpha):
+    """
+    extract the items in the segment whose relative
+    frequency is above alpha
+    """
+    totals = sum(Counter(timePoint) for timePoint in segment)
+    ## extract the items whose relative frequency is >= alpha
+    total = float(sum(totals.values()))
+    signifItems = [
+        item for item, value in totals.items()
+        if value / total >= alpha
+    ]
+    return signifItems
+
+######################################################################
+
+def getSignifItems2(segment, numPap, alpha, i, j):
+    """
+    extract the items in the segment whose relative
+    frequency is above alpha
+    """
+
+    signifItems = []
+    avgs = defaultdict(int)
+    weights = numPap[i:j+1]
+    numSeg = j - i + 1
+    for k in range(numSeg):
+        timePoint = segment[k]
+        for item in timePoint:
+            avgs[item] += timePoint[item]
+
+    ## extract the items whose relative frequency is >= alpha
+    total = float(sum(weights))
+    signifItems = [
+        item for item, value in avgs.items()
+        if value / total >= alpha
+    ]
+    #print signifItems, i, j
+    #sys.exit(1)
+    return signifItems
+
+######################################################################
+
+def optimalSeg(segDiff, n, p):
+    """
+    apply dynamic programming to optimally segment the time series
+    """
+
+    ## initialize the R and T arrays, which will contain the loss values
+    ## and the segmentation path, respectively
+    R = [[0 for x in range(p)] for x in range(n)]
+    T = [[0 for x in range(p)] for x in range(n)]
+
+    for j in range(n):
+        R[j][0] = segDiff[0][j]
+        kmax = min(j + 1, p)
+
+        for k in range(1, kmax):
+            R[j][k] = R[k - 1][k - 1] + segDiff[k][j]
+            T[j][k] = k - 1
+
+            for z in range(k - 1, j):
+                if (R[z][k - 1] + segDiff[z + 1][j]) < R[j][k]:
+                    R[j][k] = R[z][k - 1] + segDiff[z + 1][j]
+                    T[j][k] = z
+
+    return R, T
+
+######################################################################
+# MAIN PROGRAM                                                       #
+######################################################################
+
+if __name__ == "__main__":
+    ## parse the parameters
+    if len(sys.argv) != 5:
+        print "Usage: tempSeg.py TIME_SERIES_DIR ALL_ABSTR ALPHA P"
+        sys.exit(1)
+
+    timeSeriesDir, allAbstrFileName, alpha, p = sys.argv[1:]
+    alpha = float(alpha)
+    p = int(p)
+    ## calculate the number of papers for each year
+    numPap = calcNumPap(allAbstrFileName)
+    ## acquire the time series
+    timeSeries = acquireTimeSeries(timeSeriesDir)
+    ## compute the segment difference for each possible segment and
+    ## the significant items
+    segDiff, signifItems = computeSegDiff(timeSeries, numPap, alpha)
+    ## run the optimal segmentation algorithm
+    R, T = optimalSeg(segDiff, len(timeSeries), p)
+    for i in range(len(T)):
+        print i, T[i]
+    ## calculate the decomposition
+    decomp = calcDecomp(T)
+    print decomp
+    #print signifItems[10]
+    #print "***"
+    #print signifItems[1]
\ No newline at end of file

From a6627397dfdb2a126a176e8e51abf44213e7e1df Mon Sep 17 00:00:00 2001
From: kasramvd
Date: Mon, 21 May 2018 11:13:12 +0430
Subject: [PATCH 2/3] add config file

---
 biotrend/config.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 biotrend/config.py

diff --git a/biotrend/config.py b/biotrend/config.py
new file mode 100644
index 0000000..9a74927
--- /dev/null
+++ b/biotrend/config.py
@@ -0,0 +1 @@
+MIN_SIZE = 4 # the minimum size for a token
\ No newline at end of file

From f523ab8b1a0e0411a6a0f0553a86d2676d702cd1 Mon Sep 17 00:00:00 2001
From: kasramvd
Date: Mon, 21 May 2018 11:16:37 +0430
Subject: [PATCH 3/3] Wrap all the functions in a class, syntax improvements and import new modules

---
 biotrend/biotrend.py | 388 ++++++++++++++++++++-----------------------
 1 file changed, 182 insertions(+), 206 deletions(-)

diff --git a/biotrend/biotrend.py b/biotrend/biotrend.py
index 91a9fad..aabada1 100644
--- a/biotrend/biotrend.py
+++ b/biotrend/biotrend.py
@@ -1,6 +1,5 @@
 #!/usr/bin/python
 
-######################################################################
 # tempSeg.py                                                         #
 # Author: Dario Ghersi                                               #
 # Version: 20150407                                                  #
 # Goal: implementation of the temporal segmentation of time          #
@@ -8,213 +7,190 @@
 #       series as described in:                                      #
 #       Siy, Chundi, Rosenkrantz, and Subramaniam                    #
 #       Journal of Software maintenance and evolution, 2007; 11      #
-# Usage: tempSeg.py TIME_SERIES_DIR ALPHA P                          #
-######################################################################
+# Usage: tempSeg.py TIME_SERIES_DIR ALPHA P                          #
+from collections import defaultdict, Counter
+from itertools import chain
+from config import MIN_SIZE
 import glob
 import sys
 
-######################################################################
-# CONSTANTS                                                          #
-######################################################################
-
-MIN_SIZE = 4 # the minimum size for a token
-
-######################################################################
-# FUNCTIONS                                                          #
-######################################################################
-
-def acquireTimeSeries(timeSeriesDir):
-    """
-    Store the time series as a list of dictionaries
-    """
-    ## set up the variables
-    allFiles = glob.glob(timeSeriesDir + "/*")
-    timeSeries = [{} for _ in range(numPoints)]
-    ## process each time point
-    for i, f in enumerate(allFiles):
-        with open(f) as timePointFile:
-            for line in timePointFile:
-                token, freq = line[:-1].split()
-                # include only tokens above a minimum length
-                if len(token) >= MIN_SIZE:
-                    timeSeries[i][token] = int(freq)
-    return timeSeries
-
-######################################################################
-
-def calcDecomp(T):
-    """
-    Calculate the optimal decomposition.
-    """
-
-    curr = len(T) - 1
-    decomp = []
-    numCl = len(T[0]) # number of segments
-    while curr > 0:
-        decomp.append(curr)
-        curr = T[curr][numCl - 1]
-        numCl -= 1
-
-    decomp.reverse()
-    return decomp
-
-######################################################################
-
-def calcNumPap(allAbstrFileName):
-    """
-    calculate the total number of papers per year
-    """
-
-    numPap = defaultdict(int)
-    with open(allAbstrFileName) as allAbstrFile:
-        for line in allAbstrFile:
-            fields = line[:-1].split()
-            try:
-                numPap[int(fields[0])] += 1
-            except ValueError:
-                # invalid literal for int() with base 10
-                pass
-    numPapSorted = [numPap[year] for year in sorted(numPap.keys())]
-    return numPapSorted
-
-######################################################################
-
-def computeSegDiff(timeSeries, numPap, alpha):
-    """
-    compute the loss for combining two or more segments together
-    """
-
-    numSets = len(timeSeries)
-    totSize = numSets * (numSets - 1) / 2
-    segDiff = [[0 for x in range(totSize)] for x in range(totSize)]
-    ## extract the significant items for each item set
-    signifItems = [getSignifItems2(timeSeries[i:i+1],
-                                   numPap,
-                                   alpha, i, i) for i in range(numSets)]
-    ## process each possible segment
-    for i in range(numSets - 1):
-        for j in range(i + 1, numSets):
-            # extract the significant items
-            #segmentItems = getSignifItems(timeSeries[i:j + 1], alpha)
-            segmentItems = getSignifItems2(timeSeries[i:j + 1], numPap,
-                                           alpha, i, j)
-            segDiff[i][j] = fracDiff(signifItems, segmentItems, i, j)
-
-    return segDiff, signifItems
-
-######################################################################
-
-def fracDiff(signifItems, segmentItems, i, j):
-    """
-    compute the cumulative fractional difference between item sets in
-    a segment i <= x <= j
-    """
-
-    segmentItems = set(segmentItems)
-    fracDiff = sum(len(segmentItems.symmetric_difference(
-                       set(signifItems[x]))
-                   ) for x in range(i, j+1))
-
-    return fracDiff
-
-######################################################################
-
-def getSignifItems(segment, alpha):
-    """
-    extract the items in the segment whose relative
-    frequency is above alpha
-    """
-    totals = sum(Counter(timePoint) for timePoint in segment)
-    ## extract the items whose relative frequency is >= alpha
-    total = float(sum(totals.values()))
-    signifItems = [
-        item for item, value in totals.items()
-        if value / total >= alpha
-    ]
-    return signifItems
-
-######################################################################
-
-def getSignifItems2(segment, numPap, alpha, i, j):
-    """
-    extract the items in the segment whose relative
-    frequency is above alpha
-    """
-
-    signifItems = []
-    avgs = defaultdict(int)
-    weights = numPap[i:j+1]
-    numSeg = j - i + 1
-    for k in range(numSeg):
-        timePoint = segment[k]
-        for item in timePoint:
-            avgs[item] += timePoint[item]
-
-    ## extract the items whose relative frequency is >= alpha
-    total = float(sum(weights))
-    signifItems = [
-        item for item, value in avgs.items()
-        if value / total >= alpha
-    ]
-    #print signifItems, i, j
-    #sys.exit(1)
-    return signifItems
-
-######################################################################
-
-def optimalSeg(segDiff, n, p):
-    """
-    apply dynamic programming to optimally segment the time series
-    """
-
-    ## initialize the R and T arrays, which will contain the loss values
-    ## and the segmentation path, respectively
-    R = [[0 for x in range(p)] for x in range(n)]
-    T = [[0 for x in range(p)] for x in range(n)]
-
-    for j in range(n):
-        R[j][0] = segDiff[0][j]
-        kmax = min(j + 1, p)
-
-        for k in range(1, kmax):
-            R[j][k] = R[k - 1][k - 1] + segDiff[k][j]
-            T[j][k] = k - 1
-
-            for z in range(k - 1, j):
-                if (R[z][k - 1] + segDiff[z + 1][j]) < R[j][k]:
-                    R[j][k] = R[z][k - 1] + segDiff[z + 1][j]
-                    T[j][k] = z
-
-    return R, T
+
+class BioTrend:
+    def __init__(self, *args, **kwargs):
+        self.time_series_dir = kwargs["time_series_dir"]
+        self.all_abstr_file_name = kwargs["all_abstr_file_name"]
+        self.alpha = kwargs["alpha"]
+        self.p = kwargs["p"]
+
+    def acquireTimeSeries(self):
+        """
+        Store the time series as a list of dictionaries
+        """
+        ## set up the variables
+        allFiles = glob.glob(self.time_series_dir + "/*")
+        timeSeries = [{} for _ in range(len(allFiles))]
+        ## process each time point
+        for i, f in enumerate(allFiles):
+            with open(f) as timePointFile:
+                for line in timePointFile:
+                    token, freq = line[:-1].split()
+                    # include only tokens above a minimum length
+                    if len(token) >= MIN_SIZE:
+                        timeSeries[i][token] = int(freq)
+        return timeSeries
+
+    def calcDecomp(self, T):
+        """
+        Calculate the optimal decomposition.
+        """
+
+        curr = len(T) - 1
+        decomp = []
+        numCl = len(T[0]) # number of segments
+        while curr > 0:
+            decomp.append(curr)
+            curr = T[curr][numCl - 1]
+            numCl -= 1
+
+        decomp.reverse()
+        return decomp
+
+
+    def calcNumPap(self):
+        """
+        calculate the total number of papers per year
+        """
+
+        numPap = defaultdict(int)
+        with open(self.all_abstr_file_name) as allAbstrFile:
+            for line in allAbstrFile:
+                fields = line[:-1].split()
+                try:
+                    numPap[int(fields[0])] += 1
+                except ValueError:
+                    # invalid literal for int() with base 10
+                    pass
+        numPapSorted = [numPap[year] for year in sorted(numPap.keys())]
+        return numPapSorted
+
+
+    def computeSegDiff(self, timeSeries, numPap):
+        """
+        compute the loss for combining two or more segments together
+        """
+
+        numSets = len(timeSeries)
+        totSize = numSets * (numSets - 1) / 2
+        segDiff = [[0 for x in range(totSize)] for x in range(totSize)]
+        ## extract the significant items for each item set
+        signifItems = [self.getSignifItems2(timeSeries[i:i+1],
+                                            numPap,
+                                            i, i) for i in range(numSets)]
+        ## process each possible segment
+        for i in range(numSets - 1):
+            for j in range(i + 1, numSets):
+                # extract the significant items
+                #segmentItems = self.getSignifItems(timeSeries[i:j + 1])
+                segmentItems = self.getSignifItems2(timeSeries[i:j + 1], numPap,
+                                                    i, j)
+                segDiff[i][j] = self.fracDiff(signifItems, segmentItems, i, j)
+
+        return segDiff, signifItems
+
+
+    def fracDiff(self, signifItems, segmentItems, i, j):
+        """
+        compute the cumulative fractional difference between item sets in
+        a segment i <= x <= j
+        """
+
+        segmentItems = set(segmentItems)
+        fracDiff = sum(len(segmentItems.symmetric_difference(
+                           set(signifItems[x]))
+                       ) for x in range(i, j+1))
+
+        return fracDiff
+
+
+    def getSignifItems(self, segment):
+        """
+        extract the items in the segment whose relative
+        frequency is above self.alpha
+        """
+        totals = sum((Counter(timePoint) for timePoint in segment), Counter())
+        ## extract the items whose relative frequency is >= self.alpha
+        total = float(sum(totals.values()))
+        signifItems = [
+            item for item, value in totals.items()
+            if value / total >= self.alpha
+        ]
+        return signifItems
+
+
+    def getSignifItems2(self, segment, numPap, i, j):
+        """
+        extract the items in the segment whose relative
+        frequency is above self.alpha
+        """
+
+        signifItems = []
+        avgs = defaultdict(int)
+        weights = numPap[i:j+1]
+        numSeg = j - i + 1
+        for k in range(numSeg):
+            timePoint = segment[k]
+            for item in timePoint:
+                avgs[item] += timePoint[item]
+
+        ## extract the items whose relative frequency is >= self.alpha
+        total = float(sum(weights))
+        signifItems = [
+            item for item, value in avgs.items()
+            if value / total >= self.alpha
+        ]
+        #print signifItems, i, j
+        #sys.exit(1)
+        return signifItems
+
+
+    def optimalSeg(self, segDiff, n):
+        """
+        apply dynamic programming to optimally segment the time series
+        """
+
+        ## initialize the R and T arrays, which will contain the loss values
+        ## and the segmentation path, respectively
+        R = [[0 for x in range(self.p)] for x in range(n)]
+        T = [[0 for x in range(self.p)] for x in range(n)]
+
+        for j in range(n):
+            R[j][0] = segDiff[0][j]
+            kmax = min(j + 1, self.p)
+
+            for k in range(1, kmax):
+                R[j][k] = R[k - 1][k - 1] + segDiff[k][j]
+                T[j][k] = k - 1
+
+                for z in range(k - 1, j):
+                    if (R[z][k - 1] + segDiff[z + 1][j]) < R[j][k]:
+                        R[j][k] = R[z][k - 1] + segDiff[z + 1][j]
+                        T[j][k] = z
+
+        return R, T
 
-######################################################################
-# MAIN PROGRAM                                                       #
-######################################################################
-
-if __name__ == "__main__":
-    ## parse the parameters
-    if len(sys.argv) != 5:
-        print "Usage: tempSeg.py TIME_SERIES_DIR ALL_ABSTR ALPHA P"
-        sys.exit(1)
-
-    timeSeriesDir, allAbstrFileName, alpha, p = sys.argv[1:]
-    alpha = float(alpha)
-    p = int(p)
-    ## calculate the number of papers for each year
-    numPap = calcNumPap(allAbstrFileName)
-    ## acquire the time series
-    timeSeries = acquireTimeSeries(timeSeriesDir)
-    ## compute the segment difference for each possible segment and
-    ## the significant items
-    segDiff, signifItems = computeSegDiff(timeSeries, numPap, alpha)
-    ## run the optimal segmentation algorithm
-    R, T = optimalSeg(segDiff, len(timeSeries), p)
-    for i in range(len(T)):
-        print i, T[i]
-    ## calculate the decomposition
-    decomp = calcDecomp(T)
-    print decomp
-    #print signifItems[10]
-    #print "***"
-    #print signifItems[1]
\ No newline at end of file
+    def run(self):
+        ## calculate the number of papers for each year
+        numPap = self.calcNumPap()
+        ## acquire the time series
+        timeSeries = self.acquireTimeSeries()
+        ## compute the segment difference for each possible segment and
+        ## the significant items
+        segDiff, signifItems = self.computeSegDiff(timeSeries, numPap)
+        ## run the optimal segmentation algorithm
+        R, T = self.optimalSeg(segDiff, len(timeSeries))
+        for i in range(len(T)):
+            print i, T[i]
+        ## calculate the decomposition
+        decomp = self.calcDecomp(T)
+        return decomp
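
A minimal driver sketch (not part of the patches above): patch 3 removes the __main__ block
from patch 1, so nothing in this series actually instantiates BioTrend. The snippet below is
one hypothetical way to wire it up. It assumes the keyword names accepted by
BioTrend.__init__ (time_series_dir, all_abstr_file_name, alpha, p), a hypothetical script
name run_biotrend.py, and that biotrend.py and config.py are importable from the working
directory; BioTrend.run() still uses Python 2 print statements, so the module itself must be
run under Python 2.

    #!/usr/bin/python
    # run_biotrend.py -- hypothetical driver, not included in the patch series
    import sys

    from biotrend import BioTrend

    if __name__ == "__main__":
        # same command-line arguments as the old __main__ block in patch 1
        if len(sys.argv) != 5:
            sys.exit("Usage: run_biotrend.py TIME_SERIES_DIR ALL_ABSTR ALPHA P")

        timeSeriesDir, allAbstrFileName, alpha, p = sys.argv[1:]
        trend = BioTrend(time_series_dir=timeSeriesDir,
                         all_abstr_file_name=allAbstrFileName,
                         alpha=float(alpha),
                         p=int(p))
        # run() returns the optimal decomposition (a list of segment boundaries)
        decomp = trend.run()
        print(decomp)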