|
1 | 1 | var _ = require('lodash');
|
2 | 2 | var Promise = require('bluebird');
|
3 | 3 | var dateFormat = require('dateformat');
|
| 4 | +var CSV = require('csv-js'); |
| 5 | +var r = require('request'); |
| 6 | +var zlib = require('zlib'); |
4 | 7 |
|
5 | 8 | module.exports = {
|
6 | 9 |
|
@@ -28,67 +31,112 @@ module.exports = {
|
28 | 31 | return _.sortedIndexOf(haystack, needle) !== -1 ;
|
29 | 32 | },
|
30 | 33 |
|
31 |
| - processDownloads:function(response,directDownloads,indirectDownloads,total,callback) { |
32 |
| - var hits = response.hits.hits; |
33 |
| - var _response = { |
34 |
| - hits: { total: response.hits.total }, |
35 |
| - _scroll_id: response._scroll_id |
| 34 | + getDailyDownloads: function (day, callback) { |
| 35 | + |
| 36 | + var dayDateString = dateFormat(day, "yyyy-mm-dd").toString(); |
| 37 | + var url = `http://cran-logs.rstudio.com/${day.getFullYear()}/${dayDateString}.csv.gz`; |
| 38 | + |
| 39 | + var requestSettings = { |
| 40 | + method: 'GET', |
| 41 | + url, |
| 42 | + encoding: null, |
36 | 43 | };
|
37 |
| - var hit_date = hits[1].fields.datetime[0]; |
38 |
| - var date = new Date(hit_date); |
39 |
| - var formattedDate = dateFormat(date, "yyyy-mm-dd").toString(); |
40 | 44 |
|
41 |
| - Promise.map(hits, function(hit, i) { |
42 |
| - //execute queries to find inverse dependencies for all hits asynchronous, and find indirect hits before and after in ordered records |
43 |
| - var package_name = hit.fields.package[0]; |
| 45 | + console.info('Sending request ...'); |
| 46 | + r(requestSettings, function(error, response, buf) { |
| 47 | + if(response.statusCode === 404){ |
| 48 | + return callback({message: "empty"}); |
| 49 | + } |
| 50 | + else if(response.statusCode === 200){ |
| 51 | + console.info('Unzipping ...'); |
| 52 | + zlib.gunzip(buf, function(err, dezipped) { |
| 53 | + if(err){ |
| 54 | + return callback(err); |
| 55 | + } |
| 56 | + console.info('Parsing csv ...'); |
| 57 | + var downloads = CSV.parse(dezipped.toString()); |
| 58 | + downloads.shift(); // remove header line |
| 59 | + var downloads = _.map(downloads, function(download){ |
| 60 | + return { |
| 61 | + date: download[0], |
| 62 | + time: download[1], |
| 63 | + dateTime: new Date(`${download[0]}T${download[1]}Z`), |
| 64 | + package: download[6], |
| 65 | + ip_id: download[9] |
| 66 | + } |
| 67 | + }); |
| 68 | + downloads.sort(function(download1, download2){ |
| 69 | + return download1.dateTime.getTime - download2.dateTime.getTime; |
| 70 | + }); |
| 71 | + DownloadStatsService.processDailyDownloads(day, downloads, callback); |
| 72 | + }); |
| 73 | + } |
| 74 | + }); |
| 75 | + |
| 76 | + |
| 77 | + }, |
| 78 | + |
| 79 | + processDailyDownloads: function(date, downloads, callback) { |
| 80 | + console.info('Processing downloads ...'); |
| 81 | + var indirectDownloads = {}; // The value for every key is a set with ip_id's, this will automatically only count unique ip's |
| 82 | + var directDownloads = {}; |
| 83 | + Promise.map(downloads, function(download, i) { |
| 84 | + var package_name = download.package; |
| 85 | + |
| 86 | + function addDownloadTo(hash, download) { |
| 87 | + if(!hash[package_name]) |
| 88 | + hash[package_name] = new Set(); |
| 89 | + hash[package_name].add(download.ip_id); |
| 90 | + } |
| 91 | + |
44 | 92 | return DownloadStatsService.getReverseDependencies(package_name).then(function(rootPackageNames) {
|
45 | 93 |
|
46 | 94 | var indirect = false;
|
47 | 95 | var j=i+1;
|
48 | 96 |
|
49 |
| - var thisHitTimestamp = new Date(hit.fields.datetime[0]).getTime(); |
| 97 | + var downloadTime = download.dateTime.getTime(); |
50 | 98 |
|
51 |
| - while (!indirect && j<hits.length && hits[j].fields.ip_id[0] == hit.fields.ip_id[0] && |
52 |
| - new Date(hits[j].fields.datetime[0]).getTime()< (thisHitTimestamp+60000) |
53 |
| - ) { |
54 |
| - if(DownloadStatsService.binarySearchIncludes(rootPackageNames,hits[j].fields.package[0])) { |
55 |
| - indirectDownloads[package_name] = indirectDownloads[package_name]+1 || 1; |
56 |
| - indirect=true; |
| 99 | + for(j= i + 1; j < downloads.length; j++) { |
| 100 | + if(indirect || downloads[j].dateTime.getTime() > downloadTime + 60 * 1000) |
| 101 | + break; |
| 102 | + if(downloads[j].ip_id === download.ip_id && DownloadStatsService.binarySearchIncludes(rootPackageNames, downloads[j].package)){ |
| 103 | + addDownloadTo(indirectDownloads, download) |
| 104 | + indirect = true; |
57 | 105 | }
|
58 |
| - j+=1; |
59 | 106 | }
|
60 |
| - j=i-1; |
61 |
| - while (j>=0 && hits[j].fields.ip_id[0] == hit.fields.ip_id[0] && |
62 |
| - new Date(hits[j].fields.datetime[0]).getTime()+60000> (thisHitTimestamp) && |
63 |
| - !(indirect) |
64 |
| - ) { |
65 |
| - if(DownloadStatsService.binarySearchIncludes(rootPackageNames,hits[j].fields.package[0])) { |
66 |
| - indirectDownloads[package_name] = indirectDownloads[package_name]+1 || 1; |
67 |
| - indirect=true; |
| 107 | + |
| 108 | + for(j= i - 1; j >= 0; j--) { |
| 109 | + if(indirect || downloads[j].dateTime.getTime() < downloadTime - 60 * 1000) |
| 110 | + break; |
| 111 | + if(downloads[j].ip_id === download.ip_id && DownloadStatsService.binarySearchIncludes(rootPackageNames, downloads[j].package)){ |
| 112 | + addDownloadTo(indirectDownloads, download) |
| 113 | + indirect = true; |
68 | 114 | }
|
69 |
| - j-=1; |
70 | 115 | }
|
| 116 | + |
71 | 117 | if(!indirect){
|
72 |
| - directDownloads[package_name] = directDownloads[package_name]+1 || 1; |
| 118 | + addDownloadTo(directDownloads, download) |
73 | 119 | }
|
74 | 120 | });
|
75 |
| - |
76 |
| - }, {concurrency: 10}).then(function(){ |
77 |
| - |
78 |
| - return ElasticSearchService.scrollDailyDownloadsBulk(_response,formattedDate,directDownloads,indirectDownloads,total,callback); |
| 121 | + }, {concurrency: 10}) |
| 122 | + .then(function(){ |
| 123 | + DownloadStatsService.writeDownloadsToDB(date, directDownloads, indirectDownloads) |
| 124 | + .then(function(result){ |
| 125 | + console.info('Downloads written to database!'); |
| 126 | + callback(null,result); |
| 127 | + }); |
79 | 128 | });
|
80 | 129 | },
|
81 | 130 |
|
82 |
| - //write all splitted download counts to the database |
83 |
| - writeSplittedDownloadCounts: function(date,directDownloads,indirectDownloads){ |
84 |
| - console.log("writing data"); |
| 131 | + writeDownloadsToDB: function(date,directDownloads,indirectDownloads){ |
| 132 | + console.info("Writing data to database ..."); |
85 | 133 | return Package.findAll({attributes: ['name']}).then(function(packages) {
|
86 | 134 | var records = _.map(packages, function(_package) {
|
87 | 135 | return {
|
88 | 136 | package_name: _package.name,
|
89 | 137 | date: date,
|
90 |
| - indirect_downloads: indirectDownloads[_package.name] || 0, |
91 |
| - direct_downloads: directDownloads[_package.name] || 0 |
| 138 | + indirect_downloads: (indirectDownloads[_package.name] || new Set()).size, |
| 139 | + direct_downloads: (directDownloads[_package.name] || new Set()).size |
92 | 140 | };
|
93 | 141 | });
|
94 | 142 | var groups = _.chunk(records,500);
|
|
0 commit comments