From 901808c7a6ce5450921c01e48b8958a51aa9480d Mon Sep 17 00:00:00 2001 From: mlwmlw <520cutecat@gmail.com> Date: Wed, 15 Apr 2015 00:13:13 +0800 Subject: [PATCH] fix parse awarding --- awarding.ls | 82 +++++++++++++++++++++++++++++++++++------------------ main.ls | 6 ++-- server.ls | 79 ++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 114 insertions(+), 53 deletions(-) diff --git a/awarding.ls b/awarding.ls index eff18d8..0624aa1 100644 --- a/awarding.ls +++ b/awarding.ls @@ -28,9 +28,13 @@ parseFailed = (url, cb) -> cb {origin_publish: trim($ '.main tr' .eq(6) .find 'td' .text!).replace /(\d+)\/(\d+)\/(\d+)/, (date, year, month, day) -> (+year + 1911) + "-" + month + "-" + day } -parseAward = (url, cb) -> +parseAward = (url, cb, mode) -> award = {} - error, res <- request.get url + if mode + modeUrl = url+"&contentMode="+mode + else + modeUrl = url+"&contentMode=0" + error, res <- request.get modeUrl $ = cheerio.load res.body merchants = {} merchant = trim($ '.award_table_tr_4 tr' .eq 5 .find 'td' .text!) @@ -40,33 +44,53 @@ parseAward = (url, cb) -> (+year + 1911) + "-" + month + "-" + day return false $rows = $ '.award_table_tr_3 tr' - $th = $rows.find 'th' - $td = $rows.find 'td' total = $rows.eq 0 .find 'td' .text!.replace(/\s+/g, '') - 1 id = null - map = {'廠商代碼': '_id', '廠商名稱': 'name', '廠商電話': 'phone', '廠商地址': 'address', '廠商業別': 'industry', '組織型態': 'org', '僱用員工總人數是否超過100人': 'over100', '決標金額': 'amount'} - for i to $th.length - 1 - if i == 0 - continue; - key = $th.eq(i).text!.replace(/\s+/g, '') - value = $td.eq(i).text!.replace(/\s+/g, '') - if map[key] == '_id' - id = value - merchants[id] = {} - else if map[key] == 'industry' - split = $td.eq(i).text!.split(/\s+/) - value = split[1] - if split.length > 3 - merchants[id].registration = split[3] - else if map[key] == 'amount' - value = +value.replace(/[元,]/g, '') - if !map[key] - continue; - merchants[id][map[key]] = value + map = {'廠商代碼': '_id', '廠商名稱': 'name', '廠商電話': 'phone', '廠商地址': 'address', '廠商業別': 'industry', '組織型態': 'org', '僱用員工總人數是否超過100人': 'over100', '決標金額': 'amount', '是否得標': 'awarding', '得標廠商國別': 'country', '有無在我國辦理分公司登記': 'tw_branch'} + #multiple mode +# console.log url, $rows.length + if $rows.length == 0 + $rows = $ '.award_table_tr_3' + value = trim($rows.eq(2).find 'td' .text!) + if /完整資料/.test value + return parseAward url, cb, 1 + else + value = value.replace /\s+/g, ' ' + ms = value.match /(\d+)\s+(\S+)\s+(\S+)\s+(\S+)/g + for i, raw of ms + m = raw.split /[ ]+/ + id = m[2] + merchants[id] = { + _id: id, + awarding: {'得標': 1, '未得標': 0}[m[1]], + name: m[3] + } + else + for i to $rows.length - 1 + if i == 0 + continue; + $row = $rows.eq i + key = $row.find 'th' .text!.replace /\s+/g, '' + value = $row.find 'td' .text!.replace /\s+/g, '' + if !map[key] + continue; + if map[key] == '_id' + id = value + merchants[id] = {} + else if map[key] == 'industry' + split = $row.find 'td' .text!.split(/\s+/) + value = split[1] + if split.length > 3 + merchants[id].registration = split[3] + else if map[key] == 'amount' + value = +value.replace(/[元,]/g, '') + else if map[key] == 'awarding' + value = {'是': 1, '否': 0}[value] + merchants[id][map[key]] = value award.merchants = [] for i, merchant of merchants clone = {} <<< merchant - if merchant.amount + if merchant.awarding award.merchants.push clone delete merchant.amount cb(award, merchants) @@ -120,6 +144,7 @@ getDocs = (date, page) -> merchants.push m; row.merchants = award.merchants; row.origin_publish = award.origin_publish + row.candidates = ms awardDeferred.resolve row else row.url = 'http://web.pcc.gov.tw/tps/pss/' + row.failed_url @@ -143,6 +168,9 @@ getDocs = (date, page) -> return deferred.promise #parseAward 'http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51511072&tenderCaseNo=GAA0326001' -#parseAward 'http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51510570&tenderCaseNo=NO1040271' -#getDocsByDate '2015-03-23' .then (data) -> -# console.log data +#parseAward 'http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51500178&tenderCaseNo=104-0029' +#parseAward 'http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51519966&tenderCaseNo=GF3-104011&contentMode=0', (abc, ms) -> +#parseAward 'http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51105162&tenderCaseNo=LP5-102022', (abc, ms) -> +# console.log abc, ms +#getDocsByDate '2015-01-19' .then (data) -> + #console.log data diff --git a/main.ls b/main.ls index df642ee..202daa7 100644 --- a/main.ls +++ b/main.ls @@ -35,9 +35,9 @@ client = client.connect uri, (err, db) -> row._id = row.key if /更正公告/.test row.name delete row.name - publish = moment.min moment(date), moment(row.publish) .toDate! + publish = moment.max moment(date), moment(row.publish) .toDate! delete row.publish - bulk.find {_id: row.key} .upsert!.update { $set: row, $min: {publish: publish} } + bulk.find {_id: row.key} .upsert!.update { $set: row, $max: {publish: publish} } console.log "tender " + res.length if res.length promiseMain.add bulk.execute @@ -58,7 +58,7 @@ client = client.connect uri, (err, db) -> $set: a } pccBulk.find {id: a.id, publish: publish, unit: new RegExp a.unit} .update { - $set: {award: {_id: a.key, merchants: a.merchants || [], url: a.url, publish: a.publish}} + $set: {merchants: a.candidates || [], award: {_id: a.key, merchants: a.merchants || [], url: a.url, publish: a.publish}} } promiseSub.add awardBulk.execute promiseSub.add pccBulk.execute diff --git a/server.ls b/server.ls index 5a2ec9b..326bf25 100644 --- a/server.ls +++ b/server.ls @@ -91,10 +91,41 @@ app.get '/category/:category', (req, res) -> db.collection 'pcc' .find { category: req.params.category } .limit 200 .toArray (err, docs) -> res.send docs -app.get '/merchants/', (req, res) -> - err, merchants <- db.collection 'merchants' .find {} .toArray +app.get '/rank/merchants/:order?/:year?', (req, res) -> + year = req.params.year + start = new Date year, 0, 1 + end = new Date year, 11, 31 + $sort = {} + $sort.$sort = {}; + $sort.$sort[req.params.order || "sum"] = -1; + $match = { "award.merchants._id": {$ne: ""}} + $match.publish = {$gte: start, $lte: end} + err, merchants <- db.collection 'pcc' .aggregate [ + { $unwind: "$award.merchants" }, + { $match: $match}, + { $group : {_id: "$award.merchants._id", merchants: {$addToSet: "$award.merchants"}, count: {$sum: 1}, sum: {$sum: "$award.merchants.amount"}}}, + $sort, + { $limit: 100}] + for i,m of merchants + m.merchant = m.merchants.pop! + delete m.merchants res.send merchants + + +app.get '/merchants/:id?', (req, res) -> + id = req.params.id + filter = {} + if /\d+/.test id + filter = {_id: id} + else + filter = {name: id} + err, merchants <- db.collection 'merchants' .find filter .toArray + if id + res.send merchants[0] + else + res.send merchants + app.get '/merchant/:id?', (req, res) -> id = req.params.id if !id @@ -106,15 +137,33 @@ app.get '/merchant/:id?', (req, res) -> err, docs <- db.collection 'pcc' .find filter .toArray res.send docs -app.get '/tender/rank/', (req, res) -> - start = moment!.startOf 'month' .toDate! - end = moment!.endOf 'month' .toDate! +app.get '/tender/:id/:unit?', (req, res) -> + id = req.params.id + unit = req.params.unit + if !id + return res.send {} + filter = {id: id} + if unit + filter.unit = new RegExp(unit - /\s+/g) + err, tenders <- db.collection 'pcc' .find filter .sort {publish: -1} .toArray + res.send tenders + + +app.get '/rank/tender/:month?', (req, res) -> + m = req.params.month + start = moment m .startOf 'month' .toDate! + end = moment m .endOf 'month' .toDate! err, tenders <- db.collection 'pcc' .find {publish: {$gte: start, $lte: end}} .sort {price: -1} .limit 100 .toArray res.send tenders -app.get '/partner', (req, res) -> +app.get '/partner/:year?', (req, res) -> + year = req.params.year + start = new Date year, 0, 1 + end = new Date year, 11, 31 + $match = {merchants: {$exists: 1}} + $match.publish = {$gte: start, $lte: end} db.collection 'award' .aggregate [ - {$match: {merchants: {$exists: 1}}}, + {$match: $match}, {$unwind: "$merchants"}, {$group: {_id: {unit: "$unit", merchant:"$merchants.name", merchant_id: "$merchants._id"}, price: {$sum: "$price"}, count: {$sum: 1}}}, {$sort: {count: -1}}, @@ -123,21 +172,6 @@ app.get '/partner', (req, res) -> ], (err, docs) -> res.send docs -app.get '/merchants/rank/:order?', (req, res) -> - $sort = {} - $sort.$sort = {}; - $sort.$sort[req.params.order || "sum"] = -1; - err, merchants <- db.collection 'pcc' .aggregate [ - { $unwind: "$award.merchants" }, - { $match: { "award.merchants._id": {$ne: ""}}}, - { $group : {_id: "$award.merchants._id", merchants: {$addToSet: "$award.merchants"}, count: {$sum: 1}, sum: {$sum: "$award.merchants.amount"}}}, - $sort, - { $limit: 100}] - for i,m of merchants - m.merchant = m.merchants.pop! - delete m.merchants - res.send merchants - app.get '/units/:id?', (req, res) -> if req.params.id == 'all' err, docs <- db.collection 'pcc' .aggregate { $group: { _id: '$unit'}} @@ -158,7 +192,6 @@ app.get '/unit/:unit/:month?', (req, res) -> end = new Date req.params.month + "-01" end.setMonth end.getMonth!+1 filter.publish = {$gte: start, $lt: end} - console.log filter db.collection 'pcc' .find filter .toArray (err, docs) -> docs.sort (a, b) -> return b.publish - a.publish