From 772af2163f1b24dab2da3077f004b6cd0f99fd53 Mon Sep 17 00:00:00 2001 From: Emerson Rocha Date: Sun, 4 Sep 2022 11:07:18 -0300 Subject: [PATCH] (#5): scripts/etc/woah-reflab-downloader.js: started --- i18n/zxx/woah.meta.yml | 10 +- scripts/common.lib.sh | 71 ++++++++++- scripts/etc/whocc-downloader.js | 2 +- scripts/etc/woah-reflab-downloader.js | 173 ++++++++++++++++++++++++++ scripts/via-github-action.sh | 5 + 5 files changed, 254 insertions(+), 7 deletions(-) create mode 100644 scripts/etc/woah-reflab-downloader.js diff --git a/i18n/zxx/woah.meta.yml b/i18n/zxx/woah.meta.yml index 58c539e..28494eb 100644 --- a/i18n/zxx/woah.meta.yml +++ b/i18n/zxx/woah.meta.yml @@ -14,7 +14,6 @@ # http://www.animalhealthsurveillance.agriculture.gov.ie/oielisteddiseases/ - # oie-cmn-a.tm.hxl.csv # Ancienne classification des maladies notifiables à l’OIE – Liste A @@ -31,7 +30,12 @@ # >>> Multiple species diseases >>> Maladies communes à plusieurs espèces >>> Enfermedades comunes a varias especies - # Manually download the file https://web.oie.int/delegateweb/fr/manuels/Template_Reference_data_English_French_Spanish.xlsx # Manually download the file https://www.oie.int/fileadmin/Home/eng/Solidarity/Prioritisation_v5.8_locked.xls -# Manually download the file https://www.woah.org/fileadmin/Home/eng/Health_standards/tahm/XX_LIST_LAB.pdf \ No newline at end of file +# Manually download the file https://www.woah.org/fileadmin/Home/eng/Health_standards/tahm/XX_LIST_LAB.pdf + +#### Reference Laboratories ____________________________________________________ +# - https://www.woah.org/en/what-we-offer/expertise-network/reference-laboratories/ +# - https://crm.oie.int/interconnexion/laboratoires.php?LANG=EN +# - https://crm.oie.int/interconnexion/laboratoires.php?LANG=FR +# - https://crm.oie.int/interconnexion/laboratoires.php?LANG=ES \ No newline at end of file diff --git a/scripts/common.lib.sh b/scripts/common.lib.sh index 66e767e..f0a94a8 100644 --- a/scripts/common.lib.sh +++ b/scripts/common.lib.sh @@ -33,6 +33,8 @@ __BUILDTEMPDIR="$ROOTDIR/partials/temp" BUILDTEMPDIR="${BUILDTEMPDIR:-$__BUILDTEMPDIR}" #### Configurable variables - - - - - - - - - - - - - - - - - - - - - - - - - - + +WHOA_LANGS=("EN" "FR" "ES") # from https://apps.who.int/whocc/Search.aspx WHO_REGIONS=("AFRO" "AMRO" "EMRO" "EURO" "SEARO" "WPRO") LSF_REMOTE_GIT="https://github.com/EticaAI/lexicographi-sine-finibus.git" @@ -57,6 +59,69 @@ tty_normal=$(tput sgr0) # printf "\t%40s\n" "${tty_red} ERROR: [] ${tty_normal}" #### Fancy colors constants - - - - - - - - - - - - - - - - - - - - - - - - - - +####################################### +# crawler_who_cc fetch reference laboratories from +# World Organisation for Animal Health (WOAH, founded as OIE) +# +# Globals: +# ROOTDIR +# BUILDTEMPDIR +# WHOA_LANGS +# Arguments: +# repo Repository to fetch the data +# savepath (optional) Path to store the metadata +# Returns +# None +####################################### +crawler_woah_reflab() { + # echo "${FUNCNAME[0]} TODO" + printf "\n\t%40s\n" "${tty_blue}${FUNCNAME[0]} STARTED ${tty_normal}" + + _data_published="$ROOTDIR/data/whoa-reference-laboratories.hxl.csv" + + outputs=() + for lang in "${WHOA_LANGS[@]}"; do + output="$BUILDTEMPDIR/woah_reflab_$lang.csv" + outputs+=("$output") + # crawler_woah_reflab "$lang" "$output" + set -x + node "$ROOTDIR/scripts/etc/woah-reflab-downloader.js" \ + --woah-language "$lang" \ + --output "$output" + set +x + frictionless validate "$output" + sleep 10 + done + + set -x + # shellcheck disable=SC2048,SC2086 + csvjoin ${outputs[*]} >"$BUILDTEMPDIR/woah_reflab_all.csv" + frictionless validate "$BUILDTEMPDIR/woah_reflab_all.csv" + csvsort -c 1,2 "$BUILDTEMPDIR/woah_reflab_all.csv" >"$BUILDTEMPDIR/woah_reflab.csv" + # csvsort -c 1,2 "$BUILDTEMPDIR/woah_reflab_.csv" >"$BUILDTEMPDIR/whocc.csv" + + echo "@TODO" + + exit 1 + ./scripts/readme-from-csv.py \ + --method=table-rename \ + --table-meta=i18n/zxx/who-cc.meta.yml \ + "$BUILDTEMPDIR/whoa-reference-laboratories.csv" \ + >"$BUILDTEMPDIR/whoa-reference-laboratories.hxl.csv" + + frictionless validate "$BUILDTEMPDIR/whocc.hxl.csv" + + if [ -f "$_data_published" ]; then + echo "deleting old [$_data_published]" + # rm "$_data_published" + fi + + cp "$BUILDTEMPDIR/whocc.hxl.csv" "$_data_published" + + set +x + printf "\t%40s\n" "${tty_green}${FUNCNAME[0]} FINISHED OKAY ${tty_normal}" +} + ####################################### # crawler_who_cc raw CSVs from WHO Collaborating Centres # @@ -80,9 +145,9 @@ crawler_who_cc() { for region in "${WHO_REGIONS[@]}"; do output="$BUILDTEMPDIR/$region.csv" outputs+=("$output") - # crawler_who_cc_fech_region "$region" "$output" - # frictionless validate "$output" - # sleep 10 + crawler_who_cc_fech_region "$region" "$output" + frictionless validate "$output" + sleep 10 done set -x diff --git a/scripts/etc/whocc-downloader.js b/scripts/etc/whocc-downloader.js index 7df6f79..34adc1d 100644 --- a/scripts/etc/whocc-downloader.js +++ b/scripts/etc/whocc-downloader.js @@ -24,7 +24,7 @@ * SPDX-License-Identifier: Unlicense * VERSION: v1.0 * CREATED: 2022-09-03 09:21 UTC - * REVISION: 2022-09-03 10:40 UTC v1.1 build-readme.sh -> common.lib.sh + * REVISION: --- *******************************************************************************/ // > TL:DR: do this diff --git a/scripts/etc/woah-reflab-downloader.js b/scripts/etc/woah-reflab-downloader.js new file mode 100644 index 0000000..d0ad9a0 --- /dev/null +++ b/scripts/etc/woah-reflab-downloader.js @@ -0,0 +1,173 @@ +#!/usr/bin/env node +/******************************************************************************* + * + * + * FILE: woah-reflab-downloader.js + * scripts/etc/woah-reflab-downloader.js + * + * USAGE: node scripts/etc/woah-reflab-downloader.js \ + * --woah-language 'AFRO' \ + * --output 'partials/temp/AFRO.csv + * + * DESCRIPTION: Download CSVs from https://apps.who.int/whocc/Search.aspx + * and do some clean up + * + * REQUIREMENTS: - nodejs + * - puppeteer (yarn add i puppeteer) + * (Down: Chromium (~170MB Mac, ~282MB Linux, ~280MB Win)) + * - commander (yarn add i commander) + * - csv (yarn add i csv) + * BUGS: --- + * NOTES: --- + * AUTHOR: Emerson Rocha + * COMPANY: EticaAI + * LICENSE: Public Domain dedication + * SPDX-License-Identifier: Unlicense + * VERSION: v1.0 + * CREATED: 2022-09-03 12:42 UTC started. based on whocc-downloader.js + * REVISION: --- +*******************************************************************************/ + +// > TL:DR: do this +// node scripts/etc/woah-reflab-downloader.js --woah-language 'EN' --output 'partials/temp/woah-reflab-en.csv' +// node scripts/etc/woah-reflab-downloader.js --woah-language 'FR' --output 'partials/temp/woah-reflab-fr.csv' +// node scripts/etc/woah-reflab-downloader.js --woah-language 'ES' --output 'partials/temp/woah-reflab-es.csv' + +// > To Debug: +// node scripts/etc/woah-reflab-downloader.js --woah-language 'es' --output 'partials/temp/woah-reflab-es.csv' --show-browser +// > To check if is valid: +// frictionless validate partials/temp/woah-reflab-es.csv + +// node --trace-warnings scripts/etc/who_cc.js +// yarn add i commander +// yarn add i puppeteer +// "When you install Puppeteer, it downloads a recent version of +// Chromium (~170MB Mac, ~282MB Linux, ~280MB Win) +const os = require("os"); +const fs = require('fs'); +const csv = require('csv'); +const fsPromises = fs.promises; +const iso6393a2toa3 = { + 'EN': 'eng', + 'FR': 'fra', + 'ES': 'spa', +} + +const puppeteer = require('puppeteer'); +const { program } = require('commander'); + +program + .name('woah') + .description('Data mine World Organisation for Animal Health reference laboratories') + .option('--woah-language', 'Language. Example: "EN", "FR", "ES"') + .option('--output', 'Path to output. Example: temp/AFRO.csv') + .option('--show-browser', 'If need show browser (use as last option)', false) + // .option('--teste', 'Path to output. Defaults to region.csv') + // .option('--tempdir', 'Path to a temporary dir', null) + ; + +program.parse(process.argv); +const options = program.opts(); + +// const project_woahlang = options.whoRegion; +const project_woahlang = program.args[0].toLocaleUpperCase(); +const project_woahlang_a3 = iso6393a2toa3[project_woahlang]; +// const project_output = options.output ? options.output : project_woahlang + '.csv' +const project_output = program.args[1]; +// const project_tempdirdir = options.tempdir +const show_browser = options.showBrowser; + +console.log(project_woahlang, project_woahlang_a3, project_output, program.args, program.args[0]); + +// https://crm.oie.int/interconnexion/laboratoires.php?LANG=EN +// https://crm.oie.int/interconnexion/laboratoires.php?LANG=FR +// https://crm.oie.int/interconnexion/laboratoires.php?LANG=ES +const project_page_start = `https://crm.oie.int/interconnexion/laboratoires.php?LANG=` + project_woahlang; +const project_name = 'woah'; + +(async () => { + // console.log('started'); + + const project_tempdirdir = await fsPromises.mkdtemp(os.tmpdir() + "/woah-", (err, folder) => { + if (err) + console.log(err); + else { + console.log("The temporary folder path is:", folder); + } + return folder + '/'; + }); + console.log('Started. Tempdir at: ', project_tempdirdir) + + const browser = await puppeteer.launch( + { + // headless: is_headless, // Here can enable/disable show the browser + headless: !show_browser, // Here can enable/disable show the browser + } + ); + + const page = await browser.newPage(); + const client = await page.target().createCDPSession(); + await client.send('Page.setDownloadBehavior', { + behavior: 'allow', downloadPath: project_tempdirdir + }); + + await page.goto(project_page_start); + await page.screenshot({ path: project_tempdirdir + '/' + project_name + '_v1.png' }); + + await page.waitForSelector('#shortcutstable') + + // let titles = await(await page.$$('h3')).evaluate(node => node.innerText); + + all_titles = [] + let titles = await page.$$('h3'); + for (const title of titles) { + // videoLinks.push(await link.evaluate( node => node.getAttribute('href'))); + all_titles.push(await title.evaluate(node => node.innerText)); + }; + + // console.log(titles) + // console.log(all_titles) + + let data = [] + data.push('#item+rem+i_' + project_woahlang_a3 + '+is_latn') + data = data.concat(all_titles) + + let dataToWrite = '' + + // Poor's man matrix to CSV string. + data.forEach((line) => { + line_items = [] + if (!Array.isArray(line)) { + line = [line] + } + line.forEach((item) => { + item = item.trim() + if (item.indexOf(",") > -1){ + line_items.push('"' + item + '"') + } else { + line_items.push(item) + } + }); + dataToWrite += line_items.join(",") + "\n" + }); + + fs.writeFile(project_output, dataToWrite, 'utf8', function (err) { + if (err) { + console.log('Some error occured', project_output); + } else { + console.log('It\'s saved!', project_output); + } + }); + + // await Promise.all([ + // new Promise(r => setTimeout(r, 60000)) + // ]).catch(function (err) { + // console.log(err.message); + // process.exit(1); + // }); + + + console.log('TODO: delete tempdir', project_tempdirdir) + await browser.close(); +})(); + diff --git a/scripts/via-github-action.sh b/scripts/via-github-action.sh index 5cd97b8..c293206 100755 --- a/scripts/via-github-action.sh +++ b/scripts/via-github-action.sh @@ -54,6 +54,11 @@ if [ "$OPERATION" = "crawler_wikidata_who_icd" ]; then crawler_wikidata_who_icd exit 0 fi +# OPERATION=crawler_woah_reflab ./scripts/via-github-action.sh +if [ "$OPERATION" = "crawler_woah_reflab" ]; then + crawler_woah_reflab + exit 0 +fi echo "unknow operation [$OPERATION]" exit 1