Skip to content

Commit

Permalink
(#5): scripts/etc/woah-reflab-downloader.js: started
Browse files Browse the repository at this point in the history
  • Loading branch information
fititnt committed Sep 4, 2022
1 parent 6b7de71 commit 772af21
Show file tree
Hide file tree
Showing 5 changed files with 254 additions and 7 deletions.
10 changes: 7 additions & 3 deletions i18n/zxx/woah.meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

# http://www.animalhealthsurveillance.agriculture.gov.ie/oielisteddiseases/


# oie-cmn-a.tm.hxl.csv
# Ancienne classification des maladies notifiables à l’OIE – Liste A

Expand All @@ -31,7 +30,12 @@

# >>> Multiple species diseases >>> Maladies communes à plusieurs espèces >>> Enfermedades comunes a varias especies


# Manually download the file https://web.oie.int/delegateweb/fr/manuels/Template_Reference_data_English_French_Spanish.xlsx
# Manually download the file https://www.oie.int/fileadmin/Home/eng/Solidarity/Prioritisation_v5.8_locked.xls
# Manually download the file https://www.woah.org/fileadmin/Home/eng/Health_standards/tahm/XX_LIST_LAB.pdf
# Manually download the file https://www.woah.org/fileadmin/Home/eng/Health_standards/tahm/XX_LIST_LAB.pdf

#### Reference Laboratories ____________________________________________________
# - https://www.woah.org/en/what-we-offer/expertise-network/reference-laboratories/
# - https://crm.oie.int/interconnexion/laboratoires.php?LANG=EN
# - https://crm.oie.int/interconnexion/laboratoires.php?LANG=FR
# - https://crm.oie.int/interconnexion/laboratoires.php?LANG=ES
71 changes: 68 additions & 3 deletions scripts/common.lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ __BUILDTEMPDIR="$ROOTDIR/partials/temp"
BUILDTEMPDIR="${BUILDTEMPDIR:-$__BUILDTEMPDIR}"

#### Configurable variables - - - - - - - - - - - - - - - - - - - - - - - - - -

WHOA_LANGS=("EN" "FR" "ES")
# from https://apps.who.int/whocc/Search.aspx
WHO_REGIONS=("AFRO" "AMRO" "EMRO" "EURO" "SEARO" "WPRO")
LSF_REMOTE_GIT="https://github.com/EticaAI/lexicographi-sine-finibus.git"
Expand All @@ -57,6 +59,69 @@ tty_normal=$(tput sgr0)
# printf "\t%40s\n" "${tty_red} ERROR: [] ${tty_normal}"
#### Fancy colors constants - - - - - - - - - - - - - - - - - - - - - - - - - -

#######################################
# crawler_woah_reflab fetch reference laboratories from
# World Organisation for Animal Health (WOAH, founded as OIE)
#
# For each language in WHOA_LANGS this runs
# scripts/etc/woah-reflab-downloader.js, validates the resulting CSV with
# frictionless, then joins and sorts the per-language files under BUILDTEMPDIR.
#
# NOTE: work in progress. The function calls `exit 1` right after the merge
# step (which ends the whole calling shell), so the publishing steps written
# after that line are not reached yet.
#
# Globals:
#   ROOTDIR
#   BUILDTEMPDIR
#   WHOA_LANGS
#   tty_blue, tty_green, tty_normal
# Arguments:
#   None
# Returns
#   None
#######################################
crawler_woah_reflab() {
  # Keep the working variables local so they do not leak into (or get
  # clobbered by) other crawler_* functions that use the same names.
  local _data_published lang output
  local outputs=()

  printf "\n\t%40s\n" "${tty_blue}${FUNCNAME[0]} STARTED ${tty_normal}"

  _data_published="$ROOTDIR/data/whoa-reference-laboratories.hxl.csv"

  for lang in "${WHOA_LANGS[@]}"; do
    output="$BUILDTEMPDIR/woah_reflab_$lang.csv"
    outputs+=("$output")
    set -x
    node "$ROOTDIR/scripts/etc/woah-reflab-downloader.js" \
      --woah-language "$lang" \
      --output "$output"
    set +x
    frictionless validate "$output"
    # Pause between requests to the remote site.
    sleep 10
  done

  set -x
  # Quoted array expansion: one argument per file, safe for paths with spaces.
  csvjoin "${outputs[@]}" >"$BUILDTEMPDIR/woah_reflab_all.csv"
  frictionless validate "$BUILDTEMPDIR/woah_reflab_all.csv"
  csvsort -c 1,2 "$BUILDTEMPDIR/woah_reflab_all.csv" >"$BUILDTEMPDIR/woah_reflab.csv"

  echo "@TODO"

  # Deliberate stop while the steps below are unfinished.
  exit 1

  # NOTE(review): everything below is unreachable. The file names
  # (who-cc.meta.yml, whocc.hxl.csv) look copied from crawler_who_cc and do
  # not match the files produced above -- confirm before removing the exit.
  ./scripts/readme-from-csv.py \
    --method=table-rename \
    --table-meta=i18n/zxx/who-cc.meta.yml \
    "$BUILDTEMPDIR/whoa-reference-laboratories.csv" \
    >"$BUILDTEMPDIR/whoa-reference-laboratories.hxl.csv"

  frictionless validate "$BUILDTEMPDIR/whocc.hxl.csv"

  if [ -f "$_data_published" ]; then
    echo "deleting old [$_data_published]"
    # rm "$_data_published"
  fi

  cp "$BUILDTEMPDIR/whocc.hxl.csv" "$_data_published"

  set +x
  printf "\t%40s\n" "${tty_green}${FUNCNAME[0]} FINISHED OKAY ${tty_normal}"
}

#######################################
# crawler_who_cc raw CSVs from WHO Collaborating Centres
#
Expand All @@ -80,9 +145,9 @@ crawler_who_cc() {
for region in "${WHO_REGIONS[@]}"; do
output="$BUILDTEMPDIR/$region.csv"
outputs+=("$output")
# crawler_who_cc_fech_region "$region" "$output"
# frictionless validate "$output"
# sleep 10
crawler_who_cc_fech_region "$region" "$output"
frictionless validate "$output"
sleep 10
done

set -x
Expand Down
2 changes: 1 addition & 1 deletion scripts/etc/whocc-downloader.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
* SPDX-License-Identifier: Unlicense
* VERSION: v1.0
* CREATED: 2022-09-03 09:21 UTC
* REVISION: 2022-09-03 10:40 UTC v1.1 build-readme.sh -> common.lib.sh
* REVISION: ---
*******************************************************************************/

// > TL:DR: do this
Expand Down
173 changes: 173 additions & 0 deletions scripts/etc/woah-reflab-downloader.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
#!/usr/bin/env node
/*******************************************************************************
*
*
* FILE: woah-reflab-downloader.js
* scripts/etc/woah-reflab-downloader.js
*
* USAGE: node scripts/etc/woah-reflab-downloader.js \
* --woah-language 'EN' \
* --output 'partials/temp/woah-reflab-en.csv'
*
* DESCRIPTION: Open the WOAH reference laboratories page at
* https://crm.oie.int/interconnexion/laboratoires.php?LANG=<EN|FR|ES>
* and save the titles found on it as a CSV file
*
* REQUIREMENTS: - nodejs
* - puppeteer (yarn add i puppeteer)
* (Down: Chromium (~170MB Mac, ~282MB Linux, ~280MB Win))
* - commander (yarn add i commander)
* - csv (yarn add i csv)
* BUGS: ---
* NOTES: ---
* AUTHOR: Emerson Rocha <rocha[at]ieee.org>
* COMPANY: EticaAI
* LICENSE: Public Domain dedication
* SPDX-License-Identifier: Unlicense
* VERSION: v1.0
* CREATED: 2022-09-03 12:42 UTC started. based on whocc-downloader.js
* REVISION: ---
*******************************************************************************/

// > TL:DR: do this
// node scripts/etc/woah-reflab-downloader.js --woah-language 'EN' --output 'partials/temp/woah-reflab-en.csv'
// node scripts/etc/woah-reflab-downloader.js --woah-language 'FR' --output 'partials/temp/woah-reflab-fr.csv'
// node scripts/etc/woah-reflab-downloader.js --woah-language 'ES' --output 'partials/temp/woah-reflab-es.csv'

// > To Debug:
// node scripts/etc/woah-reflab-downloader.js --woah-language 'es' --output 'partials/temp/woah-reflab-es.csv' --show-browser
// > To check if is valid:
// frictionless validate partials/temp/woah-reflab-es.csv

// node --trace-warnings scripts/etc/who_cc.js
// yarn add i commander
// yarn add i puppeteer
// "When you install Puppeteer, it downloads a recent version of
// Chromium (~170MB Mac, ~282MB Linux, ~280MB Win)
const os = require("os");
const fs = require('fs');
const csv = require('csv');
const fsPromises = fs.promises;
// Lookup from the two-letter language code accepted by --woah-language
// (upper case) to the three-letter code used in the HXL header of the output.
// Frozen because it is a shared constant that nothing should mutate.
const iso6393a2toa3 = Object.freeze({
  EN: 'eng',
  FR: 'fra',
  ES: 'spa',
});

const puppeteer = require('puppeteer');
const { program } = require('commander');

// Command line definition.
// The `<...>` placeholder tells commander that the option takes a value;
// without it the option is a boolean flag and the value ends up as a stray
// positional argument.
program
  .name('woah')
  .description('Data mine World Organisation for Animal Health reference laboratories')
  .option('--woah-language <language>', 'Language. Example: "EN", "FR", "ES"')
  .option('--output <path>', 'Path to output. Example: temp/AFRO.csv')
  .option('--show-browser', 'If need show browser', false);

program.parse(process.argv);
const options = program.opts();

// Backward compatibility: older invocations passed the values as bare
// positional arguments, so any value not given through an option is taken
// from the remaining positionals, in order (language first, then output).
const positional_args = program.args.slice();
const woahlang_arg = options.woahLanguage !== undefined
  ? options.woahLanguage
  : positional_args.shift();
const output_arg = options.output !== undefined
  ? options.output
  : positional_args.shift();

if (!woahlang_arg || !output_arg) {
  console.error('Both --woah-language and --output are required');
  process.exit(1);
}

const project_woahlang = String(woahlang_arg).toUpperCase();
if (!Object.prototype.hasOwnProperty.call(iso6393a2toa3, project_woahlang)) {
  // Fail early instead of writing a CSV whose header contains "undefined".
  console.error(
    `Unknown --woah-language [${project_woahlang}]. Expected one of: ` +
    Object.keys(iso6393a2toa3).join(', ')
  );
  process.exit(1);
}
const project_woahlang_a3 = iso6393a2toa3[project_woahlang];
const project_output = output_arg;
const show_browser = options.showBrowser;

console.log(project_woahlang, project_woahlang_a3, project_output, program.args, program.args[0]);

// https://crm.oie.int/interconnexion/laboratoires.php?LANG=EN
// https://crm.oie.int/interconnexion/laboratoires.php?LANG=FR
// https://crm.oie.int/interconnexion/laboratoires.php?LANG=ES
const project_page_start = `https://crm.oie.int/interconnexion/laboratoires.php?LANG=${project_woahlang}`;
const project_name = 'woah';

/**
 * Serialize a single value as one CSV field.
 *
 * The value is trimmed; it is wrapped in double quotes when it contains a
 * comma, a double quote or a line break, and embedded double quotes are
 * doubled.
 *
 * @param {*} value - Cell content (converted with String()).
 * @returns {string} The field, ready to be joined with commas.
 */
function woah_csv_field(value) {
  const text = String(value).trim();
  if (/[",\r\n]/.test(text)) {
    return '"' + text.replace(/"/g, '""') + '"';
  }
  return text;
}

/**
 * Turn a list of rows into CSV text. A row may be an array of cells or a
 * single value (treated as a one-cell row). Every row ends with "\n".
 *
 * @param {Array<*|Array<*>>} rows - Rows to serialize.
 * @returns {string} CSV text.
 */
function woah_csv_from_rows(rows) {
  return rows
    .map((row) => (Array.isArray(row) ? row : [row]))
    .map((cells) => cells.map(woah_csv_field).join(',') + '\n')
    .join('');
}

// Entry point: open the page, collect the <h3> titles and write them as a
// one-column CSV with an HXL header.
(async () => {
  const path = require('path');

  // The promise API of mkdtemp takes no callback; it resolves to the path.
  const project_tempdirdir = await fsPromises.mkdtemp(path.join(os.tmpdir(), 'woah-'));
  console.log('Started. Tempdir at: ', project_tempdirdir)

  const browser = await puppeteer.launch({
    headless: !show_browser, // Here can enable/disable show the browser
  });

  try {
    const page = await browser.newPage();
    const client = await page.target().createCDPSession();
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow', downloadPath: project_tempdirdir
    });

    await page.goto(project_page_start);
    await page.screenshot({ path: path.join(project_tempdirdir, project_name + '_v1.png') });

    await page.waitForSelector('#shortcutstable');

    // Text of every <h3> on the page, in document order.
    const all_titles = await page.$$eval('h3', (nodes) => nodes.map((node) => node.innerText));

    const data = ['#item+rem+i_' + project_woahlang_a3 + '+is_latn', ...all_titles];
    const dataToWrite = woah_csv_from_rows(data);

    try {
      // Awaited so that a failed write is seen before the process ends.
      await fsPromises.writeFile(project_output, dataToWrite, 'utf8');
      console.log('It\'s saved!', project_output);
    } catch (err) {
      console.log('Some error occured', project_output);
      console.error(err);
      // Let the calling shell script notice that no output was produced.
      process.exitCode = 1;
    }
  } finally {
    // Always close the browser, even when navigation or scraping failed.
    console.log('TODO: delete tempdir', project_tempdirdir)
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

5 changes: 5 additions & 0 deletions scripts/via-github-action.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ if [ "$OPERATION" = "crawler_wikidata_who_icd" ]; then
crawler_wikidata_who_icd
exit 0
fi
# Usage:
# OPERATION=crawler_woah_reflab ./scripts/via-github-action.sh
# Runs crawler_woah_reflab and, once it returns, ends the script with status 0.
if [ "$OPERATION" = "crawler_woah_reflab" ]; then
crawler_woah_reflab
exit 0
fi

echo "unknow operation [$OPERATION]"
exit 1

0 comments on commit 772af21

Please sign in to comment.