From 992f18c11cccb64eb6f87a4f9e7d675daf24d2fd Mon Sep 17 00:00:00 2001 From: Roman Kalyakin Date: Wed, 2 Oct 2024 22:25:29 +0200 Subject: [PATCH 1/4] [issue-1311] remove parentheses (#426) * remove parentheses * no console --- src/util/solr/filterReducers.js | 2 +- test/util/solr/reducers.test.js | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/util/solr/filterReducers.js b/src/util/solr/filterReducers.js index 8038edac..36b8ec5d 100644 --- a/src/util/solr/filterReducers.js +++ b/src/util/solr/filterReducers.js @@ -217,7 +217,7 @@ const booleanHandler = (filters, field, filterRule) => { const textAsOpenEndedSearchString = (text, field) => { const parts = text.split(' ').filter(v => v !== '') const statement = parts - .map(part => part.replace(/"/g, '\\"')) + .map(part => part.replace(/"/g, '\\"').replace(/\(/g, '').replace(/\)/g, '')) .map((part, index, arr) => { const suffix = index === arr.length - 1 ? '*' : '' return `${field}:${part}${suffix}` diff --git a/test/util/solr/reducers.test.js b/test/util/solr/reducers.test.js index 472cc895..f04fcd68 100644 --- a/test/util/solr/reducers.test.js +++ b/test/util/solr/reducers.test.js @@ -6,6 +6,17 @@ const { filtersToQueryAndVariables } = require('../../../src/util/solr/index') const { InvalidArgumentError } = require('../../../src/util/error') describe('filtersToSolr', () => { + it('escapes parentheses', () => { + const filter = { + type: 'string', + q: 'H. Allen Smith (represen', + } + const query = filtersToSolr([filter], SolrNamespaces.Entities) + const expectedQuery = + '(entitySuggest:H. AND entitySuggest:Allen AND entitySuggest:Smith AND entitySuggest:represen*)' + assert.strictEqual(query, expectedQuery) + }) + it('throws an error for an unknown filter type', () => { const filter = { type: 'booomooo', From 960b95bf8c1e3747ba4c55a091f6c40b3f9ffb71 Mon Sep 17 00:00:00 2001 From: Roman Kalyakin Date: Fri, 4 Oct 2024 12:54:26 +0200 Subject: [PATCH 2/4] moved scripts to src and removed obsolete scripts (#427) * moved scripts to src and removed obsolete scripts * make linter happy --- package.json | 17 +- scripts/bulk.js | 119 ----------- scripts/import-articles.js | 134 ------------- scripts/import-issues.js | 83 -------- scripts/import-newspapers.js | 27 --- scripts/import-pages.js | 26 --- scripts/import-tags.js | 35 ---- scripts/s3/import-regions.js | 115 ----------- scripts/tags.json | 165 ---------------- scripts/update-facet-ranges.js | 48 ----- scripts/update-topics-positions.js | 85 -------- src/models/generated/schemas.d.ts | 187 +++++++++++++++++- {scripts => src/scripts}/generate-types.js | 0 {scripts => src/scripts}/loadtests/README.md | 4 +- .../scripts}/loadtests/articleSearch.js | 32 +-- .../scripts}/loadtests/embeddings.js | 35 ++-- .../scripts}/loadtests/suggestions.js | 41 ++-- src/scripts/update-facet-ranges.js | 56 ++++++ {scripts => src/scripts}/update-newspapers.js | 14 +- src/scripts/update-topics-positions.js | 92 +++++++++ .../scripts}/update-topics-related.js | 6 +- {scripts => src/scripts}/update-topics.js | 6 +- {scripts => src/scripts}/update-years.js | 23 +-- 23 files changed, 419 insertions(+), 931 deletions(-) delete mode 100644 scripts/bulk.js delete mode 100644 scripts/import-articles.js delete mode 100644 scripts/import-issues.js delete mode 100644 scripts/import-newspapers.js delete mode 100644 scripts/import-pages.js delete mode 100644 scripts/import-tags.js delete mode 100644 scripts/s3/import-regions.js delete mode 100644 scripts/tags.json delete mode 100644 scripts/update-facet-ranges.js delete mode 100644 scripts/update-topics-positions.js rename {scripts => src/scripts}/generate-types.js (100%) rename {scripts => src/scripts}/loadtests/README.md (59%) rename {scripts => src/scripts}/loadtests/articleSearch.js (74%) rename {scripts => src/scripts}/loadtests/embeddings.js (65%) rename {scripts => src/scripts}/loadtests/suggestions.js (62%) create mode 100644 src/scripts/update-facet-ranges.js rename {scripts => src/scripts}/update-newspapers.js (86%) create mode 100644 src/scripts/update-topics-positions.js rename {scripts => src/scripts}/update-topics-related.js (97%) rename {scripts => src/scripts}/update-topics.js (92%) rename {scripts => src/scripts}/update-years.js (84%) diff --git a/package.json b/package.json index bb5043a0..3ee4b485 100644 --- a/package.json +++ b/package.json @@ -34,17 +34,12 @@ "start": "node dist/", "dev": "nodemon --enable-source-maps --inspect dist/", "cli": "node src/admin/cli.js", - "generate-types": "node scripts/generate-types.js", - "update-newspapers": "node scripts/update-newspapers.js", - "update-years": "node scripts/update-years.js", - "update-topics": "node scripts/update-topics.js && node scripts/update-topics-related.js && node scripts/update-topics-positions.js", - "update-topics-related": "node scripts/update-topics-related.js", - "update-topics-positions": "node scripts/update-topics-positions.js", - "import-articles": "node scripts/import-articles.js", - "import-pages": "node scripts/import-pages.js", - "import-issues": "node scripts/import-issues.js", - "import-tags": "node scripts/import-tags.js", - "import-newspapers": "node scripts/import-newspapers.js" + "generate-types": "node dist/scripts/generate-types.js", + "update-newspapers": "node dist/scripts/update-newspapers.js", + "update-years": "node dist/scripts/update-years.js", + "update-topics": "node dist/scripts/update-topics.js && node dist/scripts/update-topics-related.js && node dist/scripts/update-topics-positions.js", + "update-topics-related": "node dist/scripts/update-topics-related.js", + "update-topics-positions": "node dist/scripts/update-topics-positions.js" }, "dependencies": { "@feathersjs/authentication": "5.0.25", diff --git a/scripts/bulk.js b/scripts/bulk.js deleted file mode 100644 index 509b3224..00000000 --- a/scripts/bulk.js +++ /dev/null @@ -1,119 +0,0 @@ -const config = require('@feathersjs/configuration')()(); -const debug = require('debug')('impresso/scripts:bulk'); -const verbose = require('debug')('verbose:impresso/scripts:bulk'); - -const sequelize = require('../src/sequelize').client(config.sequelize); -const neo4j = require('../src/neo4j').client(config.neo4j); -const { neo4jPrepare, neo4jSummary } = require('../src/services/neo4j.utils'); - -const session = neo4j.session(); - -const merge = (modelName, modelMapper, limit = 100) => { - const Klass = require(`../src/models/${modelName}.model`).model(sequelize); - const queries = require('decypher')(`${__dirname}/../src/services/${modelName}/${modelName}.queries.cyp`); - - debug(`merge: starting ${modelName}...`); - - async function waterfall() { - const total = await Klass.count(); - const steps = Math.ceil(total / limit); - - for (let i = 0; i < steps; i++) { - const items = await Klass.scope('findAll').findAll({ offset: i * limit, limit }); - debug('tx starting - offset:', i * limit, '- total:', total, '- limit:', limit); - - await session.writeTransaction((tx) => { - for (item of items) { - const params = { - Project: 'impresso', - ...modelMapper(item), - }; - verbose(`adding ${modelName} - uid: ${params.uid} - offset:`, i * limit, '- total:', total, params); - tx.run(neo4jPrepare(queries.merge, params), params); - } - }).then((res) => { - debug('tx success! - offset:', i * limit, '- total:', total, '- limit:', limit); - }); - } - } - - debug(`merge: ${modelName} done.`); - - return waterfall(); -}; - -const query = (modelName, queryName, items, limit = 100) => { - const queries = require('decypher')(`${__dirname}/../src/services/${modelName}/${modelName}.queries.cyp`); - - debug(`query: executing ${modelName}/${queryName}...`); - async function waterfall() { - const total = items.length; - const steps = Math.ceil(total / limit); - - for (let i = 0; i < steps; i++) { - debug('query: tx starting - offset:', i * limit, '- total:', total, '- limit:', limit); - - await session.writeTransaction((tx) => { - for (item of items) { - const params = { - Project: 'impresso', - ...item, - }; - verbose(`query: adding ${modelName} - uid: ${params.uid} - offset:`, i * limit, '- total:', total, params); - tx.run(neo4jPrepare(queries[queryName], params), params); - } - }).then((res) => { - debug('query: tx success! - offset:', i * limit, '- total:', total, '- limit:', limit); - }).catch((err) => { - console.log(err); - - debug('query: tx ERROR! - offset:', i * limit, '- total:', total, '- limit:', limit); - throw 'error in neo4j transaction'; - }); - } - } - - return waterfall(); -}; - -const count = (modelName, params) => { - const queries = require('decypher')(`${__dirname}/../src/services/${modelName}/${modelName}.queries.cyp`); - - debug(`count: ${modelName} using query 'count'`); - - return session.writeTransaction((tx) => { - verbose('count: ', queries.count); - return tx.run(queries.count, { - Project: 'impresso', - ...params, - }).then((res) => { - debug(`count: ${modelName} using query 'count' success!`); - verbose('count: ', neo4jSummary(res)); - return res; - }); - }); -}; - -// execute custom APOC call -const apoc = (modelName, queryName, params) => { - const queries = require('decypher')(`${__dirname}/../src/services/${modelName}/${modelName}.queries.cyp`); - - debug(`apoc: ${modelName} using query: ${queryName}.`); - - return session.writeTransaction(tx => tx.run(queries[queryName], { - Project: 'impresso', - ...params, - }).then((res) => { - debug(`apoc: ${modelName} using query: ${queryName} success!`); - verbose('apoc: ', neo4jSummary(res)); - return res; - })); -}; - -module.exports = { - merge, - count, - apoc, - query, - config, -}; diff --git a/scripts/import-articles.js b/scripts/import-articles.js deleted file mode 100644 index c0b69310..00000000 --- a/scripts/import-articles.js +++ /dev/null @@ -1,134 +0,0 @@ -const { query, count, apoc, config } = require('./bulk') -const debug = require('debug')('impresso/scripts:import-articles') -const fs = require('fs') -const _ = require('lodash') -const solr = require('../src/solr').client(config.solr, config.solrConnectionPool) -const Eta = require('node-eta') - -debug("start! '__dirname':", __dirname) -async function waterfall() { - // load first 1000 ids directly from solr. - // - const limit = 100 - const consumed = process.env.START_AT || 0 - let _solr = await solr.findAll({ - q: '*:*', - fl: 'id', - limit: 1, - offset: 0, - }) - - const total = _solr.response.numFound - const steps = Math.ceil(total / limit) - const eta = new Eta(steps - consumed, true) - - for (let i = consumed; i < steps; i++) { - _solr = await solr.findAll({ - q: '*:*', - fl: 'id,page_id_ss,meta_journal_s,meta_year_i,meta_date_dt,', - limit, - offset: i * limit, - }) - - // unique page uids from the given set of articles - const pagesUids = _(_solr.response.docs).map('page_id_ss').flatten().uniq().value() - - // merge pages. Longer but safer. - await query( - 'pages', - 'merge', - pagesUids.map(uid => { - const parts = uid.match(/^([a-zA-Z\d-]+)-p0+(\d+)$/) - - // console.log({ - // uid: uid, - // page_number: parseInt(parts[2], 10), - // issue_uid: parts[1], - // }); - // console.log(pageUid.match(/^([a-zA-Z\d-]+)-p0+(\d+)$/)[1]); - return { - uid: uid, - page_number: parts[2], - issue_uid: parts[1], - } - }) - ) - - await query( - 'articles', - 'merge', - _solr.response.docs.map(d => ({ - uid: d.id, - year: d.meta_year_i, - date: d.id.match(/\d{4}-\d{2}-\d{2}/)[0], - page__uids: d.page_id_ss, - newspaper__uid: d.meta_journal_s, - })) - ) - - eta.iterate() - debug(`import step ${i} / ${steps} completed, eta ${eta.format('{{etah}}')}!`) - - // debug(`'waterfall': start:${_solr.responseHeader.params.start}, rows:${_solr.responseHeader.params.rows}, numFound:${_solr.response.numFound}`); - } - // console.log(_solr.response.numFound); - // - // for( let doc of _solr.response.docs ){ - // console.log(doc) - // } - - // foreach loop awaiting - // for (let file of files) { - // const contents = await fs.readFile(file, 'utf8'); - // console.log(contents); - // } - - // - // const page = JSON.parse(fs.readFileSync(`${__dirname}/__pages/GDL-1811-11-22-a-p0001.json`, 'utf8')); - // // console.log(page); - // - // // import artuckes for this page - // // group regiosn by pOf - // const articles = _groupby(page.r, 'pOf'); - // console.log(articles); - // - // debug('merging articles...'); - // await query('articles', 'merge', _map(articles, (sections, uid) => { - // const regions = sections.reduce((acc, value) => { - // console.log(value) - // return acc.concat(value.c) - // }, []); - // // console.log(regions) - // // const paragraphs = sections.reduce((acc, value) => acc.concat(value.p[0].l[0].c), []); - // // get page id - // return { - // page__uid: 'GDL-1811-11-22-a-p0001', - // newspaper__uid: 'GDL', - // date: '1811-11-22', - // regions, - // uid, - // } - // }), limit = 100); - // - // debug('merge articles done.'); - // debug('calling APOC_set_issue__count_articles ...'); - // await apoc('articles', 'APOC_set_issue__count_articles'); - // debug('APOC_set_issue__count_articles done.'); - // debug('calling APOC_set_newspaper__count_articles ...'); - // await apoc('articles', 'APOC_set_newspaper__count_articles'); - // debug('APOC_set_newspaper__count_articles done.'); - // await apoc('pages', 'APOC_set_issue__count_pages'); - // debug('APOC_set_issue__count_pages done.'); - // await apoc('pages', 'APOC_set_newspaper__count_pages'); - // debug('APOC_set_newspaper__count_pages done.'); -} - -waterfall() - .then(res => { - debug('done, exit.') // prints 60 after 2 seconds. - process.exit() - }) - .catch(err => { - console.log(err) - process.exit() - }) diff --git a/scripts/import-issues.js b/scripts/import-issues.js deleted file mode 100644 index 2adc177b..00000000 --- a/scripts/import-issues.js +++ /dev/null @@ -1,83 +0,0 @@ -const { query, count, apoc, config } = require('./bulk') - -const debug = require('debug')('impresso/scripts:import-issues') -const solr = require('../src/solr').client(config.solr, config.solrConnectionPool) -const Eta = require('node-eta') - -debug('start!') -async function waterfall() { - debug('merging issues...') - const limit = 100 - const consumed = 0 - - let _solr = await solr.findAll({ - q: '*:*', - fl: 'id', - limit: 1, - offset: 0, - group_by: 'meta_issue_id_s', - }) - - const total = _solr.response.numFound - const steps = Math.ceil(total / limit) - const eta = new Eta(steps - consumed, true) - - for (let i = consumed; i < steps; i++) { - _solr = await solr.findAll({ - q: '*:*', - fl: 'id,meta_issue_id_s,meta_year_i', - limit, - offset: i * limit, - group_by: 'meta_issue_id_s', - }) - const issues = _solr.response.docs.map(d => { - const parts = d.groupValue.match(/^([A-Z]{3,})\-(\d{4}\-\d{2}\-\d{2})/) - return { - uid: d.groupValue, - year: d.doclist.docs[0].meta_year_i, - count_articles: d.doclist.numFound, - date: parts[2], - newspaper_uid: parts[1], - } - }) - - // use cypher - await query('issues', 'merge', issues, limit) - eta.iterate() - debug(`import step ${i} / ${steps} completed, eta ${eta.format('{{etah}}')}!`) - } - // - // await query('issues', 'merge', _solr.response.docs.map((group) => { - // console.log(pageUid, pageUid.match(/-p0+(\d+)$/)[1]); - // // console.log(pageUid.match(/^([a-zA-Z\d-]+)-p0+(\d+)$/)[1]); - // return { - // uid: pageUid, - // page_number: pageUid.match(/-p0+(\d+)$/)[1], - // issue_uid: pageUid.match(/^([a-zA-Z\d-]+)-p0+(\d+)$/)[1], - // }; - // }), limit); - // const merging = await merge('issues', (item) => { - // const date = item.uid.match(/[A-Z]{3,}\-(\d{4}\-\d{2}\-\d{2})/); - // return { - // uid: item.uid, - // year: item.year, - // date: date[1], - // newspaper_uid: item.newspaper_uid, - // }; - // }, 500); - // debug('merge issues done.'); - // debug('setting count ...'); - // - // await count('issues'); - // await apoc('issues', 'APOC_set_newspaper__count_issues'); -} - -waterfall() - .then(res => { - debug('done, exit.') // prints 60 after 2 seconds. - process.exit() - }) - .catch(err => { - console.log(err) - process.exit() - }) diff --git a/scripts/import-newspapers.js b/scripts/import-newspapers.js deleted file mode 100644 index f31b5e5a..00000000 --- a/scripts/import-newspapers.js +++ /dev/null @@ -1,27 +0,0 @@ -const { merge, count } = require('./bulk'); -const debug = require('debug')('impresso/scripts:import-newspapers'); - -debug('start!'); -async function waterfall() { - debug('merging newspapers...'); - const savenewspapers = await merge('newspapers', item => ({ - uid: item.uid, - acronym: item.uid, - name: item.title, - start_year: item.start_year, - end_year: item.end_year, - delta_year: item.end_year - item.start_year, - languages: item.languages.map(d => d.code), - }), 500); - debug('merge newspapers done.'); - debug('setting count ...'); - const savecount = await count('newspapers'); -} - -waterfall().then((res) => { - debug('done, exit.'); // prints 60 after 2 seconds. - process.exit(); -}).catch((err) => { - console.log(err); - process.exit(); -}); diff --git a/scripts/import-pages.js b/scripts/import-pages.js deleted file mode 100644 index 3b2082d0..00000000 --- a/scripts/import-pages.js +++ /dev/null @@ -1,26 +0,0 @@ -const { merge, count, apoc } = require('./bulk'); -const debug = require('debug')('impresso/scripts:import-pages'); - -debug('start!'); -async function waterfall() { - debug('merging pages...'); - const merging = await merge('pages', item => ({ - uid: item.uid, - page_number: item.page_number, - issue_uid: item.issue_uid, - }), 500); - debug('merge pages done.'); - debug('setting count ...'); - - await count('pages'); - await apoc('pages', 'APOC_set_issue__count_pages'); - await apoc('pages', 'APOC_set_newspaper__count_pages'); -} - -waterfall().then((res) => { - debug('done, exit.'); // prints 60 after 2 seconds. - process.exit(); -}).catch((err) => { - console.log(err); - process.exit(); -}); diff --git a/scripts/import-tags.js b/scripts/import-tags.js deleted file mode 100644 index 34974f9b..00000000 --- a/scripts/import-tags.js +++ /dev/null @@ -1,35 +0,0 @@ -// import tags from a well defined list of tags -const debug = require('debug')('impresso/scripts:import-tags'); -const tags = require('./tags'); -const slugify = require('slugify'); -const shash = require('short-hash'); -const { query, count } = require('./bulk'); - -debug('start!'); - -async function waterfall() { - debug('merging tags...'); - - await query('tags', 'merge', tags.map((d) => { - // generate uuid from source - d.slug = slugify(d.name); - d.provider_code = d.provider_code || shash(d.name); - d.uid = `${d.provider}-${d.provider_code}`; - d.Project = 'impresso'; - return d; - })); - - debug('merge tags done.'); - debug('SET project count_tags ...'); - - await count('tags'); - debug('SET project count_tags done.'); -} - -waterfall().then((res) => { - debug('done, exit.'); // prints 60 after 2 seconds. - process.exit(); -}).catch((err) => { - console.log(err); - process.exit(); -}); diff --git a/scripts/s3/import-regions.js b/scripts/s3/import-regions.js deleted file mode 100644 index 27be763c..00000000 --- a/scripts/s3/import-regions.js +++ /dev/null @@ -1,115 +0,0 @@ -/** - * Note: this won't work without the aws installed. - * We do not ship it with the package.json since it is a standalone task - * that probably need to be done only once. - * - * doc aws https://docs.aws.amazon.com/sdk-for-javascript/v2/developer-guide/welcome.html - */ -const { config, query } = require('../bulk'); -const aws = require('aws-sdk'); -const fs = require('fs'); -const _ = require('lodash'); -const { eachSeries } = require('async'); -const debug = require('debug')('impresso/scripts/s3:import-regions'); - -const s3 = new aws.S3({ - endpoint: config.s3.host, - accessKeyId: config.s3.accessKey, - secretAccessKey: config.s3.secretKey, -}); -const BUCKET = 'original-canonical-data'; -const LIMIT = 1000; - -debug(`s3.listObjects of BUCKET : '${BUCKET}' using host: ${config.s3.host}.`); - -fs.readFile('./.marker', (err, marker) => { - console.log(marker); - s3.listObjects({ - Bucket: BUCKET, - MaxKeys: LIMIT, - Marker: marker ? marker.toString() : undefined, - Prefix: 'GDL/195', - }, async (err, data) => { - if (err) { - console.log('err', err); - throw 'error in getting s3 data'; - } - debug(`s3.listObjects of BUCKET : '${BUCKET}' success! Marker: ${marker}, next marker: ${data.NextMarker}.`); - - // console.log('data', data.NextMarker, data); - let i = 0; - const l = data.Contents.length; - const keyWithErrors = []; - eachSeries(data.Contents, (d, cb) => { - // console.log('give me', d); - i += 1; - s3.getObject({ - Bucket: BUCKET, - Key: d.Key, - }, async (err, res) => { - if (err) { - return cb(err); - } - let body; - try { - body = JSON.parse(res.Body.toString()); - } catch (e) { - keyWithErrors.push(d.Key); - // console.log(d.Key,'body:', res.Body.toString()); - return cb(); - } - if (!body.r) { - return cb(); - } - - const versionId = res.VersionId; - const pageUid = d.Key.match(/\/([^.\/]+?)\.json$/)[1]; - - debug(`s3.listObjects eachSeries.getObject version: '${versionId}', pageUid:'${pageUid}', ${i}/${l}`); - - const pagesToArticles = _(body.r).groupBy('pOf').map((sections, articleUid) => { - // concatenate all regions in sections { c: [ 122, 1367, 953, 600 ],} - const regions = sections.reduce((acc, value) => acc.concat(value.c), []); - return { - page_uid: pageUid, - versionId, - uid: articleUid, - regions, - }; - }).value(); - debug(`s3.listObjects eachSeries.getObject saving REGIONS: ${pagesToArticles.length} ...`); - - const relationships = await query('articles', 'merge_regions', pagesToArticles, 100) - .catch((err) => { - cb(err); - }); - - debug(`s3.listObjects eachSeries.getObject saving REGIONS n: ${pagesToArticles.length} done!`); - - cb(); - }); - }, (err) => { - if (err) { console.log('err', err); } else { - // write marker to disk - if (!data.NextMarker) { - // console.log(data) - throw 'no next marker'; - } - fs.writeFileSync('./.marker', data.NextMarker); - if (keyWithErrors.length) { - fs.appendFileSync('./.keyWithErrors', `${keyWithErrors.join('\n')}\n`); - } - console.log('all good, next marker:', data.NextMarker); - } - process.exit(); - }); - // for(const file of data.Contents) { - // const result = await s3get({ - // Bucket: 'original-canonical-data', - // Key: file.Key - // }); - // - // console.log(result); - // } - }); -}); diff --git a/scripts/tags.json b/scripts/tags.json deleted file mode 100644 index 3547fb75..00000000 --- a/scripts/tags.json +++ /dev/null @@ -1,165 +0,0 @@ -[ - { - "name": "crime, law and justice", - "description": "Establishment and/or statement of the rules of behaviour in society, the enforcement of these rules, breaches of the rules and the punishment of offenders. Organisations and bodies involved in these activities.", - "provider": "iptc", - "url": "", - "provider_code": "02000000", - "used_for": "court report, police report", - "applies_to": ["text-content"] - }, - { - "name": "arts, culture and entertainment", - "description": "Matters pertaining to the advancement and refinement of the human mind, of interests, skills, tastes and emotions", - "provider": "iptc", - "url": "", - "provider_code": "01000000", - "used_for": "book review, theater annoncement, cinema programm", - "applies_to": ["text-content"] - }, - { - "name": "disaster, accident and emergency incident", - "description": "Man made and natural events resulting in loss of life or injury to living creatures and/or damage to inanimate objects or property.", - "provider": "iptc", - "url": "", - "provider_code": "03000000", - "applies_to": ["text-content"] - }, - { - "name": "economy, business and finance", - "description": "All matters concerning the planning, production and exchange of wealth.", - "provider": "iptc", - "url": "", - "provider_code": "04000000", - "used_for": "stock exchange information", - "applies_to": ["text-content"] - }, - { - "name": "education", - "description": "All aspects of furthering knowledge of human individuals from birth to death.", - "provider": "iptc", - "url": "", - "provider_code": "05000000", - "applies_to": ["text-content"] - }, - { - "name": "environment", - "description": "All aspects of protection, damage, and condition of the ecosystem of the planet earth and its surroundings.", - "provider": "iptc", - "url": "", - "provider_code": "06000000", - "applies_to": ["text-content"] - }, - { - "name": "health", - "description": "All aspects pertaining to the physical and mental welfare of human beings.", - "provider": "iptc", - "url": "", - "provider_code": "07000000", - "applies_to": ["text-content"] - }, - { - "name": "human interest", - "description": "Items about individuals, groups, animals, plants or other objects with a focus on emotional facets", - "provider": "iptc", - "url": "", - "provider_code": "08000000", - "applies_to": ["text-content"] - }, - { - "name": "labour", - "description": "Social aspects, organisations, rules and conditions affecting the employment of human effort for the generation of wealth or provision of services and the economic support of the unemployed.", - "provider": "iptc", - "url": "", - "provider_code": "09000000", - "applies_to": ["text-content"] - }, - { - "name": "lifestyle and leisure", - "description": "Activities undertaken for pleasure, relaxation or recreation outside paid employment, including eating and travel.", - "provider": "iptc", - "url": "", - "provider_code": "10000000", - "applies_to": ["text-content"] - }, - { - "name": "politics", - "description": "Local, regional, national and international exercise of power, or struggle for power, and the relationships between governing bodies and states.", - "provider": "iptc", - "url": "", - "provider_code": "11000000", - "applies_to": ["text-content"] - }, - { - "name": "religion and belief", - "description": "All aspects of human existence involving theology, philosophy, ethics and spirituality.", - "provider": "iptc", - "url": "", - "provider_code": "12000000", - "applies_to": ["text-content"] - }, - { - "name": "science and technology", - "description": "All aspects pertaining to human understanding of nature and the physical world and the development and application of this knowledge", - "provider": "iptc", - "url": "", - "provider_code": "13000000", - "applies_to": ["text-content"] - }, - { - "name": "society", - "description": "Aspects of the life of humans affecting its relationships", - "provider": "iptc", - "url": "", - "provider_code": "14000000", - "applies_to": ["text-content"] - }, - { - "name": "sport", - "description": "Competitive exercise involving physical effort. Organizations and bodies involved in these activities.", - "provider": "iptc", - "url": "", - "provider_code": "15000000", - "applies_to": ["text-content"] - }, - { - "name": "conflicts, war and peace", - "description": "Acts of socially or politically motivated protest and/or violence and actions to end them", - "provider": "iptc", - "url": "", - "provider_code": "16000000", - "applies_to": ["text-content"] - }, - { - "name": "weather", - "description": "The study, reporting and prediction of meteorological phenomena.", - "provider": "iptc", - "url": "", - "provider_code": "17000000", - "applies_to": ["text-content"] - }, - { - "name": "advertisement", - "provider": "bruniv", - "description": "A message printed in the newspaper in space paid for by the advertiser.", - "applies_to": ["layout"] - }, - { - "name": "comic strip", - "description": "Three-or four-panel drawings that tell a story, usually humorous.", - "provider": "bruniv", - "applies_to": ["layout"] - }, - { - "name": "editorial cartoon", - "description": "Drawing on editorial page employing exaggeration, satire, and symbolism.", - "provider": "bruniv", - "applies_to": ["layout"] - }, - { - "name": "feature story", - "description": "A story that goes further than straight news coverage, and usually focuses on the human interest elements of a situation or event. The feature story may be written to inform or entertain, and it can be on a multitude of topics from the trivial (students favorite singing groups) to the serious (teenage depression).", - "provider": "bruniv", - "applies_to": ["text-content"] - } -] diff --git a/scripts/update-facet-ranges.js b/scripts/update-facet-ranges.js deleted file mode 100644 index ea6411f3..00000000 --- a/scripts/update-facet-ranges.js +++ /dev/null @@ -1,48 +0,0 @@ -const fs = require('fs'); -const config = require('@feathersjs/configuration')()(); - -const solrClient = require('../src/solr').client(config.solr, config.solrConnectionPool); - -const { - SolrMappings, -} = require('../src/data/constants'); - -async function getFacetsRanges(index) { - const facetQueryPart = Object.entries(SolrMappings[index].facets) - .filter(([, { type }]) => type === 'range') - .reduce((acc, [facet, descriptor]) => { - acc[`${facet}__min`] = `min(${descriptor.field})`; - acc[`${facet}__max`] = `max(${descriptor.field})`; - return acc; - }, {}); - const query = { - 'json.facet': JSON.stringify(facetQueryPart), - rows: 0, - q: '*:*', - hl: false, - }; - const { facets = {} } = await solrClient.requestGetRaw(query, index); - - return Object.entries(facets || {}).reduce((acc, [key, value]) => { - if (key === 'count') return acc; - const [facetKey, field] = key.split('__'); - const nestedValue = acc[facetKey] || {}; - nestedValue[field] = value; - acc[facetKey] = nestedValue; - return acc; - }, {}); -} - -Promise.all(Object.keys(SolrMappings).map(async index => ({ - index, - facets: await getFacetsRanges(index), -}))).then((items) => { - const itemsMap = items.reduce((acc, { index, facets }) => { - acc[index] = facets; - return acc; - }, {}); - - const fileName = './data/facetRanges.json'; - fs.writeFileSync(fileName, JSON.stringify(itemsMap)); -}).then(() => { console.info('Done'); process.exit(0); }) - .catch((error) => { console.error(error.message); process.exit(1); }); diff --git a/scripts/update-topics-positions.js b/scripts/update-topics-positions.js deleted file mode 100644 index cc8a4f4b..00000000 --- a/scripts/update-topics-positions.js +++ /dev/null @@ -1,85 +0,0 @@ -const fs = require('fs'); -const debug = require('debug')('impresso/scripts:update-topics-positions'); -const Graph = require('graphology'); -const forceAtlas2 = require('graphology-layout-forceatlas2'); -const pagerank = require('graphology-pagerank'); -const louvain = require('graphology-communities-louvain'); -const hits = require('graphology-hits'); -const { circular } = require('graphology-layout'); -const topics = require('../data/topics.json'); - -const graph = new Graph(); - -graph.import({ - attributes: { - name: 'the awesome topic graph', - }, - nodes: Object.values(topics).map(topic => ({ - key: topic.uid, - attributes: { - x: topic.x, - y: topic.y, - weight: topic.countItems, - }, - })), - edges: Object.values(topics) - .map(topic => topic.relatedTopics.map(rel => ({ - source: topic.uid, - target: rel.uid, - attributes: { - weight: rel.w, - }, - }))).reduce((acc, d) => acc.concat(d), []), -}); - -debug('Number of nodes', graph.order); -debug('Number of edges', graph.size); - -const { x, y } = graph.getNodeAttributes(graph.nodes()[1]); -debug('Get x y of the first node:', x, y); - -if (!x && !y) { - debug('No initial xy, do circular layout first.'); - circular.assign(graph); -} - -const positions = forceAtlas2(graph, { - iterations: 100, - settings: { - gravity: 20, - linLogMode: false, - }, -}); - -const pageranks = pagerank(graph, { alpha: 0.9, weighted: true }); -const communities = louvain(graph); -const { hubs, authorities } = hits(graph, { normalize: false }); -// const degreesPerCommunity = groupBy(communities, 'uid'); - -debug('positions n.', Object.keys(pageranks).length); -debug('pageranks n.', Object.keys(pageranks).length); -debug('communities n.', Object.keys(communities).length); -debug('hubs n.', Object.keys(hubs).length); -debug('authorities n.', Object.keys(authorities).length); - - -Object.keys(positions).forEach((uid) => { - topics[uid].x = positions[uid].x; - topics[uid].y = positions[uid].y; - topics[uid].pagerank = pageranks[uid]; - topics[uid].community = communities[uid]; - topics[uid].hub = hubs[uid]; - topics[uid].authority = authorities[uid]; - - debug( - 'topic', uid, - '- x y:', topics[uid].x, topics[uid].y, - '- p:', topics[uid].pagerank, - '- c:', topics[uid].community, - ); -}); - -const filename = './data/topics.json'; - -fs.writeFileSync(filename, JSON.stringify(topics)); -debug(`success, saved ${filename}`); diff --git a/src/models/generated/schemas.d.ts b/src/models/generated/schemas.d.ts index 82e91a08..fa6fe9b4 100644 --- a/src/models/generated/schemas.d.ts +++ b/src/models/generated/schemas.d.ts @@ -561,9 +561,11 @@ export interface TopicWord { * Request body for the authentication endpoint */ export interface AuthenticationCreateRequest { - strategy: "local"; - email: string; - password: string; + strategy: "local" | "jwt-app"; + email?: string; + password?: string; + accessToken?: string; + [k: string]: unknown; } @@ -940,6 +942,185 @@ export interface TextReuseClusterDetails { } +/** + * Impresso NER entity + */ +export interface ImpressoNamedEntityRecognitionEntity { + /** + * ID of the entity + */ + id: string; + /** + * Type of the entity + */ + type: + | "comp.demonym" + | "comp.function" + | "comp.name" + | "comp.qualifier" + | "comp.title" + | "loc" + | "loc.add.elec" + | "loc.add.phys" + | "loc.adm.nat" + | "loc.adm.reg" + | "loc.adm.sup" + | "loc.adm.town" + | "loc.fac" + | "loc.oro" + | "loc.phys.astro" + | "loc.phys.geo" + | "loc.phys.hydro" + | "loc.unk" + | "org" + | "org.adm" + | "org.ent" + | "org.ent.pressagency" + | "pers" + | "pers.coll" + | "pers.ind" + | "pers.ind.articleauthor" + | "prod" + | "prod.doctr" + | "prod.media" + | "time" + | "time.date.abs" + | "time.hour.abs"; + /** + * Surface form of the entity + */ + surfaceForm: string; + offset: { + /** + * Start offset of the entity in the text + */ + start: number; + /** + * End offset of the entity in the text + */ + end: number; + }; + /** + * Whether the entity type is nested + */ + isTypeNested: boolean; + confidence: { + /** + * Confidence score for the named entity recognition + */ + ner: number; + /** + * Confidence score for the named entity linking + */ + nel?: number; + }; +} + + +/** + * Request body for the Impresso NER endpoint + */ +export interface ImpressoNamedEntityRecognitionRequest { + /** + * Text to be processed for named entity recognition + */ + text: string; +} + + +/** + * Response of the Impresso NER endpoint + */ +export interface ImpressoNamedEntityRecognitionResponse { + /** + * ID of the model used for the named entity recognition + */ + modelId: string; + /** + * Text processed for named entity recognition + */ + text: string; + /** + * Timestamp of when named entity recognition was performed + */ + timestamp: string; + entities: ImpressoNamedEntityRecognitionEntity[]; +} +/** + * Impresso NER entity + */ +export interface ImpressoNamedEntityRecognitionEntity { + /** + * ID of the entity + */ + id: string; + /** + * Type of the entity + */ + type: + | "comp.demonym" + | "comp.function" + | "comp.name" + | "comp.qualifier" + | "comp.title" + | "loc" + | "loc.add.elec" + | "loc.add.phys" + | "loc.adm.nat" + | "loc.adm.reg" + | "loc.adm.sup" + | "loc.adm.town" + | "loc.fac" + | "loc.oro" + | "loc.phys.astro" + | "loc.phys.geo" + | "loc.phys.hydro" + | "loc.unk" + | "org" + | "org.adm" + | "org.ent" + | "org.ent.pressagency" + | "pers" + | "pers.coll" + | "pers.ind" + | "pers.ind.articleauthor" + | "prod" + | "prod.doctr" + | "prod.media" + | "time" + | "time.date.abs" + | "time.hour.abs"; + /** + * Surface form of the entity + */ + surfaceForm: string; + offset: { + /** + * Start offset of the entity in the text + */ + start: number; + /** + * End offset of the entity in the text + */ + end: number; + }; + /** + * Whether the entity type is nested + */ + isTypeNested: boolean; + confidence: { + /** + * Confidence score for the named entity recognition + */ + ner: number; + /** + * Confidence score for the named entity linking + */ + nel?: number; + }; +} + + export type StatusOfTheCollection = string; /** diff --git a/scripts/generate-types.js b/src/scripts/generate-types.js similarity index 100% rename from scripts/generate-types.js rename to src/scripts/generate-types.js diff --git a/scripts/loadtests/README.md b/src/scripts/loadtests/README.md similarity index 59% rename from scripts/loadtests/README.md rename to src/scripts/loadtests/README.md index 739edd37..764dac20 100644 --- a/scripts/loadtests/README.md +++ b/src/scripts/loadtests/README.md @@ -5,9 +5,9 @@ Using [k6](https://k6.io/). Remember to **disable cache** in config before testing. ```shell -k6 run scripts/loadtests/articleSearch.js +k6 run dist/scripts/loadtests/articleSearch.js ``` ```shell -k6 run scripts/loadtests/embeddings.js +k6 run dist/scripts/loadtests/embeddings.js ``` diff --git a/scripts/loadtests/articleSearch.js b/src/scripts/loadtests/articleSearch.js similarity index 74% rename from scripts/loadtests/articleSearch.js rename to src/scripts/loadtests/articleSearch.js index 8fccee84..d2658f90 100644 --- a/scripts/loadtests/articleSearch.js +++ b/src/scripts/loadtests/articleSearch.js @@ -1,35 +1,37 @@ -import http from 'k6/http'; -import { check, sleep } from 'k6'; +/* eslint-disable max-len */ +import http from 'k6/http' +import { check, sleep } from 'k6' export const options = { - stages: [2, 3, 4, 5, 6] - .map(target => ({ duration: '15s', target: target * 20 })), -}; + stages: [2, 3, 4, 5, 6].map(target => ({ duration: '15s', target: target * 20 })), +} const tokens = ` Le 29 février 2020, le premier cas de covid-19 au Grand-Duché était officiellement annoncé par le ministère de la Santé. Mais selon les scientifiques du List, cette contamination est intervenue quelques jours après la présence effective du coronavirus dans le pays, puisque les premières traces détectées sont comprises dans un délai situé «entre le 12 et le 25 février», Henry-Michel Cauchie, responsable de l'étude Coronastep au sein du List, qui estime que le virus était présent «relativement tôt». Une affirmation basée sur l'analyse plus approfondie d'anciens échantillons d'eau usée, datant d'avril 2019. Si les recherches étaient centrées sur le norovirus, à l'origine de la grippe intestinale, les résultats ont démontré que le covid-19 avait atteint le Luxembourg bien avant le premier décès, recensé le 13 mars dernier. Jour où le pays enregistrait officiellement 26 cas. -`.split(' '); +`.split(' ') export default function main() { - const randomIndexes = [1, 2, 3].map(() => Math.round(Math.random() * tokens.length)); + const randomIndexes = [1, 2, 3].map(() => Math.round(Math.random() * tokens.length)) const queryParameters = { group_by: 'articles', limit: 1, 'filters[0][type]': 'string', 'filters[0][operator]': 'AND', - }; + } randomIndexes.forEach((idx, i) => { - queryParameters[`filters[0][q][${i}]`] = encodeURIComponent(tokens[idx]); - }); - const qs = Object.entries(queryParameters).map(([k, v]) => `${k}=${v}`).join('&'); + queryParameters[`filters[0][q][${i}]`] = encodeURIComponent(tokens[idx]) + }) + const qs = Object.entries(queryParameters) + .map(([k, v]) => `${k}=${v}`) + .join('&') - const url = `http://localhost:3030/search?${qs}`; + const url = `http://localhost:3030/search?${qs}` // const url = `http://dev.impresso-project.ch/api/search?${qs}`; // console.log(url); - const res = http.get(url); - check(res, { 'status was 200': r => r.status === 200 }); - sleep(1); + const res = http.get(url) + check(res, { 'status was 200': r => r.status === 200 }) + sleep(1) } diff --git a/scripts/loadtests/embeddings.js b/src/scripts/loadtests/embeddings.js similarity index 65% rename from scripts/loadtests/embeddings.js rename to src/scripts/loadtests/embeddings.js index 05307496..5f0dd99c 100644 --- a/scripts/loadtests/embeddings.js +++ b/src/scripts/loadtests/embeddings.js @@ -1,36 +1,37 @@ -import http from 'k6/http'; -import { check, sleep } from 'k6'; +/* eslint-disable max-len */ +import http from 'k6/http' +import { check, sleep } from 'k6' export const options = { - stages: [2, 3, 4, 5, 6] - .map(target => ({ duration: '15s', target: target * 20 })), -}; + stages: [2, 3, 4, 5, 6].map(target => ({ duration: '15s', target: target * 20 })), +} const tokens = ` Le 29 février 2020, le premier cas de covid-19 au Grand-Duché était officiellement annoncé par le ministère de la Santé. Mais selon les scientifiques du List, cette contamination est intervenue quelques jours après la présence effective du coronavirus dans le pays, puisque les premières traces détectées sont comprises dans un délai situé «entre le 12 et le 25 février», Henry-Michel Cauchie, responsable de l'étude Coronastep au sein du List, qui estime que le virus était présent «relativement tôt». Une affirmation basée sur l'analyse plus approfondie d'anciens échantillons d'eau usée, datant d'avril 2019. Si les recherches étaient centrées sur le norovirus, à l'origine de la grippe intestinale, les résultats ont démontré que le covid-19 avait atteint le Luxembourg bien avant le premier décès, recensé le 13 mars dernier. Jour où le pays enregistrait officiellement 26 cas. - `.split(' '); + `.split(' ') function getRandomToken() { - const randomIndex = Math.floor(Math.random() * tokens.length); - const token = tokens[randomIndex].replace(/[^A-zÀ-ÿ]/g, ''); - if (token.length > 0) return token; - return getRandomToken(); + const randomIndex = Math.floor(Math.random() * tokens.length) + const token = tokens[randomIndex].replace(/[^A-zÀ-ÿ]/g, '') + if (token.length > 0) return token + return getRandomToken() } export default function test() { - const queryParameters = { language: 'fr', q: encodeURIComponent(getRandomToken()), - }; - const qs = Object.entries(queryParameters).map(([k, v]) => `${k}=${v}`).join('&'); + } + const qs = Object.entries(queryParameters) + .map(([k, v]) => `${k}=${v}`) + .join('&') - const url = `http://localhost:3030/embeddings?${qs}`; + const url = `http://localhost:3030/embeddings?${qs}` // const url = `http://dev.impresso-project.ch/api/embeddings?${qs}`; // console.log(url); - const res = http.get(url); - check(res, { 'status was 200': r => r.status === 200 }); - sleep(1); + const res = http.get(url) + check(res, { 'status was 200': r => r.status === 200 }) + sleep(1) } diff --git a/scripts/loadtests/suggestions.js b/src/scripts/loadtests/suggestions.js similarity index 62% rename from scripts/loadtests/suggestions.js rename to src/scripts/loadtests/suggestions.js index f82beef2..8c598b98 100644 --- a/scripts/loadtests/suggestions.js +++ b/src/scripts/loadtests/suggestions.js @@ -1,41 +1,42 @@ -import http from 'k6/http'; -import { check, sleep } from 'k6'; +/* eslint-disable max-len */ +import http from 'k6/http' +import { check, sleep } from 'k6' export const options = { - stages: [2, 3, 4, 5, 6] - .map(target => ({ duration: '15s', target: target * 20 })), -}; + stages: [2, 3, 4, 5, 6].map(target => ({ duration: '15s', target: target * 20 })), +} const tokens = ` Le 29 février 2020, le premier cas de covid-19 au Grand-Duché était officiellement annoncé par le ministère de la Santé. Mais selon les scientifiques du List, cette contamination est intervenue quelques jours après la présence effective du coronavirus dans le pays, puisque les premières traces détectées sont comprises dans un délai situé «entre le 12 et le 25 février», Henry-Michel Cauchie, responsable de l'étude Coronastep au sein du List, qui estime que le virus était présent «relativement tôt». Une affirmation basée sur l'analyse plus approfondie d'anciens échantillons d'eau usée, datant d'avril 2019. Si les recherches étaient centrées sur le norovirus, à l'origine de la grippe intestinale, les résultats ont démontré que le covid-19 avait atteint le Luxembourg bien avant le premier décès, recensé le 13 mars dernier. Jour où le pays enregistrait officiellement 26 cas. - `.split(' '); + `.split(' ') function getRandomToken() { - const randomIndex = Math.floor(Math.random() * tokens.length); - const token = tokens[randomIndex].replace(/[^A-zÀ-ÿ]/g, ''); - if (token.length < 2) return getRandomToken(); + const randomIndex = Math.floor(Math.random() * tokens.length) + const token = tokens[randomIndex].replace(/[^A-zÀ-ÿ]/g, '') + if (token.length < 2) return getRandomToken() - let randomLength = Math.floor(Math.random() * token.length); - randomLength = randomLength < 2 ? 2 : randomLength; - const randomStart = Math.floor(Math.random() * (token.length - randomLength)); + let randomLength = Math.floor(Math.random() * token.length) + randomLength = randomLength < 2 ? 2 : randomLength + const randomStart = Math.floor(Math.random() * (token.length - randomLength)) - return token.slice(randomStart, randomStart + randomLength); + return token.slice(randomStart, randomStart + randomLength) } export default function test() { - const queryParameters = { q: encodeURIComponent(getRandomToken()), - }; + } // console.log('**', JSON.stringify(queryParameters)); - const qs = Object.entries(queryParameters).map(([k, v]) => `${k}=${v}`).join('&'); + const qs = Object.entries(queryParameters) + .map(([k, v]) => `${k}=${v}`) + .join('&') - const url = `http://localhost:3030/suggestions?${qs}`; + const url = `http://localhost:3030/suggestions?${qs}` // const url = `http://dev.impresso-project.ch/api/suggestions?${qs}`; // console.log(url); - const res = http.get(url); - check(res, { 'status was 200': r => r.status === 200 }); - sleep(1); + const res = http.get(url) + check(res, { 'status was 200': r => r.status === 200 }) + sleep(1) } diff --git a/src/scripts/update-facet-ranges.js b/src/scripts/update-facet-ranges.js new file mode 100644 index 00000000..3bf0170e --- /dev/null +++ b/src/scripts/update-facet-ranges.js @@ -0,0 +1,56 @@ +const fs = require('fs') +const config = require('@feathersjs/configuration')()() + +const solrClient = require('../solr').client(config.solr, config.solrConnectionPool) + +const { SolrMappings } = require('../data/constants') + +async function getFacetsRanges(index) { + const facetQueryPart = Object.entries(SolrMappings[index].facets) + .filter(([, { type }]) => type === 'range') + .reduce((acc, [facet, descriptor]) => { + acc[`${facet}__min`] = `min(${descriptor.field})` + acc[`${facet}__max`] = `max(${descriptor.field})` + return acc + }, {}) + const query = { + 'json.facet': JSON.stringify(facetQueryPart), + rows: 0, + q: '*:*', + hl: false, + } + const { facets = {} } = await solrClient.requestGetRaw(query, index) + + return Object.entries(facets || {}).reduce((acc, [key, value]) => { + if (key === 'count') return acc + const [facetKey, field] = key.split('__') + const nestedValue = acc[facetKey] || {} + nestedValue[field] = value + acc[facetKey] = nestedValue + return acc + }, {}) +} + +Promise.all( + Object.keys(SolrMappings).map(async index => ({ + index, + facets: await getFacetsRanges(index), + })) +) + .then(items => { + const itemsMap = items.reduce((acc, { index, facets }) => { + acc[index] = facets + return acc + }, {}) + + const fileName = './data/facetRanges.json' + fs.writeFileSync(fileName, JSON.stringify(itemsMap)) + }) + .then(() => { + console.info('Done') // eslint-disable-line no-console + process.exit(0) + }) + .catch(error => { + console.error(error.message) + process.exit(1) + }) diff --git a/scripts/update-newspapers.js b/src/scripts/update-newspapers.js similarity index 86% rename from scripts/update-newspapers.js rename to src/scripts/update-newspapers.js index fe451991..8e14cf92 100644 --- a/scripts/update-newspapers.js +++ b/src/scripts/update-newspapers.js @@ -3,11 +3,11 @@ const fs = require('fs') const lodash = require('lodash') const debug = require('debug')('impresso/scripts:update-data') const config = require('@feathersjs/configuration')()() -const sequelizeClient = require('../src/sequelize').client(config.sequelize) -const solrClient = require('../src/solr').client(config.solr, config.solrConnectionPool) +const sequelizeClient = require('../sequelize').client(config.sequelize) +const solrClient = require('../solr').client(config.solr, config.solrConnectionPool) -const Newspaper = require('../src/models/newspapers.model') -const Issue = require('../src/models/issues.model') +const Newspaper = require('../models/newspapers.model') +const Issue = require('../models/issues.model') debug('start!') @@ -43,7 +43,7 @@ async function waterfall() { }) }) .catch(err => { - console.log(err) + console.log(err) // eslint-disable-line no-console throw err }) @@ -92,7 +92,7 @@ async function waterfall() { }) }) .catch(err => { - console.log(err) + console.log(err) // eslint-disable-line no-console }) debug('saving', Object.keys(newspapers).length, 'newspapers...') @@ -109,6 +109,6 @@ waterfall() process.exit(0) }) .catch(err => { - console.log(err) + console.log(err) // eslint-disable-line no-console process.exit(1) }) diff --git a/src/scripts/update-topics-positions.js b/src/scripts/update-topics-positions.js new file mode 100644 index 00000000..65314f74 --- /dev/null +++ b/src/scripts/update-topics-positions.js @@ -0,0 +1,92 @@ +const fs = require('fs') +const debug = require('debug')('impresso/scripts:update-topics-positions') +const Graph = require('graphology') +const forceAtlas2 = require('graphology-layout-forceatlas2') +const pagerank = require('graphology-pagerank') +const louvain = require('graphology-communities-louvain') +const hits = require('graphology-hits') +const { circular } = require('graphology-layout') +const topics = require('../../data/topics.json') + +const graph = new Graph() + +graph.import({ + attributes: { + name: 'the awesome topic graph', + }, + nodes: Object.values(topics).map(topic => ({ + key: topic.uid, + attributes: { + x: topic.x, + y: topic.y, + weight: topic.countItems, + }, + })), + edges: Object.values(topics) + .map(topic => + topic.relatedTopics.map(rel => ({ + source: topic.uid, + target: rel.uid, + attributes: { + weight: rel.w, + }, + })) + ) + .reduce((acc, d) => acc.concat(d), []), +}) + +debug('Number of nodes', graph.order) +debug('Number of edges', graph.size) + +const { x, y } = graph.getNodeAttributes(graph.nodes()[1]) +debug('Get x y of the first node:', x, y) + +if (!x && !y) { + debug('No initial xy, do circular layout first.') + circular.assign(graph) +} + +const positions = forceAtlas2(graph, { + iterations: 100, + settings: { + gravity: 20, + linLogMode: false, + }, +}) + +const pageranks = pagerank(graph, { alpha: 0.9, weighted: true }) +const communities = louvain(graph) +const { hubs, authorities } = hits(graph, { normalize: false }) +// const degreesPerCommunity = groupBy(communities, 'uid'); + +debug('positions n.', Object.keys(pageranks).length) +debug('pageranks n.', Object.keys(pageranks).length) +debug('communities n.', Object.keys(communities).length) +debug('hubs n.', Object.keys(hubs).length) +debug('authorities n.', Object.keys(authorities).length) + +Object.keys(positions).forEach(uid => { + topics[uid].x = positions[uid].x + topics[uid].y = positions[uid].y + topics[uid].pagerank = pageranks[uid] + topics[uid].community = communities[uid] + topics[uid].hub = hubs[uid] + topics[uid].authority = authorities[uid] + + debug( + 'topic', + uid, + '- x y:', + topics[uid].x, + topics[uid].y, + '- p:', + topics[uid].pagerank, + '- c:', + topics[uid].community + ) +}) + +const filename = './data/topics.json' + +fs.writeFileSync(filename, JSON.stringify(topics)) +debug(`success, saved ${filename}`) diff --git a/scripts/update-topics-related.js b/src/scripts/update-topics-related.js similarity index 97% rename from scripts/update-topics-related.js rename to src/scripts/update-topics-related.js index e5c71a13..51c31bc9 100644 --- a/scripts/update-topics-related.js +++ b/src/scripts/update-topics-related.js @@ -3,8 +3,8 @@ const path = require('path') const { chunk } = require('lodash') const debug = require('debug')('impresso/scripts:update-topics-related') const Eta = require('node-eta') -const app = require('../src/app') -const topics = require('../data/topics.json') +const app = require('../app') +const topics = require('../../data/topics.json') const Threshold = parseFloat(process.env.THRESHOLD || 0.5) const RelatedThreshold = parseFloat(process.env.RELATED_THRESHOLD || 0.1) @@ -14,7 +14,7 @@ const LimitRelatedTopics = 300 const RelatedTopicsChunkSize = parseInt(process.env.CHUNK_SIZE || 2, 10) const initialTopicUids = process.env.TOPICS ? process.env.TOPICS.split(',') : [] // topics filename, for fs; -const filename = path.join(__dirname, '../data/topics.json') +const filename = path.join(__dirname, '../../data/topics.json') // get all topics where is greater than threshold let topicUids = Object.keys(topics) diff --git a/scripts/update-topics.js b/src/scripts/update-topics.js similarity index 92% rename from scripts/update-topics.js rename to src/scripts/update-topics.js index 7ae917a0..a0cb9095 100644 --- a/scripts/update-topics.js +++ b/src/scripts/update-topics.js @@ -3,8 +3,8 @@ const fs = require('fs') const lodash = require('lodash') const debug = require('debug')('impresso/scripts:update-data') const config = require('@feathersjs/configuration')()() -const solrClient = require('../src/solr').client(config.solr, config.solrConnectionPool) -const Topic = require('../src/models/topics.model') +const solrClient = require('../solr').client(config.solr, config.solrConnectionPool) +const Topic = require('../models/topics.model') debug('start!') @@ -82,6 +82,6 @@ waterfall() process.exit(0) }) .catch(err => { - console.log(err) + console.log(err) // eslint-disable-line no-console process.exit(1) }) diff --git a/scripts/update-years.js b/src/scripts/update-years.js similarity index 84% rename from scripts/update-years.js rename to src/scripts/update-years.js index f055467a..9b464810 100644 --- a/scripts/update-years.js +++ b/src/scripts/update-years.js @@ -5,12 +5,9 @@ const fs = require('fs') const debug = require('debug')('impresso/scripts:update-timelines') const config = require('@feathersjs/configuration')()() -const solrClient = require('../src/solr').client( - config.solr, - config.solrConnectionPool -) +const solrClient = require('../solr').client(config.solr, config.solrConnectionPool) -const Year = require('../src/models/years.model') +const Year = require('../models/years.model') debug('start!') @@ -31,7 +28,7 @@ async function waterfall() { }), namespace: 'search', }) - .then((res) => + .then(res => res.facets.year.buckets.reduce((acc, bucket) => { // save a dictionary year:Year instance acc[bucket.val] = new Year({ @@ -59,8 +56,8 @@ async function waterfall() { }), namespace: 'search', }) - .then((res) => - res.facets.year.buckets.forEach((bucket) => { + .then(res => + res.facets.year.buckets.forEach(bucket => { // save to the dictionary year:Year instance years[bucket.val].refs.a = parseFloat(bucket.count) }, {}) @@ -81,14 +78,14 @@ async function waterfall() { }), namespace: 'images', }) - .then((res) => - res.facets.year.buckets.forEach((bucket) => { + .then(res => + res.facets.year.buckets.forEach(bucket => { // save to the dictionary year:Year instance years[bucket.val].refs.m = parseFloat(bucket.count) }, {}) ) - console.log(years) + console.log(years) // eslint-disable-line no-console debug('saving', Object.keys(years).length, 'years ...') @@ -102,7 +99,7 @@ waterfall() debug('done, exit.') // prints 60 after 2 seconds. process.exit(0) }) - .catch((err) => { - console.log(err) + .catch(err => { + console.log(err) // eslint-disable-line no-console process.exit(1) }) From 5d5029abafc5f8c413ef8f3b051f848bcaedf905 Mon Sep 17 00:00:00 2001 From: Roman Kalyakin Date: Fri, 4 Oct 2024 12:56:01 +0200 Subject: [PATCH 3/4] removed scripts folder from dockerfile --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index c11a85a0..4eb4fba5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,6 @@ RUN npm run copy-files RUN ls -la ./dist COPY public ./public -COPY scripts ./scripts RUN mkdir -p config COPY ./config/default.json ./config From fc19bdcf8511507b43daca2ebff7574bfdff0073 Mon Sep 17 00:00:00 2001 From: Roman Kalyakin Date: Mon, 7 Oct 2024 19:57:29 +0200 Subject: [PATCH 4/4] parse result before returning job task ID (#428) --- src/services/jobs/jobs.class.js | 146 +++++++++++++++++--------------- 1 file changed, 80 insertions(+), 66 deletions(-) diff --git a/src/services/jobs/jobs.class.js b/src/services/jobs/jobs.class.js index a126a021..eac47fdf 100644 --- a/src/services/jobs/jobs.class.js +++ b/src/services/jobs/jobs.class.js @@ -1,108 +1,122 @@ /* eslint-disable no-unused-vars */ -const debug = require('debug')('impresso/services:jobs'); -const { BadGateway, NotFound, NotImplemented } = require('@feathersjs/errors'); -const SequelizeService = require('../sequelize.service'); -const { STATUS_KILLED, STATUS_DONE } = require('../../models/jobs.model'); -const { measureTime } = require('../../util/instruments'); +const debug = require('debug')('impresso/services:jobs') +const { BadGateway, NotFound, NotImplemented } = require('@feathersjs/errors') +const SequelizeService = require('../sequelize.service') +const { STATUS_KILLED, STATUS_DONE } = require('../../models/jobs.model') +const { measureTime } = require('../../util/instruments') class Service { - constructor (options) { - this.options = options; + constructor(options) { + this.options = options } - setup (app) { - this.app = app; - this.name = 'jobs'; + setup(app) { + this.app = app + this.name = 'jobs' this.sequelizeService = new SequelizeService({ app, name: this.name, - }); + }) } - async find (params) { + async find(params) { const where = { creatorId: params.user.id, - }; + } - return measureTime(() => this.sequelizeService.find({ - query: { - ...params.query, - }, - where, - }), 'jobs.find.db.find'); + return measureTime( + () => + this.sequelizeService.find({ + query: { + ...params.query, + }, + where, + }), + 'jobs.find.db.find' + ) } - async get (id, params) { + async get(id, params) { const where = { id, - }; + } if (params.user.uid) { - where['$creator.profile.uid$'] = params.user.uid; + where['$creator.profile.uid$'] = params.user.uid } else { - where.creatorId = params.user.id; + where.creatorId = params.user.id } - return measureTime(() => this.sequelizeService.get(id, { where }) - .then(job => job.toJSON()), 'jobs.get.db.get'); + return measureTime(() => this.sequelizeService.get(id, { where }).then(job => job.toJSON()), 'jobs.get.db.get') } - async create (data, params) { + async create(data, params) { // create a test job - const client = this.app.get('celeryClient'); + const client = this.app.get('celeryClient') if (!client) { - throw new BadGateway('celery is not ready'); + throw new BadGateway('celery is not ready') } - debug(`create '${this.name}', test task`); + debug(`create '${this.name}', test task`) - return client.run({ - task: 'impresso.tasks.test', - args: [ - // user id - params.user.id, - ], - }).catch((err) => { - if (err.result.exc_type === 'DoesNotExist') { - throw new NotFound(err.result.exc_message); - } else if (err.result.exc_type === 'OperationalError') { - // probably db is not availabe - throw new NotImplemented(); - } - throw new NotImplemented(); - }); + return client + .run({ + task: 'impresso.tasks.test', + args: [ + // user id + params.user.id, + ], + }) + .then(result => { + return { taskId: result.taskId } + }) + .catch(err => { + if (err.result.exc_type === 'DoesNotExist') { + throw new NotFound(err.result.exc_message) + } else if (err.result.exc_type === 'OperationalError') { + // probably db is not availabe + throw new NotImplemented() + } + throw new NotImplemented() + }) } - async update (id, data, params) { - return data; + async update(id, data, params) { + return data } - async patch (id, data, params) { + async patch(id, data, params) { const where = { creatorId: params.user.id, - }; - debug(`[patch] id:${id}, params.user.uid:${params.user.uid}, where:`, where); - return this.sequelizeService.patch(id, { - status: data.sanitized.status, - }, { where }); + } + debug(`[patch] id:${id}, params.user.uid:${params.user.uid}, where:`, where) + return this.sequelizeService.patch( + id, + { + status: data.sanitized.status, + }, + { where } + ) } - async remove (id, params) { - debug(`[remove] id:${id}, params.user.uid:${params.user.uid}`); - return this.sequelizeService.bulkRemove({ - id, - creatorId: params.user.id, - status: [STATUS_KILLED, STATUS_DONE], - }).then(removed => ({ - params: { + async remove(id, params) { + debug(`[remove] id:${id}, params.user.uid:${params.user.uid}`) + return this.sequelizeService + .bulkRemove({ id, - }, - removed, - })); + creatorId: params.user.id, + status: [STATUS_KILLED, STATUS_DONE], + }) + .then(removed => ({ + params: { + id, + }, + removed, + })) } } module.exports = function (options) { - return new Service(options); -}; + return new Service(options) +} -module.exports.Service = Service; +module.exports.Service = Service