From 28b162ca061e483501a68016e3af9dcc07c35d6f Mon Sep 17 00:00:00 2001 From: Roman Kalyakin Date: Fri, 16 Aug 2024 15:52:44 +0200 Subject: [PATCH] [feature] Entities Wikidata id filter (#406) * added patch endpoint to collectable items * small change to make it redeploy * added wikidata id filter --- README.md | 1 + package-lock.json | 17 +- package.json | 12 +- src/configuration.ts | 3 + .../{entities.class.js => entities.class.ts} | 113 ++++-- src/services/entities/entities.hooks.ts | 8 +- src/services/entities/util.ts | 30 ++ src/util/solr/filterReducers.js | 371 +++++++++--------- src/util/solr/solrFilters.yml | 3 + 9 files changed, 327 insertions(+), 231 deletions(-) rename src/services/entities/{entities.class.js => entities.class.ts} (57%) create mode 100644 src/services/entities/util.ts diff --git a/README.md b/README.md index a2f91581..1d834a28 100644 --- a/README.md +++ b/README.md @@ -261,3 +261,4 @@ The 'impresso - Media Monitoring of the Past' project is funded by the Swiss Nat Copyright (C) 2020 The _impresso_ team. Contributors to this program include: [Daniele Guido](https://github.com/danieleguido), [Roman Kalyakin](https://github.com/theorm), [Thijs van Beek](https://github.com/tvanbeek). This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. See the [GNU Affero General Public License](https://github.com/impresso/impresso-middle-layer/blob/master/LICENSE) for more details. + diff --git a/package-lock.json b/package-lock.json index a66980da..c92269ab 100644 --- a/package-lock.json +++ b/package-lock.json @@ -55,7 +55,7 @@ "graphology-pagerank": "^1.1.0", "helmet": "^3.21.1", "http-proxy-middleware": "^2.0.1", - "impresso-jscommons": "https://github.com/impresso/impresso-jscommons/tarball/v1.4.1", + "impresso-jscommons": "https://github.com/impresso/impresso-jscommons/tarball/v1.4.3", "json2csv": "^4.3.3", "jsonschema": "^1.4.1", "lodash": "^4.17.21", @@ -6135,9 +6135,10 @@ } }, "node_modules/google-protobuf": { - "version": "3.10.0", - "resolved": "https://registry.npmjs.org/google-protobuf/-/google-protobuf-3.10.0.tgz", - "integrity": "sha512-d0cMO8TJ6xtB/WrVHCv5U81L2ulX+aCD58IljyAN6mHwdHHJ2jbcauX5glvivi3s3hx7EYEo7eUA9WftzamMnw==" + "version": "3.21.2", + "resolved": "https://registry.npmjs.org/google-protobuf/-/google-protobuf-3.21.2.tgz", + "integrity": "sha512-3MSOYFO5U9mPGikIYCzK0SaThypfGgS6bHqrUGXG3DPHCrb+txNqeEcns1W0lkGfk0rCyNXm7xB9rMxnCiZOoA==", + "license": "(BSD-3-Clause AND Apache-2.0)" }, "node_modules/gopd": { "version": "1.0.1", @@ -6671,13 +6672,13 @@ } }, "node_modules/impresso-jscommons": { - "version": "1.4.1", - "resolved": "https://github.com/impresso/impresso-jscommons/tarball/v1.4.1", - "integrity": "sha512-SdNvICAXLAtjLIoGWsTYguHVvYqNkmvQlwQrnhALhx8oAOTOi8z0/Rw+e75Bo2DHjJQTkfdzu+fy/wlWREbZjQ==", + "version": "1.4.3", + "resolved": "https://github.com/impresso/impresso-jscommons/tarball/v1.4.3", + "integrity": "sha512-Kawk9rmZ6mSr5gKMWRZaeyIsfCrmv5ytD80BwlFSaYZzcMLPYs4f3EDMpcGKFXgGMupwwMIJSHXhQSikaY0hBA==", "dependencies": { "base64-js": "1.3.1", "case": "1.6.2", - "google-protobuf": "3.10.0" + "google-protobuf": "3.21.2" } }, "node_modules/imurmurhash": { diff --git a/package.json b/package.json index ed267dd7..afdda5ef 100644 --- a/package.json +++ b/package.json @@ -94,7 +94,7 @@ "graphology-pagerank": "^1.1.0", "helmet": "^3.21.1", "http-proxy-middleware": "^2.0.1", - "impresso-jscommons": "https://github.com/impresso/impresso-jscommons/tarball/v1.4.1", + "impresso-jscommons": "https://github.com/impresso/impresso-jscommons/tarball/v1.4.3", "json2csv": "^4.3.3", "jsonschema": "^1.4.1", "lodash": "^4.17.21", @@ -131,13 +131,13 @@ }, "devDependencies": { "@openapi-contrib/json-schema-to-openapi-schema": "3.0.1", + "@stoplight/spectral-cli": "6.11.1", + "@stoplight/spectral-owasp-ruleset": "2.0.1", "@types/cache-manager": "^2.10.3", - "@types/ioredis": "^4.28.5", "@types/generic-pool": "^3.1.9", + "@types/ioredis": "^4.28.5", "@types/mocha": "10.0.6", "@types/node-fetch": "^2.5.6", - "@stoplight/spectral-cli": "6.11.1", - "@stoplight/spectral-owasp-ruleset": "2.0.1", "eslint": "^8.18.0", "eslint-config-standard": "^17.0.0", "eslint-plugin-import": "^2.26.0", @@ -147,8 +147,8 @@ "mocha": "10.4.0", "nodemon": "3.1.0", "prettier": "3.2.5", + "ts-node": "10.9.2", "typescript": "5.4.3", - "typescript-cp": "0.1.9", - "ts-node": "10.9.2" + "typescript-cp": "0.1.9" } } diff --git a/src/configuration.ts b/src/configuration.ts index 4e8d1b45..6e1d1a51 100644 --- a/src/configuration.ts +++ b/src/configuration.ts @@ -97,6 +97,9 @@ export interface Configuration { useDbUserInRequestContext?: boolean problemUriBase?: string features?: FeaturesConfiguration + paginate?: PaginationConfiguration + + // TODO: move to services: authentication: LocalAuthenticationConfiguration /** * Configuration for the auth strategy in Public API diff --git a/src/services/entities/entities.class.js b/src/services/entities/entities.class.ts similarity index 57% rename from src/services/entities/entities.class.js rename to src/services/entities/entities.class.ts index 7068f10d..087a7dc3 100644 --- a/src/services/entities/entities.class.js +++ b/src/services/entities/entities.class.ts @@ -1,3 +1,11 @@ +import { CachedSolrClient } from '../../cachedSolr' +import { ImpressoApplication } from '../../types' +import { Service as SequelizeService } from '../sequelize.service' +import User from '../../models/users.model' +import { Params } from '@feathersjs/feathers' +import { Filter } from 'impresso-jscommons' +import { buildSequelizeWikidataIdFindEntitiesCondition, sortFindEntitiesFilters } from './util' + /* eslint-disable no-unused-vars */ const debug = require('debug')('impresso/services:entities') const lodash = require('lodash') @@ -7,35 +15,88 @@ const { NotFound } = require('@feathersjs/errors') const wikidata = require('../wikidata') const Entity = require('../../models/entities.model') -const SequelizeService = require('../sequelize.service') const { measureTime } = require('../../util/instruments') const { buildSearchEntitiesSolrQuery } = require('./logic') +interface Sanitized { + sanitized: T + originalQuery: any +} + +interface WithUser { + user?: User +} + +interface FindQuery { + filters: Filter[] + limit?: number + offset?: number + order_by?: string + resolve?: string +} + class Service { - constructor({ app }) { + app: ImpressoApplication + name: string + sequelizeService: SequelizeService + solr: CachedSolrClient + + constructor({ app }: { app: ImpressoApplication }) { this.app = app this.name = 'entities' this.sequelizeService = new SequelizeService({ - app, + app: app as any as null, name: this.name, }) - /** @type {import('../../cachedSolr').CachedSolrClient} */ this.solr = app.service('cachedSolr') } - async create(data, params) { + async create(data: any, params: any) { params.query = data return this.find(params) } - async find(params) { - debug('[find] with params:', params.query) + async find(params: Params & Sanitized & WithUser) { + const qp = params.query! + debug('[find] with params:', qp) + + // split filters into solr and sequelize filters + const { solrFilters, sequelizeFilters } = sortFindEntitiesFilters(qp.filters) + // build sequelize condition for wikidata IDs + const sequelizeWikidataFindEntitiesCondition = buildSequelizeWikidataIdFindEntitiesCondition(sequelizeFilters) + + // if condition was built - run the query against the db + // and collect matched entity ids + let constraintIds: string[] | undefined = undefined + if (sequelizeWikidataFindEntitiesCondition != null) { + const records = await this.sequelizeService.find({ + findAllOnly: true, + query: { + limit: 1000000, + offset: 0, + }, + where: sequelizeWikidataFindEntitiesCondition, + }) + + constraintIds = records.data.map((d: any) => d.uid) + } + + debug('[find] constraintIds:', constraintIds) + + // if ids were collected - add them as a filter for solr + const uidFilter: Filter | undefined = + constraintIds != null + ? { + type: 'uid', + q: constraintIds, + } + : undefined const query = buildSearchEntitiesSolrQuery({ - filters: params.query.filters, - orderBy: params.query.order_by, - limit: params.query.limit, - offset: params.query.offset, + filters: uidFilter != null ? [uidFilter, ...solrFilters] : solrFilters, + orderBy: qp.order_by, + limit: qp.limit, + offset: qp.offset, }) debug('[find] solr query:', query) @@ -52,8 +113,8 @@ class Service { return { total: 0, data: [], - limit: params.query.limit, - offset: params.query.offset, + limit: qp.limit, + offset: qp.offset, info: { ...params.originalQuery, }, @@ -62,7 +123,7 @@ class Service { // generate the sequelize clause. const where = { id: { - [Op.in]: entities.map(d => d.uid), + [Op.in]: entities.map((d: any) => d.uid), }, } // get sequelize results @@ -83,9 +144,9 @@ class Service { const sequelizeEntitiesIndex = lodash.keyBy(sequelizeResult.data, 'uid') const result = { total: solrResult.response.numFound, - limit: params.query.limit, - offset: params.query.offset, - data: entities.map(d => { + limit: qp.limit, + offset: qp.offset, + data: entities.map((d: any) => { if (sequelizeEntitiesIndex[d.uid]) { // enrich with wikidataID d.wikidataId = sequelizeEntitiesIndex[d.uid].wikidataId @@ -112,10 +173,10 @@ class Service { const wkdIds = lodash(sequelizeEntitiesIndex).map('wikidataId').compact().value() debug('[find] wikidata loading:', wkdIds.length) - const resolvedEntities = {} + const resolvedEntities: Record = {} return Promise.all( - wkdIds.map(wkdId => + wkdIds.map((wkdId: string) => measureTime( () => wikidata @@ -123,7 +184,7 @@ class Service { ids: [wkdId], cache: this.app.service('redisClient').client, }) - .then(resolved => { + .then((resolved: any) => { resolvedEntities[wkdId] = resolved[wkdId] }), 'entities.find.wikidata.get' @@ -132,7 +193,7 @@ class Service { ) .then(res => { debug('[find] wikidata success!') - result.data = result.data.map(d => { + result.data = result.data.map((d: any) => { if (d.wikidataId) { d.wikidata = resolvedEntities[d.wikidataId] } @@ -146,7 +207,7 @@ class Service { }) } - async get(id, params) { + async get(id: string, params: any) { return this.find({ ...params, query: { @@ -168,20 +229,20 @@ class Service { }) } - async update(id, data, params) { + async update(id: string, data: any, params: any) { return data } - async patch(id, data, params) { + async patch(id: string, data: any, params: any) { return data } - async remove(id, params) { + async remove(id: string, params: any) { return { id } } } -module.exports = function (options) { +module.exports = function (options: any) { return new Service(options) } diff --git a/src/services/entities/entities.hooks.ts b/src/services/entities/entities.hooks.ts index 54eed479..e6a10d67 100644 --- a/src/services/entities/entities.hooks.ts +++ b/src/services/entities/entities.hooks.ts @@ -1,3 +1,5 @@ +import { SolrNamespaces } from '../../solr' + const { validate, validateEach, queryWithCommonParams, utils } = require('../../hooks/params') const { qToSolrFilter, filtersToSolrQuery } = require('../../hooks/search') @@ -49,7 +51,7 @@ export default { defaultValue: 'OR', }, type: { - choices: ['string', 'type', 'uid'], + choices: ['string', 'type', 'uid', 'wikidataId'], required: true, // trasform is required because they shoyd be related to entities namespace. // transform: (d) => { @@ -65,7 +67,9 @@ export default { } ), qToSolrFilter('string'), - filtersToSolrQuery(), + filtersToSolrQuery({ + solrIndexProvider: () => SolrNamespaces.Entities, + }), queryWithCommonParams(), ], get: [], diff --git a/src/services/entities/util.ts b/src/services/entities/util.ts new file mode 100644 index 00000000..7dc3fd7e --- /dev/null +++ b/src/services/entities/util.ts @@ -0,0 +1,30 @@ +import { Filter } from 'impresso-jscommons' +import { Op } from 'sequelize' + +interface FilterTuple { + solrFilters: Filter[] + sequelizeFilters: Filter[] +} + +export const sortFindEntitiesFilters = (filters: Filter[]): FilterTuple => { + const solrFilters = filters.filter(f => f.type !== 'wikidataId') + const sequelizeFilters = filters.filter(f => f.type === 'wikidataId') + return { solrFilters, sequelizeFilters } +} + +export const buildSequelizeWikidataIdFindEntitiesCondition = (filters: Filter[]): Record | undefined => { + const supportedFilters = filters.filter(f => f.type === 'wikidataId' && f.q != null) + + const items = supportedFilters.map(f => { + const operator = f.context === 'exclude' ? Op.notIn : Op.in + return { + [operator]: typeof f.q === 'string' ? [f.q] : f.q, + } + }) + + if (items.length === 0) { + return undefined + } + + return { wikidataId: { [Op.and]: items } } +} diff --git a/src/util/solr/filterReducers.js b/src/util/solr/filterReducers.js index fd6454b7..d78f6ac5 100644 --- a/src/util/solr/filterReducers.js +++ b/src/util/solr/filterReducers.js @@ -1,50 +1,50 @@ -const YAML = require('yaml'); -const { readFileSync } = require('fs'); -const { InvalidArgumentError } = require('../error'); +const YAML = require('yaml') +const { readFileSync } = require('fs') +const { InvalidArgumentError } = require('../error') -const filtersConfig = YAML.parse(readFileSync(`${__dirname}/solrFilters.yml`).toString()); +const filtersConfig = YAML.parse(readFileSync(`${__dirname}/solrFilters.yml`).toString()) -const escapeValue = value => value.replace(/[()\\+&|!{}[\]?:;,]/g, d => `\\${d}`); +const escapeValue = value => value.replace(/[()\\+&|!{}[\]?:;,]/g, d => `\\${d}`) const getValueWithFields = (value, fields) => { if (Array.isArray(fields)) { - return fields.map(field => getValueWithFields(value, field)).join(' OR '); + return fields.map(field => getValueWithFields(value, field)).join(' OR ') } - return `${fields}:${escapeValue(value)}`; -}; -const RangeValueRegex = /^\s*\d+\s+TO\s+\d+\s*$/; + return `${fields}:${escapeValue(value)}` +} +const RangeValueRegex = /^\s*\d+\s+TO\s+\d+\s*$/ const reduceNumericRangeFilters = (filters, field) => { const items = filters.reduce((sq, filter) => { - let q; // q is in the form array ['1 TO 10', '20 TO 30'] (OR condition) + let q // q is in the form array ['1 TO 10', '20 TO 30'] (OR condition) // or simple string '1 TO X'; if (Array.isArray(filter.q)) { if (filter.q.length !== 2 || !filter.q.every(v => Number.isFinite(parseInt(v, 10)))) { - throw new InvalidArgumentError(`"numericRange" filter rule: unknown values encountered in "q": ${filter.q}`); + throw new InvalidArgumentError(`"numericRange" filter rule: unknown values encountered in "q": ${filter.q}`) } - q = `${field}:[${filter.q[0]} TO ${filter.q[1]}]`; + q = `${field}:[${filter.q[0]} TO ${filter.q[1]}]` } else if (filter.q != null) { if (!filter.q.match(RangeValueRegex)) { - throw new InvalidArgumentError(`"numericRange" filter rule: unknown value encountered in "q": ${filter.q}`); + throw new InvalidArgumentError(`"numericRange" filter rule: unknown value encountered in "q": ${filter.q}`) } - q = `${field}:[${filter.q}]`; + q = `${field}:[${filter.q}]` } else { - q = `${field}:*`; + q = `${field}:*` } if (filter.context === 'exclude') { - q = sq.length > 0 ? `NOT (${q})` : `*:* AND NOT (${q})`; + q = sq.length > 0 ? `NOT (${q})` : `*:* AND NOT (${q})` } - sq.push(q); - return sq; - }, []); + sq.push(q) + return sq + }, []) - return items.join(' AND '); -}; + return items.join(' AND ') +} -const SolrSupportedLanguages = ['en', 'fr', 'de']; +const SolrSupportedLanguages = ['en', 'fr', 'de'] -const fullyEscapeValue = value => escapeValue(value).replace(/"/g, d => `\\${d}`); +const fullyEscapeValue = value => escapeValue(value).replace(/"/g, d => `\\${d}`) /** * Convert filter to a Solr request. @@ -53,43 +53,43 @@ const fullyEscapeValue = value => escapeValue(value).replace(/"/g, d => `\\${d}` * @param {import('../../models').FilterPrecision} precision filter precision. */ const getStringQueryWithFields = (value, solrFields, precision) => { - let q; + let q if (value != null) { - q = value.trim(); - const hasMultipleWords = q.split(/\s/).length > 1; - const isExact = q.match(/^"(.*)"(~[12345])?$/); - const isFuzzy = q.match(/^(.*)~([12345])$/); + q = value.trim() + const hasMultipleWords = q.split(/\s/).length > 1 + const isExact = q.match(/^"(.*)"(~[12345])?$/) + const isFuzzy = q.match(/^(.*)~([12345])$/) if (isExact && isFuzzy) { - q = `"${fullyEscapeValue(isExact[1])}"${isExact[2]}`; + q = `"${fullyEscapeValue(isExact[1])}"${isExact[2]}` } else if (isExact) { - q = `"${fullyEscapeValue(isExact[1])}"`; + q = `"${fullyEscapeValue(isExact[1])}"` } else if (isFuzzy) { - q = `"${fullyEscapeValue(isFuzzy[1])}"`; + q = `"${fullyEscapeValue(isFuzzy[1])}"` } else { // use filter properties if set - q = fullyEscapeValue(q); + q = fullyEscapeValue(q) if (precision === 'soft') { - q = `(${q.split(/\s+/g).join(' OR ')})`; + q = `(${q.split(/\s+/g).join(' OR ')})` } else if (precision === 'fuzzy') { // "richard chase"~1 - q = `"${q.split(/\s+/g).join(' ')}"~1`; + q = `"${q.split(/\s+/g).join(' ')}"~1` } else if (precision === 'exact') { - q = `"${q}"`; + q = `"${q}"` } else if (hasMultipleWords) { // text:"Richard Chase" - q = q.replace(/"/g, ' '); - q = `"${q.split(/\s+/g).join(' ')}"`; - q = `(${q.split(/\s+/g).join(' ')})`; + q = q.replace(/"/g, ' ') + q = `"${q.split(/\s+/g).join(' ')}"` + q = `(${q.split(/\s+/g).join(' ')})` } } } else { - q = '*'; + q = '*' } - const items = solrFields.map(f => `${f}:${q}`); - const statement = items.join(' OR '); - return items.length > 1 ? `(${statement})` : statement; -}; + const items = solrFields.map(f => `${f}:${q}`) + const statement = items.join(' OR ') + return items.length > 1 ? `(${statement})` : statement +} /** * String type filter handler @@ -98,187 +98,179 @@ const getStringQueryWithFields = (value, solrFields, precision) => { * @return {string} solr query */ const reduceStringFiltersToSolr = (filters, field) => { - const languages = SolrSupportedLanguages; - const items = filters.map(({ - q, - op = 'OR', - precision, - context, - }, index) => { - let fields = []; - - if (typeof field === 'string') fields = [field]; - else if (Array.isArray(field)) fields = field; - else if (field.prefix != null) fields = languages.map(lang => `${field.prefix}${lang}`); - else throw new InvalidArgumentError(`Unknown type of Solr field: ${JSON.stringify(field)}`); - - let queryList = [null]; + const languages = SolrSupportedLanguages + const items = filters.map(({ q, op = 'OR', precision, context }, index) => { + let fields = [] + + if (typeof field === 'string') fields = [field] + else if (Array.isArray(field)) fields = field + else if (field.prefix != null) fields = languages.map(lang => `${field.prefix}${lang}`) + else throw new InvalidArgumentError(`Unknown type of Solr field: ${JSON.stringify(field)}`) + + let queryList = [null] if (Array.isArray(q) && q.length > 0) { - queryList = q.filter(v => v != null && v !== ''); - if (queryList.length === 0) queryList = [null]; - } else if (typeof q === 'string' && q != null && q !== '') queryList = [q]; + queryList = q.filter(v => v != null && v !== '') + if (queryList.length === 0) queryList = [null] + } else if (typeof q === 'string' && q != null && q !== '') queryList = [q] let transformedQuery = queryList .map(value => getStringQueryWithFields(value, fields, precision)) // @ts-ignore .flat() - .join(` ${op} `); + .join(` ${op} `) if (context === 'exclude') { - transformedQuery = index > 0 ? `NOT (${transformedQuery})` : `*:* AND NOT (${transformedQuery})`; + transformedQuery = index > 0 ? `NOT (${transformedQuery})` : `*:* AND NOT (${transformedQuery})` } - return queryList.length > 1 ? `(${transformedQuery})` : transformedQuery; - }); + return queryList.length > 1 ? `(${transformedQuery})` : transformedQuery + }) // @ts-ignore - return items.flat().join(' AND '); -}; + return items.flat().join(' AND ') +} -const DateRangeValueRegex = /^\s*[TZ:\d-]+\s+TO\s+[TZ:\d-]+\s*$/; +const DateRangeValueRegex = /^\s*[TZ:\d-]+\s+TO\s+[TZ:\d-]+\s*$/ const reduceDaterangeFiltersToSolr = (filters, field, rule) => { const items = filters.reduce((sq, filter) => { - const query = Array.isArray(filter.q) && filter.q.length === 1 - ? filter.q[0] - : filter.q; - const op = filter.op || 'OR'; + const query = Array.isArray(filter.q) && filter.q.length === 1 ? filter.q[0] : filter.q + const op = filter.op || 'OR' - let q; + let q if (Array.isArray(query)) { if (query.length !== 2) { - throw new InvalidArgumentError(`"${rule}" filter rule: unknown values encountered in "q": ${filter.q}`); + throw new InvalidArgumentError(`"${rule}" filter rule: unknown values encountered in "q": ${filter.q}`) } - q = `${query.map(d => `${field}:[${d}]`).join(` ${op} `)}`; + q = `${query.map(d => `${field}:[${d}]`).join(` ${op} `)}` if (query.length > 1) { - q = `(${q})`; + q = `(${q})` } } else if (query != null) { if (!query.match(DateRangeValueRegex)) { - throw new InvalidArgumentError(`"${rule}" filter rule: unknown value encountered in "q": ${filter.q}`); + throw new InvalidArgumentError(`"${rule}" filter rule: unknown value encountered in "q": ${filter.q}`) } - q = `${field}:[${query}]`; + q = `${field}:[${query}]` } else { - q = `${field}:*`; + q = `${field}:*` } if (filter.context === 'exclude') { - q = sq.length > 0 ? `NOT (${q})` : `*:* AND NOT (${q})`; + q = sq.length > 0 ? `NOT (${q})` : `*:* AND NOT (${q})` } - sq.push(q); - return sq; - }, []); - return items.join(' AND '); -}; - -const reduceFiltersToSolr = ( - filters, - field, - rule, - transformValue = v => v, -) => filters.reduce((sq, filter) => { - let qq = ''; - const op = filter.op || 'OR'; - - if (Array.isArray(filter.q)) { - const values = filter.q.length > 0 ? filter.q : ['*']; - qq = values - .map(transformValue) - .map(value => getValueWithFields(value, field)).join(` ${op} `); - qq = `(${qq})`; - } else if (typeof filter.q === 'string' && filter.q != null && filter.q !== '') { - qq = getValueWithFields(transformValue(filter.q), field); - } else { - qq = getValueWithFields('*', field); - } - if (filter.context === 'exclude') { - qq = sq.length > 0 ? `NOT (${qq})` : `*:* AND NOT (${qq})`; - } - sq.push(qq); - return sq; -}, []).join(' AND '); + sq.push(q) + return sq + }, []) + return items.join(' AND ') +} + +const reduceFiltersToSolr = (filters, field, rule, transformValue = v => v) => + filters + .reduce((sq, filter) => { + let qq = '' + const op = filter.op || 'OR' + + if (Array.isArray(filter.q)) { + const values = filter.q.length > 0 ? filter.q : ['*'] + qq = values + .map(transformValue) + .map(value => getValueWithFields(value, field)) + .join(` ${op} `) + qq = `(${qq})` + } else if (typeof filter.q === 'string' && filter.q != null && filter.q !== '') { + qq = getValueWithFields(transformValue(filter.q), field) + } else { + qq = getValueWithFields('*', field) + } + if (filter.context === 'exclude') { + qq = sq.length > 0 ? `NOT (${qq})` : `*:* AND NOT (${qq})` + } + sq.push(qq) + return sq + }, []) + .join(' AND ') const reduceRegexFiltersToSolr = (filters, field) => { - let fields = []; - if (typeof field === 'string') fields = [field]; - else if (Array.isArray(field)) fields = field; - else if (field.prefix != null) fields = SolrSupportedLanguages.map(lang => `${field.prefix}${lang}`); - else throw new InvalidArgumentError(`Unknown type of Solr field: ${JSON.stringify(field)}`); - - return filters.reduce((reduced, { q, op = 'OR' }) => { - // cut regexp at any . not preceded by an escape sign. - let queryString; - if (Array.isArray(q)) { - if (q.length > 1) { - throw new InvalidArgumentError(`"regex" filter rule supports only single element arrays in "q": ${JSON.stringify(q)}`); - } else if (q.length === 0) { - queryString = '/.*/'; + let fields = [] + if (typeof field === 'string') fields = [field] + else if (Array.isArray(field)) fields = field + else if (field.prefix != null) fields = SolrSupportedLanguages.map(lang => `${field.prefix}${lang}`) + else throw new InvalidArgumentError(`Unknown type of Solr field: ${JSON.stringify(field)}`) + + return filters + .reduce((reduced, { q, op = 'OR' }) => { + // cut regexp at any . not preceded by an escape sign. + let queryString + if (Array.isArray(q)) { + if (q.length > 1) { + throw new InvalidArgumentError( + `"regex" filter rule supports only single element arrays in "q": ${JSON.stringify(q)}` + ) + } else if (q.length === 0) { + queryString = '/.*/' + } else { + queryString = q[0].trim() + } + } else if (q != null) { + queryString = q.trim() } else { - queryString = q[0].trim(); + queryString = '/.*/' } - } else if (q != null) { - queryString = q.trim(); - } else { - queryString = '/.*/'; - } - const queryValues = queryString - // get rid of first / and last / - .replace(/^\/|\/$/g, '') - // split on point or spaces - .split(/\\?\.[*+]/) - // filterout empty stuff - .filter(d => d.length); - - const query = (queryValues.length > 0 ? queryValues : ['.*']) - // rebuild; - .map(d => fields.map(f => `${f}:/${d}/`).join(` ${op} `)); - return reduced.concat(query.map(v => (fields.length > 1 ? `(${v})` : v))); - }, []).join(' AND '); -}; + const queryValues = queryString + // get rid of first / and last / + .replace(/^\/|\/$/g, '') + // split on point or spaces + .split(/\\?\.[*+]/) + // filterout empty stuff + .filter(d => d.length) + + const query = (queryValues.length > 0 ? queryValues : ['.*']) + // rebuild; + .map(d => fields.map(f => `${f}:/${d}/`).join(` ${op} `)) + return reduced.concat(query.map(v => (fields.length > 1 ? `(${v})` : v))) + }, []) + .join(' AND ') +} const minLengthOneHandler = (filters, field, filterRule) => { - if (typeof field !== 'string') throw new InvalidArgumentError(`"${filterRule}" supports only "string" fields`); - return `${field}:[1 TO *]`; -}; + if (typeof field !== 'string') throw new InvalidArgumentError(`"${filterRule}" supports only "string" fields`) + return `${field}:[1 TO *]` +} const booleanHandler = (filters, field, filterRule) => { - if (typeof field !== 'string') throw new InvalidArgumentError(`"${filterRule}" supports only "string" fields`); - return `${field}:1`; -}; + if (typeof field !== 'string') throw new InvalidArgumentError(`"${filterRule}" supports only "string" fields`) + return `${field}:1` +} -const reduceCapitalisedValue = (filters, field, rule) => reduceFiltersToSolr( - filters, - field, - rule, - v => v.charAt(0).toUpperCase() + v.slice(1), -); +const reduceCapitalisedValue = (filters, field, rule) => + reduceFiltersToSolr(filters, field, rule, v => v.charAt(0).toUpperCase() + v.slice(1)) const textAsOpenEndedSearchString = (text, field) => { - const parts = text.split(' ').filter(v => v !== ''); + const parts = text.split(' ').filter(v => v !== '') const statement = parts .map(part => part.replace(/"/g, '\\"')) .map((part, index, arr) => { - const suffix = index === arr.length - 1 ? '*' : ''; - return `${field}:${part}${suffix}`; + const suffix = index === arr.length - 1 ? '*' : '' + return `${field}:${part}${suffix}` }) - .join(' AND '); - return parts.length > 1 ? `(${statement})` : statement; -}; + .join(' AND ') + return parts.length > 1 ? `(${statement})` : statement +} const reduceOpenEndedStringValue = (filters, field) => { const outerStatement = filters - .map((filter) => { - const strings = Array.isArray(filter.q) ? filter.q : [filter.q]; - const statement = strings - .map(v => textAsOpenEndedSearchString(v, field)) - .join(` ${filter.op || 'OR'} `); - return strings.length > 1 ? `(${statement})` : statement; + .map(filter => { + const strings = Array.isArray(filter.q) ? filter.q : [filter.q] + const statement = strings.map(v => textAsOpenEndedSearchString(v, field)).join(` ${filter.op || 'OR'} `) + return strings.length > 1 ? `(${statement})` : statement }) - .join(' AND '); - return filters.length > 1 ? `(${outerStatement})` : outerStatement; -}; + .join(' AND ') + return filters.length > 1 ? `(${outerStatement})` : outerStatement +} + +const noopHandler = () => '*:*' const FiltersHandlers = Object.freeze({ minLengthOne: minLengthOneHandler, @@ -290,7 +282,8 @@ const FiltersHandlers = Object.freeze({ regex: reduceRegexFiltersToSolr, capitalisedValue: reduceCapitalisedValue, openEndedString: reduceOpenEndedStringValue, -}); + noop: noopHandler, +}) /** * Convert a set of filters of the same type to a SOLR query string. @@ -302,24 +295,24 @@ const FiltersHandlers = Object.freeze({ * @returns {string} a SOLR query string that can be wrapped into a `filter()` statement. */ const filtersToSolr = (filters, solrNamespace) => { - if (filters.length < 1) throw new InvalidArgumentError('At least one filter must be provided'); - const types = [...new Set(filters.map(({ type }) => type))]; - if (types.length > 1) throw new InvalidArgumentError(`Filters must be of the same type. Found types: "${types}"`); - const type = types[0]; - - const filtersRules = filtersConfig.indexes[solrNamespace] - ? filtersConfig.indexes[solrNamespace].filters - : {}; - const filterRules = filtersRules[type]; - if (filterRules == null) throw new InvalidArgumentError(`Unknown filter type "${type}" in namespace "${solrNamespace}"`); + if (filters.length < 1) throw new InvalidArgumentError('At least one filter must be provided') + const types = [...new Set(filters.map(({ type }) => type))] + if (types.length > 1) throw new InvalidArgumentError(`Filters must be of the same type. Found types: "${types}"`) + const type = types[0] + + const filtersRules = filtersConfig.indexes[solrNamespace] ? filtersConfig.indexes[solrNamespace].filters : {} + const filterRules = filtersRules[type] + if (filterRules == null) { + throw new InvalidArgumentError(`Unknown filter type "${type}" in namespace "${solrNamespace}"`) + } - const handler = FiltersHandlers[filterRules.rule]; - if (handler == null) throw new InvalidArgumentError(`Could not find handler for rule ${filterRules.rule}`); + const handler = FiltersHandlers[filterRules.rule] + if (handler == null) throw new InvalidArgumentError(`Could not find handler for rule ${filterRules.rule}`) - return handler(filters, filterRules.field, filterRules.rule); -}; + return handler(filters, filterRules.field, filterRules.rule) +} module.exports = { filtersToSolr, escapeValue, -}; +} diff --git a/src/util/solr/solrFilters.yml b/src/util/solr/solrFilters.yml index d7a4abbe..00e9be2d 100644 --- a/src/util/solr/solrFilters.yml +++ b/src/util/solr/solrFilters.yml @@ -196,3 +196,6 @@ indexes: uid: field: id rule: value + wikidataId: + field: na + rule: noop