diff --git a/.github/workflows/build-and-release.yml b/.github/workflows/build-and-release.yml
index 69c7afd..47787d7 100644
--- a/.github/workflows/build-and-release.yml
+++ b/.github/workflows/build-and-release.yml
@@ -47,4 +47,3 @@ jobs:
with:
user: __token__
password: ${{ secrets.pypi_password }}
-
diff --git a/.gitignore b/.gitignore
index bea12b0..e652a7f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -106,4 +106,4 @@ CHANGELOG
.idea
# vscode
-.vscode
\ No newline at end of file
+.vscode
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..2c56732
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.6.0
+ hooks:
+ - id: check-yaml
+ - id: end-of-file-fixer
+ - id: trailing-whitespace
+ - id: fix-byte-order-marker
+ - id: mixed-line-ending
+ - id: name-tests-test
+ args: [ --pytest-test-first ]
+ exclude: '^(?!factories/)'
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.5.6
+ hooks:
+ - id: ruff
+ args: [ --fix ]
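
Note on the `name-tests-test` hook above: its `exclude` value is a negative lookahead, so every path that does *not* start with `factories/` is excluded, and the `--pytest-test-first` naming check only runs on files under `factories/`. A quick sketch of how pre-commit's regex matching treats it (file names here are hypothetical):

    import re

    # The hook's exclude pattern: matches (i.e. excludes) any path that
    # does NOT begin with "factories/".
    exclude = re.compile(r'^(?!factories/)')

    for path in ['factories/ids.py', 'tests/test_parser.py', 'setup.py']:
        skipped = exclude.search(path) is not None
        print(path, '->', 'skipped' if skipped else 'checked')
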
diff --git a/examples/demo_parser.py b/examples/demo_parser.py
deleted file mode 100644
index 7946d19..0000000
--- a/examples/demo_parser.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# This file is part of INSPIRE.
-# Copyright (C) 2014-2017 CERN.
-#
-# INSPIRE is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# INSPIRE is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
-#
-# In applying this license, CERN does not waive the privileges and immunities
-# granted to it by virtue of its status as an Intergovernmental Organization
-# or submit itself to any jurisdiction.
-
-from __future__ import print_function, unicode_literals
-
-import sys
-
-from inspire_query_parser.parser import Query
-from inspire_query_parser.stateful_pypeg_parser import StatefulParser
-from inspire_query_parser.utils.format_parse_tree import emit_tree_format
-from inspire_query_parser.visitors.restructuring_visitor import RestructuringVisitor
-
-
-def repl():
- """Read-Eval-Print-Loop for reading the query, printing it and its parse tree.
-
- Exit the loop either with an interrupt or "quit".
- """
- while True:
- try:
- sys.stdout.write("Type in next query: \n> ")
- import locale
- query_str = raw_input().decode(sys.stdin.encoding or locale.getpreferredencoding(True))
- except KeyboardInterrupt:
- break
-
- if u'quit' in query_str:
- break
-
- print_query_and_parse_tree(query_str)
-
-
-def print_query_and_parse_tree(query_str):
- parser = StatefulParser()
- print('\033[94m' + "Parsing " + '\033[1m' + query_str + "" + '\033[0m')
- _, parse_tree = parser.parse(query_str, Query)
- print('\033[92m' + emit_tree_format(parse_tree.accept(RestructuringVisitor())) + '\033[0m')
- print("————————————————————————————————————————————————————————————————————————————————")
-
-
-if __name__ == '__main__':
- # repl()
-
- # Find keyword combined with other production rules
- print_query_and_parse_tree(r"FIN author:'ellis'")
- print_query_and_parse_tree(r"find a T.A. Aibergenov and date = 1986")
- print_query_and_parse_tree(r'Find author "ellis"')
- print_query_and_parse_tree(r'f author ellis')
-
- # Invenio like search
- print_query_and_parse_tree(r"author:ellis and title:boson")
- print_query_and_parse_tree(r"unknown_keyword:'bar'")
- print_query_and_parse_tree(r"dotted.keyword:'bar'")
-
- # Boolean operator testing (And/Or)
- print_query_and_parse_tree(r"author ellis and title 'boson'")
- print_query_and_parse_tree(r"f a appelquist and date 1983")
- print_query_and_parse_tree(r"fin a henneaux and citedby a nicolai")
- print_query_and_parse_tree(r"au ellis | title 'boson'")
- print_query_and_parse_tree(r"-author ellis OR title 'boson'")
- print_query_and_parse_tree(r"author ellis & title 'boson'")
-
- # Implicit And
- # Works in the case of "A B":
- # 1) B KeywordQuery is of format "keyword:value"
- # 2) B is a NotQuery, e.g. "title foo not title bar"
- # 3) A or B KeywordQueries have a ComplexValue as value, e.g. author 'ellis' title boson
- # 4) B KeywordQuery has a keyword that is a non-shortened version of INSPIRE_KEYWORDS.
- print_query_and_parse_tree(r"author ellis elastic.keyword:'boson'")
- print_query_and_parse_tree(r"find cn atlas not tc c")
- print_query_and_parse_tree(r"author:ellis j title:'boson' reference:M.N.1")
- print_query_and_parse_tree(r"author ellis title 'boson' not title higgs")
- print_query_and_parse_tree(r"author ellis - title 'boson'")
-
- # ##### Boolean operators at terminals level ####
- # 1. Boolean operators among simple values
- print_query_and_parse_tree(r"author ellis, j and smith")
- # 2. An and query among terminals or and "j" signifies the "journal" keyword?
- print_query_and_parse_tree(r"f author ellis, j and patrignani and j Chin.Phys.")
- # This one is ambiguous since first name "j" overlaps with journals
- print_query_and_parse_tree(r"f author ellis, j and patrignani and j ellis")
- # While this is clearer
- print_query_and_parse_tree(r"f author ellis, j and patrignani and j, ellis")
-
- # Negation
- print_query_and_parse_tree(r"ellis and not title 'boson'")
- print_query_and_parse_tree(r"-title 'boson'")
-
- # Nested expressions
- print_query_and_parse_tree(r"author ellis, j. and (title boson or (author /^xi$/ and title foo))")
- print_query_and_parse_tree(r"author ellis, j. and not (title boson or not (author /^xi$/ and title foo))")
-
- # Metadata search
- print_query_and_parse_tree(r'fulltext:boson and (reference:Ellis or reference "Ellis")')
- print_query_and_parse_tree(r"exactauthor:M.Vanderhaeghen.1 and ac: 42")
-
- # Simple phrases
- print_query_and_parse_tree(r'ellis')
- print_query_and_parse_tree(r"'ellis'")
-
- # Parenthesized keyword query values (working also with SPIRES operators - doesn't on legacy)
- print_query_and_parse_tree(r"author:(title ellis)")
- print_query_and_parse_tree(r"author (pardo, f AND slavich) OR (author:bernreuther and not date:2017)")
-
- # Non trivial terminals
- print_query_and_parse_tree(r"author smith and j., ellis")
- print_query_and_parse_tree(r"find title Alternative the Phase-II upgrade of the ATLAS Inner Detector or na61/shine")
- print_query_and_parse_tree(r"find (j phys.rev. and vol d85) or (j phys.rev.lett.,62,1825)")
- print_query_and_parse_tree(r"title e-10 and -author d'hoker")
- print_query_and_parse_tree(r'a pang，yi and ekström and t SU(2)') # Full-width comma unicode character
- print_query_and_parse_tree(r't e(+)e(-) or e+e- Colliders')
- print_query_and_parse_tree(r"title: Si-28(p(pol.),n(pol.))")
- print_query_and_parse_tree(r"t Si28(p→,p→′)Si28(6−,T=1) ")
- print_query_and_parse_tree(r"ti C-12(vec-p,vec-n)N-12 (g.s.,1+)")
-
- # Regex
- print_query_and_parse_tree(r"author:/^Ellis, (J|John)$/")
- print_query_and_parse_tree(r"title:/dense ([^ $]* )?matter/")
-
- # Nestable keywords
- print_query_and_parse_tree(r"referstox:author:s.p.martin.1")
- print_query_and_parse_tree(r"find a parke, s j and refersto author witten")
- print_query_and_parse_tree(r"citedbyx:author:s.p.martin.1")
- print_query_and_parse_tree(r"citedby:author:s.p.martin.1")
- print_query_and_parse_tree(r"-refersto:recid:1374998 and citedby:(A.A.Aguilar.Arevalo.1)")
- print_query_and_parse_tree(r"citedby:(author A.A.Aguilar.Arevalo.1 and not a ellis)")
- print_query_and_parse_tree(r"citedby:refersto:recid:1432705")
-
- # Ranges
- print_query_and_parse_tree(r"d 2015->2017 and cited:1->9")
-
- # Empty query
- print_query_and_parse_tree(r"") # Nothing
- print_query_and_parse_tree(r" ") # Spaces and Tab
-
- # G, GE, LT, LE, E queries
- print_query_and_parse_tree(r"date > 2000-10 and < 2000-12")
- print_query_and_parse_tree(r"date after 10/2000 and before 2000-12")
- print_query_and_parse_tree(r"date >= nov 2000 and d<=2005")
- print_query_and_parse_tree(r"date 1978+ + -ac 100+")
- print_query_and_parse_tree(r"f a wimpenny and date = 1987")
-
- # Date specifiers
- print_query_and_parse_tree(r"date today - 2 and title foo")
- print_query_and_parse_tree(r"date this month author ellis")
- print_query_and_parse_tree(r"date yesterday - 2 - ac 100")
- print_query_and_parse_tree(r"date last month - 2 + ac < 50")
- print_query_and_parse_tree(r"date this month - 2")
- print_query_and_parse_tree(r"du > yesterday - 2")
-
- # Star queries
- print_query_and_parse_tree(r"find a 'o*aigh' and t \"alge*\" and date >2013")
- print_query_and_parse_tree(r"a *alge | a alge* | a o*aigh")
-
- # Unrecognized queries
- print_query_and_parse_tree(r"title and foo")
- print_query_and_parse_tree(r"title γ-radiation and and")
-
- # The query below doesn't work on legacy. Currently, it is recognized as a boolean query (since theory is recognized
- # as a keyword). Can be useful for testing multiple parse trees generation (one with the first parse and a second
- # with removing ":" character (could be one heuristic)).
- # print_query_and_parse_tree(r"find t Closed string field theory: Quantum action")
diff --git a/inspire_query_parser/__init__.py b/inspire_query_parser/__init__.py
index 2598359..ca6c098 100644
--- a/inspire_query_parser/__init__.py
+++ b/inspire_query_parser/__init__.py
@@ -19,10 +19,9 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
-
-"""A PEG-based query parser for INSPIRE"""
+"""A PEG-based query parser for INSPIRE."""
from __future__ import absolute_import, print_function
-from . import config # noqa: F401
-from .parsing_driver import parse_query # noqa: F401
+from inspire_query_parser import config # noqa: F401
+from inspire_query_parser.parsing_driver import parse_query # noqa: F401
diff --git a/inspire_query_parser/ast.py b/inspire_query_parser/ast.py
index 41efa83..cf82f5e 100644
--- a/inspire_query_parser/ast.py
+++ b/inspire_query_parser/ast.py
@@ -19,18 +19,15 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
+"""AbstractSyntaxTree classes along with their concrete ones.
-"""
-AbstractSyntaxTree classes along with their concrete ones.
-
-The module defines a generic AST element along with four AST node categories (which act as a basis for all the concrete
-AST nodes) and finally, the concrete classes which represent the output of the parsing process.
+The module defines a generic AST element along with four AST node
+categories (which act as a basis for all the concrete AST nodes) and
+finally, the concrete classes which represent the output of the parsing
+process.
-The generic AST node categories are:
- - Leaf
- - UnaryOp
- - BinaryOp
- - ListOp
+The generic AST node categories are: - Leaf - UnaryOp -
+BinaryOp - ListOp
The concrete AST nodes, represent higher level (domain specific) nodes.
"""
@@ -40,18 +37,19 @@
# #### Abstract Syntax Tree classes ####
class ASTElement(object):
- """Root AbstractSyntaxTree node that acts as a stub for calling the Visitor's `visit` dispatcher method."""
+ """Root AbstractSyntaxTree node that acts as a stub for calling the
+ Visitor's `visit` dispatcher method."""
+
def accept(self, visitor, *args, **kwargs):
return visitor.visit(self, *args, **kwargs)
class Leaf(ASTElement):
-
def __init__(self, value=None):
self.value = value
def __eq__(self, other):
- return type(self) == type(other) and self.value == other.value
+ return type(self) is type(other) and self.value == other.value
def __repr__(self):
return '%s(%r)' % (self.__class__.__name__, self.value)
@@ -61,12 +59,11 @@ def __hash__(self):
class UnaryOp(ASTElement):
-
def __init__(self, op):
self.op = op
def __eq__(self, other):
- return type(self) == type(other) and self.op == other.op
+ return type(self) is type(other) and self.op == other.op
def __repr__(self):
return "%s(%r)" % (self.__class__.__name__, self.op)
@@ -76,30 +73,29 @@ def __hash__(self):
class BinaryOp(ASTElement):
-
def __init__(self, left, right):
self.left = left
self.right = right
def __eq__(self, other):
return (
- type(self) == type(other)
- ) and (
- self.left == other.left
- ) and (
- self.right == other.right
+ (type(self) is type(other))
+ and (self.left == other.left)
+ and (self.right == other.right)
)
def __repr__(self):
- return "%s(%s, %s)" % (self.__class__.__name__,
- repr(self.left), repr(self.right))
+ return "%s(%s, %s)" % (
+ self.__class__.__name__,
+ repr(self.left),
+ repr(self.right),
+ )
def __hash__(self):
return hash((self.left, self.right))
class ListOp(ASTElement):
-
def __init__(self, children):
try:
iter(children)
@@ -109,7 +105,7 @@ def __init__(self, children):
self.children = children
def __eq__(self, other):
- return type(self) == type(other) and self.children == other.children
+ return type(self) is type(other) and self.children == other.children
def __repr__(self):
return "%s(%r)" % (self.__class__.__name__, self.children)
@@ -144,15 +140,20 @@ class ValueOp(UnaryOp):
class QueryWithMalformedPart(BinaryOp):
- """A combination of recognized part of a query (with a parse tree) and some malformed input.
+ """A combination of recognized part of a query (with a parse tree) and some
+ malformed input.
- Its left child is the recognized parse tree, while its right child has the :class:`MalformedQuery`.
+ Its left child is the recognized parse tree, while its right child
+ has the :class:`MalformedQuery`.
"""
+
pass
class MalformedQuery(ListOp):
- """A :class:`ListOp` with children the unrecognized words of the parser's input."""
+ """A :class:`ListOp` with children the unrecognized words of the parser's
+ input."""
+
pass
@@ -183,6 +184,7 @@ class Keyword(Leaf):
class GenericValue(Leaf):
"""Represents a generic value, which might contain a wildcard."""
+
WILDCARD_TOKEN = '*'
def __init__(self, value, contains_wildcard=False):
@@ -190,7 +192,10 @@ def __init__(self, value, contains_wildcard=False):
self.contains_wildcard = contains_wildcard
def __eq__(self, other):
- return super(GenericValue, self).__eq__(other) and self.contains_wildcard == other.contains_wildcard
+ return (
+ super(GenericValue, self).__eq__(other)
+ and self.contains_wildcard == other.contains_wildcard
+ )
def __hash__(self):
return hash((super(GenericValue, self).__hash__(), self.contains_wildcard))
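
The switch from `type(self) == type(other)` to `type(self) is type(other)` keeps equality strict across the node hierarchy: nodes of different concrete classes never compare equal, even with the same value. A small illustration (assuming the classes above are importable as shown):

    from inspire_query_parser.ast import GenericValue, Keyword

    # Different concrete Leaf subclasses with the same value: not equal.
    assert GenericValue('boson') != Keyword('boson')

    # Same class and value: equal, and hashing agrees with equality.
    assert GenericValue('boson') == GenericValue('boson')
    assert len({GenericValue('boson'), GenericValue('boson')}) == 1

    # contains_wildcard participates in both equality and hashing.
    assert GenericValue('bos*', contains_wildcard=True) != GenericValue('bos*')
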
diff --git a/inspire_query_parser/config.py b/inspire_query_parser/config.py
index 6601a93..8d9fba7 100644
--- a/inspire_query_parser/config.py
+++ b/inspire_query_parser/config.py
@@ -19,23 +19,20 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
+"""A collection of INSPIRE related keywords.
-"""
-A collection of INSPIRE related keywords.
-
-This dictionary has a twofold use.
-Primarily, the parser uses its keys to generate INSPIRE related keywords (i.e. qualifiers) and secondly, provides
-a normalization of the shortened keywords to their full version.
+This dictionary has a twofold use. Primarily, the parser uses its keys
+to generate INSPIRE related keywords (i.e. qualifiers) and secondly,
+provides a normalization of the shortened keywords to their full
+version.
"""
from __future__ import unicode_literals
INSPIRE_PARSER_NONDATE_KEYWORDS = {
# Abstract
'abstract': 'abstract',
-
# Address
'address': 'address',
-
# Affiliation
'affiliation': 'affiliation',
'affil': 'affiliation',
@@ -43,95 +40,73 @@
'af': 'affiliation',
'institution': 'affiliation',
'inst': 'affiliation',
-
# Affiliation Id
'affid': 'affiliation-id',
'affiliation-id': 'affiliation-id',
-
# Author
'author': 'author',
'au': 'author',
'a': 'author',
'name': 'author',
-
# Author-Count
'author-count': 'author-count',
'authorcount': 'author-count',
'ac': 'author-count',
-
# Cataloguer
'cat': 'cataloguer',
-
# Caption
'caption': 'caption',
-
# Cite, i.e. records that cite the given search term
# Cite and c: SPIRES syntax while reference is INVENIO syntax
'cite': 'cite',
'c': 'cite',
'reference': 'cite',
-
# Citedby related
'citedby': 'citedby', # nested keyword query
-
# Cited by excluding self sites, e.g. citedbyexcludingselfcites:author:M.E.Peskin.1
'citedbyexcludingselfcites': 'citedbyexcludingselfcites',
'citedbyx': 'citedbyexcludingselfcites',
-
# Cited excluding self sites, e.g. citedexcludingselfcites:50+
'citedexcludingselfcites': 'citedexcludingselfcites',
'cx': 'citedexcludingselfcites',
-
# Collaboration
'collaboration': 'collaboration',
'cn': 'collaboration',
-
# Conference number
'cnum': 'confnumber',
-
# Control number
'control_number': 'control_number',
'recid': 'control_number',
-
# Country
'country': 'country',
'cc': 'country',
-
# DOI
'doi': 'doi',
-
# ePrint
'bb': 'eprint',
'bull': 'eprint',
'eprint': 'eprint',
'arxiv': 'eprint',
'arXiv': 'eprint',
-
# Exact-Author
'exact-author': 'exact-author',
'exactauthor': 'exact-author',
'ea': 'exact-author',
-
# Experiment
'experiment': 'experiment',
'exp': 'experiment',
-
# Field-code
'fc': 'field-code',
'field-code': 'field-code',
-
# First-Author
'first-author': 'first_author',
'firstauthor': 'first_author',
'fa': 'first_author',
-
# Fulltext
'fulltext': 'fulltext',
'ft': 'fulltext',
-
# SPIRES identifiers
'irn': 'irn',
-
# Journal related
'coden': 'journal',
'journal': 'journal',
@@ -139,34 +114,28 @@
'published_in': 'journal',
'volume': 'volume',
'vol': 'volume',
-
# Keyword
# keyword is Invenio style, while the rest are from SPIRES syntax.
'keyword': 'keyword',
'keywords': 'keyword',
'kw': 'keyword',
'k': 'keyword',
-
# Primary archive
'primarch': 'primary_arxiv_category',
-
# rawref
'rawref': 'rawref',
-
# Reference
'citation': 'reference',
'jour-vol-page': 'reference',
'jvp': 'reference',
-
# Refersto operator
# Nested keyword query
'refersto': 'refersto',
-
- # Refers to excluding self cites, e.g. referstoexcludingselfcites:author:M.E.Peskin.1
+ # Refers to excluding self cites,
+ # e.g. referstoexcludingselfcites:author:M.E.Peskin.1
# Nested keyword queries
'referstoexcludingselfcites': 'referstoexcludingselfcites',
'referstox': 'referstoexcludingselfcites',
-
# Report number
'reportnumber': 'reportnumber',
'report-num': 'reportnumber',
@@ -174,24 +143,19 @@
'rept': 'reportnumber',
'rn': 'reportnumber',
'r': 'reportnumber',
-
# Subject
'subject': 'subject',
-
# Title
'title': 'title',
'ti': 'title',
't': 'title',
-
# texkey
'texkey': 'texkeys.raw',
-
# Topcite, i.e. citation count
# Cited used to be for Invenio style syntax while topcite for SPIRES
'cited': 'topcite',
'topcit': 'topcite',
'topcite': 'topcite',
-
# Type-Code
'type-code': 'type-code',
'type': 'type-code',
@@ -199,7 +163,7 @@
'ty': 'type-code',
'scl': 'type-code',
'ps': 'type-code',
- 'collection': 'type-code', # Queries for this one include "collection published" only
+ # Queries for this one include "collection published" only
+ 'collection': 'type-code',
}
INSPIRE_PARSER_DATE_KEYWORDS = {
@@ -208,21 +172,17 @@
'd': 'date',
# From queries dataset, users seem to use year and date interchangeably.
'year': 'date',
-
# Date added
'date-added': 'date-added',
'dadd': 'date-added',
'da': 'date-added',
-
# Date earliest
'date-earliest': 'date-earliest',
'de': 'date-earliest',
-
# Date updated
'date-updated': 'date-updated',
'dupd': 'date-updated',
'du': 'date-updated',
-
# Journal year
'journal-year': 'publication_info.year',
'jy': 'publication_info.year',
@@ -242,13 +202,33 @@
DATE_TODAY_REGEX_PATTERN,
DATE_YESTERDAY_REGEX_PATTERN,
DATE_THIS_MONTH_REGEX_PATTERN,
- DATE_LAST_MONTH_REGEX_PATTERN
+ DATE_LAST_MONTH_REGEX_PATTERN,
)
MONTH_REGEX = "|".join(
[
- "january", "jan", "february", "feb", "march", "mar", "april", "apr", "may",
- "june", 'jun', "july", "jul", "august", "aug",
- "september", "sep", "october", "oct", "november", "nov", "december", "dec"
+ "january",
+ "jan",
+ "february",
+ "feb",
+ "march",
+ "mar",
+ "april",
+ "apr",
+ "may",
+ "june",
+ "jun",
+ "july",
+ "jul",
+ "august",
+ "aug",
+ "september",
+ "sep",
+ "october",
+ "oct",
+ "november",
+ "nov",
+ "december",
+ "dec",
]
)
# #####
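
Because these dictionaries double as a normalization map (as the docstring says), looking up a shortened keyword returns its full version. Assuming `INSPIRE_PARSER_KEYWORDS` is the merged map that parser.py imports:

    from inspire_query_parser.config import INSPIRE_PARSER_KEYWORDS

    # Short forms normalize to the canonical keyword.
    assert INSPIRE_PARSER_KEYWORDS['a'] == 'author'
    assert INSPIRE_PARSER_KEYWORDS['ti'] == 'title'
    assert INSPIRE_PARSER_KEYWORDS['du'] == 'date-updated'
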
diff --git a/inspire_query_parser/parser.py b/inspire_query_parser/parser.py
index f031bab..eff9fb8 100644
--- a/inspire_query_parser/parser.py
+++ b/inspire_query_parser/parser.py
@@ -22,34 +22,55 @@
from __future__ import print_function, unicode_literals
+import datefinder
import six
+from pypeg2 import (
+ Enum,
+ GrammarValueError,
+ K,
+ Keyword,
+ Literal,
+ attr,
+ contiguous,
+ maybe_some,
+ omit,
+ optional,
+ re,
+ some,
+ whitespace,
+)
-from inspire_query_parser.config import DATE_SPECIFIERS_COLLECTION
-from pypeg2 import (Enum, GrammarValueError, K, Keyword, Literal, attr,
- contiguous, maybe_some, omit, optional, re, some,
- whitespace)
+from inspire_query_parser import ast
+from inspire_query_parser.config import (
+ DATE_SPECIFIERS_COLLECTION,
+ INSPIRE_PARSER_DATE_KEYWORDS,
+ INSPIRE_PARSER_KEYWORDS,
+ INSPIRE_PARSER_NONDATE_KEYWORDS,
+ MONTH_REGEX,
+)
-from . import ast
-from .config import MONTH_REGEX, INSPIRE_PARSER_KEYWORDS, INSPIRE_PARSER_DATE_KEYWORDS, INSPIRE_PARSER_NONDATE_KEYWORDS
-from dateutil import parser as date_parser
-import datefinder
# TODO Restrict what a simple query (i.e. Value) can accept (remove LessThanOp, etc.).
-# For 'date > 2013 and < 2017' probably allow LessThanOp into SimpleValueBooleanQuery.
+# For 'date > 2013 and < 2017' probably allow LessThanOp into
+# SimpleValueBooleanQuery.
# TODO 'date > 2000-10 and < date 2000-12' parses without a malformed query. (First fix the above)
# #### Parser customization ####
class CaseInsensitiveKeyword(Keyword):
- """Supports case insensitive keywords
+ """Supports case insensitive keywords.
- All subtypes must declare a grammar attribute with an Enum of accepted keywords/literals.
+ All subtypes must declare a grammar attribute with an Enum of
+ accepted keywords/literals.
"""
+
def __init__(self, keyword):
"""Adds lowercase keyword to the keyword table."""
try:
- self.grammar
+ self.grammar  # noqa: B018
except AttributeError:
- raise GrammarValueError(self.__class__.__name__ + " expects a grammar attribute (Enum).")
+ raise GrammarValueError(
+ self.__class__.__name__ + " expects a grammar attribute (Enum)."
+ )
keyword = keyword.lower()
if keyword not in Keyword.table:
@@ -63,9 +84,11 @@ def parse(cls, parser, text, pos):
if match:
# Check if the match is not in the grammar of the specific keyword class.
if match.group(0).lower() not in cls.grammar:
- result = text, SyntaxError(repr(match.group(0)) + " is not a member of " + repr(cls.grammar))
+ result = text, SyntaxError(
+ repr(match.group(0)) + " is not a member of " + repr(cls.grammar)
+ )
else:
- result = text[len(match.group(0)):], cls(match.group(0))
+ result = text[len(match.group(0)) :], cls(match.group(0))
else:
result = text, SyntaxError("expecting " + repr(cls.__name__))
return result
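
The net effect of `CaseInsensitiveKeyword.parse` is: match the class regex, then accept the token only if its lowercase form is in the class's `grammar` Enum. A standalone sketch of that check, outside pypeg2 (names here are illustrative):

    import re

    def parse_ci_keyword(text, regex, accepted):
        # Match case-insensitively, then validate against the accepted set.
        match = re.match(regex, text, re.IGNORECASE)
        if not match:
            return text, SyntaxError('expecting a keyword')
        token = match.group(0)
        if token.lower() not in accepted:
            return text, SyntaxError(repr(token) + ' is not a member of the grammar')
        return text[len(token):], token.lower()

    # 'AND', '+' and '&' all normalize to the same operator, like And below.
    print(parse_ci_keyword('AND title boson', r'(and|\+|&)', {'and', '+', '&'}))
    # -> (' title boson', 'and')
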
@@ -85,6 +108,7 @@ def __repr__(self):
class BooleanOperator(object):
"""Serves as the possible case for a boolean operator."""
+
AND = 'and'
OR = 'or'
@@ -111,13 +135,16 @@ def __init__(self, left=None, right=None):
class BooleanRule(ast.BinaryOp):
"""Represents a boolean query rule.
- This means that there is a left and right node, but also the boolean operator of the rule.
- Can be called by PyPeg framework either when constructing a boolean query (which supports implicit and) or when
- constructing a boolean query among simple values (thus, no implicit and support).
+ This means that there is a left and right node, but also the boolean
+ operator of the rule. Can be called by PyPeg framework either when
+ constructing a boolean query (which supports implicit and) or when
+ constructing a boolean query among simple values (thus, no implicit
+ and support).
- Note:
- When a BooleanRule is created from PyPeg, the format of the arguments is an iterable, when it's created from
- the custom parse method of simple value boolean query, the non-default arguments are being used.
+ Note: When a BooleanRule is created from PyPeg, the format of the
+ arguments is an iterable; when it's created from the custom parse
+ method of simple value boolean query, the non-default arguments
+ are used.
"""
def __init__(self, args, bool_op=None, right=None):
@@ -133,7 +160,7 @@ def __init__(self, args, bool_op=None, right=None):
self.left = args[0]
if len(args) == 3:
- if isinstance(args[1], And) or isinstance(args[1], Or):
+ if isinstance(args[1], (And, Or)):
self.bool_op = args[1]
else:
raise ValueError("Unexpected boolean operator: " + repr(args[1]))
@@ -143,13 +170,17 @@ def __init__(self, args, bool_op=None, right=None):
self.right = args[len(args) - 1]
def __eq__(self, other):
- return super(BooleanRule, self).__eq__(other) and type(self.bool_op) == type(other.bool_op) # noqa:E721
+ return super(BooleanRule, self).__eq__(other) and type(self.bool_op) is type(
+ other.bool_op
+ )
def __repr__(self):
- return "%s(%r, %r, %r)" % (self.__class__.__name__,
- self.left,
- self.bool_op,
- self.right)
+ return "%s(%r, %r, %r)" % (
+ self.__class__.__name__,
+ self.left,
+ self.bool_op,
+ self.right,
+ )
def __hash__(self):
return hash((self.left, self.bool_op, self.right))
@@ -158,28 +189,31 @@ def __hash__(self):
class ListRule(ast.ListOp):
def __init__(self, children):
super(ListRule, self).__init__(children)
+
+
# ########################
# #### Keywords ####
class And(CIKeyword):
- """
- The reason for defining an Enum grammar of Keywords is for populating the Keyword.table for checking whether
- terminal symbols are actually DSL keywords.
- """
+ """The reason for defining an Enum grammar of Keywords is for populating
+ the Keyword.table for checking whether terminal symbols are actually DSL
+ keywords."""
+
regex = re.compile(r"(and|\+|&)", re.IGNORECASE)
grammar = Enum(K("and"), K("+"), K("&"))
def __init__(self, *args):
- # Normalize different AND keywords (ignore the keyword argument that was passed).
+ # Normalize different AND keywords
+ # (ignore the keyword argument that was passed).
super(And, self).__init__(BooleanOperator.AND)
class Or(CIKeyword):
- """
- The reason for defining an Enum grammar of Keywords is for populating the Keyword.table for checking whether
- terminal symbols are actually DSL keywords.
- """
+ """The reason for defining an Enum grammar of Keywords is for populating
+ the Keyword.table for checking whether terminal symbols are actually DSL
+ keywords."""
+
regex = re.compile(r"(or|\|)", re.IGNORECASE)
grammar = Enum(K("or"), K("|"))
@@ -189,12 +223,14 @@ def __init__(self, *args):
class Not(CIKeyword):
- """
- The reason for defining an Enum grammar of Keywords is for populating the Keyword.table for checking whether
- terminal symbols are actually DSL keywords.
- """
+ """The reason for defining an Enum grammar of Keywords is for populating
+ the Keyword.table for checking whether terminal symbols are actually DSL
+ keywords."""
+
regex = re.compile(r"(not|-)", re.IGNORECASE)
grammar = Enum(K("not"), K("-"))
+
+
# ########################
@@ -204,7 +240,8 @@ class Whitespace(LeafRule):
class InspireKeyword(LeafRule):
- # InspireKeyword expects a word boundary at its end, excluding [.,] characters, since these might signify names.
+ # InspireKeyword expects a word boundary at its end, excluding [.,] characters,
+ # since these might signify names.
grammar = re.compile(
r"({0})(?![,.])(?=(:|\b))".format(
"|".join(INSPIRE_PARSER_NONDATE_KEYWORDS.keys())
@@ -247,20 +284,26 @@ def parse(cls, parser, text, pos):
class SimpleValueUnit(LeafRule):
- """Represents either a terminal symbol (without parentheses) or a parenthesized SimpleValue.
-
- The parenthesized case (2nd option of SimpleValueUnit) accepts a SimpleValue which is the more generic case of
- plaintext and in turn (its grammar) encapsulates whitespace and SimpleValueUnit recognition.
+ """Represents either a terminal symbol (without parentheses) or a
+ parenthesized SimpleValue.
+ The parenthesized case (2nd option of SimpleValueUnit) accepts a
+ SimpleValue which is the more generic case of plaintext and in turn
+ (its grammar) encapsulates whitespace and SimpleValueUnit
+ recognition.
"""
+
token_regex = re.compile(r"[^\s:)(]+", re.UNICODE)
- date_specifiers_regex = re.compile(r"({})\s*-\s*\d+".format('|'.join(DATE_SPECIFIERS_COLLECTION)), re.UNICODE)
+ date_specifiers_regex = re.compile(
+ r"({})\s*-\s*\d+".format('|'.join(DATE_SPECIFIERS_COLLECTION)), re.UNICODE
+ )
parenthesized_token_grammar = None # is set after SimpleValue definition.
starts_with_colon = re.compile(r"\s*:", re.UNICODE)
- """Used for recognizing whether terminal token is a keyword (i.e. followed by some whitespace and ":"."""
+ """Used for recognizing whether terminal token is a keyword (i.e. followed
+ by some whitespace and ":"."""
def __init__(self, args):
super(SimpleValueUnit, self).__init__()
@@ -273,17 +316,20 @@ def __init__(self, args):
@classmethod
def parse_terminal_token(cls, parser, text):
- """Parses a terminal token that doesn't contain parentheses nor colon symbol.
+ """Parses a terminal token that doesn't contain parentheses nor colon
+ symbol.
- Note:
- Handles a special case of tokens where a ':' is needed (for `texkey` queries).
+ Note: Handles a special case of tokens where a ':' is needed
+ (for `texkey` queries).
- If we're parsing text not in parentheses, then some DSL keywords (e.g. And, Or, Not, defined above) should
- not be recognized as terminals, thus we check if they are in the Keywords table (namespace like structure
- handled by PyPeg).
- This is done only when we are not parsing a parenthesized SimpleValue.
+ If we're parsing text not in parentheses, then some DSL keywords
+ (e.g. And, Or, Not, defined above) should not be recognized as
+ terminals, thus we check if they are in the Keywords table
+ (namespace like structure handled by PyPeg). This is done only
+ when we are not parsing a parenthesized SimpleValue.
- Also, helps in supporting more implicit-and queries cases (last two checks).
+ Also, helps in supporting more implicit-and queries cases (last
+ two checks).
"""
token_regex = cls.token_regex
@@ -291,58 +337,74 @@ def parse_terminal_token(cls, parser, text):
if match:
matched_token = match.group(0)
- # Check if token is a DSL keyword. Disable this check in the case where the parser isn't parsing a
- # parenthesized terminal.
- if not parser._parsing_parenthesized_terminal and matched_token.lower() in Keyword.table:
+ # Check if token is a DSL keyword. Disable this check in the case where
+ # the parser isn't parsing a parenthesized terminal.
+ if (
+ not parser._parsing_parenthesized_terminal
+ and matched_token.lower() in Keyword.table
+ ):
return text, SyntaxError("found DSL keyword: " + matched_token)
- remaining_text = text[len(matched_token):]
+ remaining_text = text[len(matched_token) :]
- # Attempt to recognize whether current terminal is followed by a ":", which definitely signifies that
- # we are parsing a keyword, and we shouldn't.
+ # Attempt to recognize whether current terminal is followed by a ":",
+ # which definitely signifies that we are parsing a keyword,
+ # and we shouldn't.
if cls.starts_with_colon.match(remaining_text):
- return text, \
- SyntaxError("parsing a keyword (token followed by \":\"): \"" + repr(matched_token) + "\"")
+ return text, SyntaxError(
+ "parsing a keyword (token followed by \":\"): \""
+ + repr(matched_token)
+ + "\""
+ )
result = remaining_text, matched_token
else:
- result = text, SyntaxError("expecting match on " + repr(cls.token_regex.pattern))
+ result = text, SyntaxError(
+ "expecting match on " + repr(cls.token_regex.pattern)
+ )
return result
@classmethod
def parse(cls, parser, text, pos):
"""Imitates parsing a list grammar.
- Specifically, this
- grammar = [
- SimpleValueUnit.date_specifiers_regex,
- SimpleValueUnit.token_regex,
- SimpleValueUnit.parenthesized_token_grammar
- ].
+ Specifically, this
+ grammar = [
+     SimpleValueUnit.date_specifiers_regex,
+     SimpleValueUnit.token_regex,
+     SimpleValueUnit.parenthesized_token_grammar,
+ ].
- Parses plaintext which matches date specifiers or arxiv_identifier syntax, or is comprised of either 1) simple
+ Parses plaintext which matches date specifiers or
+ arxiv_identifier syntax, or is comprised of either 1) simple
terminal (no parentheses) or 2) a parenthesized SimpleValue.
- For example, "e(+)" will be parsed in two steps, first, "e" token will be recognized and then "(+)", as a
- parenthesized SimpleValue.
+ For example, "e(+)" will be parsed in two steps, first, "e"
+ token will be recognized and then "(+)", as a parenthesized
+ SimpleValue.
"""
found = False
# Attempt to parse date specifier
match = cls.date_specifiers_regex.match(text)
if match:
- remaining_text, token, found = text[len(match.group(0)):], match.group(0), True
+ remaining_text, token, found = (
+ text[len(match.group(0)) :],
+ match.group(0),
+ True,
+ )
else:
# Attempt to parse a terminal token
remaining_text, token = cls.parse_terminal_token(parser, text)
- if type(token) != SyntaxError:
+ if not isinstance(token, SyntaxError):
found = True
else:
# Attempt to parse a terminal with parentheses
try:
- # Enable parsing a parenthesized terminal so that we can accept {+, -, |} as terminals.
+ # Enable parsing a parenthesized terminal so that
+ # we can accept {+, -, |} as terminals.
parser._parsing_parenthesized_terminal = True
- remaining_text, token = parser.parse(text, cls.parenthesized_token_grammar, pos)
+ remaining_text, token = parser.parse(
+ text, cls.parenthesized_token_grammar, pos
+ )
found = True
except SyntaxError:
@@ -368,7 +430,9 @@ class SimpleValueWithColonUnit(SimpleValueUnit):
class SimpleDateValueUnit(LeafRule):
grammar = re.compile(r"[\d*\-\.\/]{4,10}(?=($|\s|\)))", re.UNICODE)
- date_specifiers_regex = re.compile(r"({})\s*(-\s*\d+)?".format('|'.join(DATE_SPECIFIERS_COLLECTION)), re.UNICODE)
+ date_specifiers_regex = re.compile(
+ r"({})\s*(-\s*\d+)?".format('|'.join(DATE_SPECIFIERS_COLLECTION)), re.UNICODE
+ )
string_month_date_regex = re.compile(MONTH_REGEX, re.IGNORECASE)
def __init__(self, args):
@@ -389,7 +453,9 @@ def _parse_date_with_string_month(cls, text):
remaining_text = text[date_end_index:]
result = remaining_text, found_date_string
except StopIteration:
- result = text, SyntaxError("expecting match on " + repr(cls.string_month_date_regex.pattern))
+ result = text, SyntaxError(
+ "expecting match on " + repr(cls.string_month_date_regex.pattern)
+ )
return result
@classmethod
@@ -399,7 +465,7 @@ def parse(cls, parser, text, pos):
match = cls.date_specifiers_regex.match(text)
string_month_date_match = cls.string_month_date_regex.match(text)
if match:
- remaining_text, token = text[len(match.group(0)):], match.group(0)
+ remaining_text, token = text[len(match.group(0)) :], match.group(0)
elif string_month_date_match:
remaining_text, token = cls._parse_date_with_string_month(text)
else:
@@ -411,7 +477,7 @@ def parse(cls, parser, text, pos):
raise
except ValueError:
pass
- if token and type(token) != SyntaxError:
+ if token and not isinstance(token, SyntaxError):
result = remaining_text, cls(token)
else:
result = text, SyntaxError("expecting match on " + cls.__name__)
@@ -431,32 +497,49 @@ def __init__(self, values):
E.g. title top cross section, or title Si-28(p(pol.), n(pol.)).
"""
+
@staticmethod
- def unconsume_and_reconstruct_input(remaining_text, recognized_tokens, complex_value_idx):
- """Reconstruct input in case of consuming a keyword query or a value query with ComplexValue as value.
-
- Un-consuming at most 3 elements and specifically (Keyword,) Whitespace and ComplexValue, while also
- reconstructing parser's input text.
-
- Example:
- Given this query "author foo t 'bar'", r would be:
- r = [SimpleValueUnit("foo"), Whitespace(" "), SimpleValueUnit("t"), Whitespace(" "),
- SimpleValueUnit("'bar'")]
- thus after this method, r would be [SimpleValueUnit("foo"), Whitespace(" ")], while initial text will
- have been reconstructed as "t 'bar' rest_of_the_text".
+ def unconsume_and_reconstruct_input(
+ remaining_text, recognized_tokens, complex_value_idx
+ ):
+ """Reconstruct input in case of consuming a keyword query or a value
+ query with ComplexValue as value.
+
+ Un-consuming at most 3 elements and specifically (Keyword,)
+ Whitespace and ComplexValue, while also reconstructing parser's
+ input text.
+
+ Example: Given this query "author foo t 'bar'", r would be:
+     r = [SimpleValueUnit("foo"), Whitespace(" "), SimpleValueUnit("t"),
+          Whitespace(" "), SimpleValueUnit("'bar'")]
+ thus after this method, r would be [SimpleValueUnit("foo"),
+ Whitespace(" ")], while initial text will have been reconstructed
+ as "t 'bar' rest_of_the_text".
"""
- # Default slicing index: i.e. at most 3 elements will be unconsumed, Keyword, Whitespace and ComplexValue.
+ # Default slicing index: i.e. at most 3 elements will be unconsumed, Keyword,
+ # Whitespace and ComplexValue.
slicing_start_idx = 2
- # Check whether the 3rd element from the end is an InspireKeyword. If not, a Value query with ComplexValue
- # was consumed.
- if not INSPIRE_PARSER_KEYWORDS.get(recognized_tokens[complex_value_idx - slicing_start_idx].value, None):
+ # Check whether the 3rd element from the end is an InspireKeyword. If not,
+ # a Value query with ComplexValue was consumed.
+ if not INSPIRE_PARSER_KEYWORDS.get(
+ recognized_tokens[complex_value_idx - slicing_start_idx].value, None
+ ):
slicing_start_idx = 1
- reconstructed_terminals = recognized_tokens[:complex_value_idx - slicing_start_idx]
+ reconstructed_terminals = recognized_tokens[
+ : complex_value_idx - slicing_start_idx
+ ]
reconstructed_text = '{} {}'.format(
- ''.join([token.value for token in recognized_tokens[complex_value_idx - slicing_start_idx:]]),
- remaining_text
+ ''.join(
+ [
+ token.value
+ for token in recognized_tokens[
+ complex_value_idx - slicing_start_idx :
+ ]
+ ]
+ ),
+ remaining_text,
)
return reconstructed_text, reconstructed_terminals
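
The docstring's example can be replayed with plain strings to see what gets pushed back into the parser's input: the slice keeps everything before the (keyword, whitespace, complex value) triple and re-joins the rest with the remaining text. A toy version of the slicing (strings stand in for the rule objects):

    # Toy replay of unconsume_and_reconstruct_input for "author foo t 'bar'".
    recognized = ["foo", " ", "t", " ", "'bar'"]
    complex_value_idx = 4   # index of "'bar'"
    slicing_start_idx = 2   # "t" is an INSPIRE keyword: unconsume 3 tokens

    kept = recognized[:complex_value_idx - slicing_start_idx]
    pushed_back = ''.join(recognized[complex_value_idx - slicing_start_idx:])
    print(kept)                                 # ['foo', ' ']
    print('{} {}'.format(pushed_back, 'rest'))  # t 'bar' rest
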
@@ -465,15 +548,22 @@ def parse(cls, parser, text, pos):
try:
remaining_text, recognized_tokens = parser.parse(text, cls.grammar)
- # Covering a case of implicit-and when one of the SimpleValue tokens is a ComplexValue.
- # This means we either have a KeywordQuery or a ValueQuery with a ComplexValue.
- # E.g. "author foo t 'bar'", since 'bar' is a ComplexValue, then the previous token is a keyword.
+ # Covering a case of implicit-and when one of the SimpleValue tokens
+ # is a ComplexValue.
+ # This means we either have a KeywordQuery or a ValueQuery
+ # with a ComplexValue.
+ # E.g. "author foo t 'bar'", since 'bar' is a ComplexValue,
+ # then the previous token is a keyword.
# This means we have consumed a KeywordQuery (due to 'and' missing).
- # Same goes for "author foo 'bar'", but in this case we have a ValueQuery with a ComplexValue.
+ # Same goes for "author foo 'bar'", but in this case we have a ValueQuery
+ # with a ComplexValue.
found_complex_value = False
for idx, token in enumerate(recognized_tokens):
if ComplexValue.regex.match(token.value):
- reconstructed_text, reconstructed_terminals = cls.unconsume_and_reconstruct_input(
+ (
+ reconstructed_text,
+ reconstructed_terminals,
+ ) = cls.unconsume_and_reconstruct_input(
remaining_text, recognized_tokens, idx
)
found_complex_value = True
@@ -495,15 +585,27 @@ class SimpleValue(SimpleValueGeneric):
E.g. title top cross section, or title Si-28(p(pol.), n(pol.)).
"""
- grammar = contiguous([SimpleValueUnit, SimpleValueWithColonUnit], maybe_some((optional(Whitespace), some(SimpleValueUnit))))
+
+ grammar = contiguous(
+ [SimpleValueUnit, SimpleValueWithColonUnit],
+ maybe_some((optional(Whitespace), some(SimpleValueUnit))),
+ )
class SimpleDateValue(SimpleValueGeneric):
grammar = contiguous(SimpleDateValueUnit, optional(Whitespace))
-SimpleValueUnit.parenthesized_token_grammar = (re.compile(r"\("), SimpleValue, re.compile(r"\)"))
-SimpleDateValueUnit.parenthesized_token_grammar = (re.compile(r"\("), SimpleDateValue, re.compile(r"\)"))
+SimpleValueUnit.parenthesized_token_grammar = (
+ re.compile(r"\("),
+ SimpleValue,
+ re.compile(r"\)"),
+)
+SimpleDateValueUnit.parenthesized_token_grammar = (
+ re.compile(r"\("),
+ SimpleDateValue,
+ re.compile(r"\)"),
+)
# ################################################## #
@@ -511,16 +613,19 @@ class SimpleDateValue(SimpleValueGeneric):
# ################################################## #
class SimpleValueNegation(UnaryRule):
"""Negation accepting only SimpleValues."""
+
grammar = omit(Not), attr('op', SimpleValue)
class SimpleDateValueNegation(UnaryRule):
"""Negation accepting only SimpleValues."""
+
grammar = omit(Not), attr('op', SimpleDateValue)
class SimpleValueBooleanQuery(BooleanRule):
- """For supporting queries like author ellis or smith and not Vanderhaeghen."""
+ """For supporting queries like author ellis or smith and not
+ Vanderhaeghen."""
@classmethod
def parse(cls, parser, text, pos):
@@ -531,14 +636,16 @@ def parse(cls, parser, text, pos):
text_after_left_op, left_operand = parser.parse(text, cls.grammar[0])
# Parse boolean operators
- text_after_bool_op, operator = parser.parse(text_after_left_op, cls.grammar[1])
+ text_after_bool_op, operator = parser.parse(
+ text_after_left_op, cls.grammar[1]
+ )
if not operator: # Implicit AND at terminals level
operator = And(BooleanOperator.AND)
# Parse right operand.
# We don't want to eagerly recognize anything else other than a SimpleValue.
- # So we attempt to recognize the more specific rules, and if we do, then we need to stop identifying this
- # rule.
+ # So we attempt to recognize the more specific rules, and if we do,
+ # then we need to stop identifying this rule.
parser.parse(
text_after_bool_op,
[
@@ -548,22 +655,24 @@ def parse(cls, parser, text, pos):
SpiresDateKeywordQuery,
InvenioKeywordQuery,
SpiresKeywordQuery,
- ]
- ),
+ ],
+ ),
[
RangeOp,
GreaterEqualOp,
LessEqualOp,
GreaterThanOp,
LessThanOp,
- ComplexValue
- ]
- ]
+ ComplexValue,
+ ],
+ ],
)
# Identified something other than a SimpleValue, stop parsing this rule.
- result = text, SyntaxError("expected simple value related rule as right operand of a " +
- cls.__name__)
+ result = text, SyntaxError(
+ "expected simple value related rule as right operand of a "
+ + cls.__name__
+ )
except SyntaxError as e:
result = text, e
@@ -571,13 +680,14 @@ def parse(cls, parser, text, pos):
if left_operand and operator:
# Attempt to parse a right operand
try:
- remaining_text, right_operand = parser.parse(text_after_bool_op, cls.grammar[2])
+ remaining_text, right_operand = parser.parse(
+ text_after_bool_op, cls.grammar[2]
+ )
result = remaining_text, SimpleValueBooleanQuery(
- left_operand,
- bool_op=operator,
- right=right_operand
+ left_operand, bool_op=operator, right=right_operand
)
- except SyntaxError as e: # Actual failure of parsing boolean query at terminals level
+ # Actual failure of parsing boolean query at terminals level
+ except SyntaxError as e:
return text, e
return result
@@ -591,9 +701,7 @@ def parse(cls, parser, text, pos):
SimpleDateValueNegation,
SimpleDateValue,
],
-
[And, Or, None],
-
# Right operand options
[
SimpleValueBooleanQuery,
@@ -601,13 +709,19 @@ def parse(cls, parser, text, pos):
SimpleValue,
SimpleDateValueNegation,
SimpleDateValue,
- ]
+ ],
)
class ParenthesizedSimpleValues(UnaryRule):
- """Parses parenthesized simple values along with boolean operations on them."""
- grammar = omit(Literal("(")), [SimpleValueBooleanQuery, SimpleValueNegation, SimpleValue], omit(Literal(")"))
+ """Parses parenthesized simple values along with boolean operations on
+ them."""
+
+ grammar = (
+ omit(Literal("(")),
+ [SimpleValueBooleanQuery, SimpleValueNegation, SimpleValue],
+ omit(Literal(")")),
+ )
@classmethod
def parse(cls, parser, text, pos):
@@ -620,21 +734,26 @@ def parse(cls, parser, text, pos):
return text, e
finally:
parser._parsing_parenthesized_simple_values_expression = False
+
+
# ######################################## #
class ComplexValue(LeafRule):
- """Accepting value with either single/double quotes or a regex value (/^.../$).
+ """Accepting value with either single/double quotes or a regex value
+ (/^.../$).
- These values have special and different meaning for the later phases of parsing:
- * Single quotes: partial text matching (text is analyzed before searched)
- * Double quotes: exact text matching
- * Regex: regex searches
+ These values have special and different meaning for the later phases
+ of parsing:
+     * Single quotes: partial text matching (text is analyzed before searched)
+     * Double quotes: exact text matching
+     * Regex: regex searches
E.g. t 'Millisecond pulsar velocities'.
- This makes no difference for the parser and will be handled at a later parsing phase.
+ This makes no difference for the parser and will be handled at a
+ later parsing phase.
"""
+
EXACT_VALUE_TOKEN = '"'
PARTIAL_VALUE_TOKEN = '\''
REGEX_VALUE_TOKEN = '/'
@@ -652,7 +771,10 @@ class GreaterThanOp(UnaryRule):
Supports queries like author-count > 2000 or date after 10-2000.
"""
- grammar = omit(re.compile(r"after|>", re.IGNORECASE)), attr('op', [SimpleDateValue, SimpleValue])
+
+ grammar = omit(re.compile(r"after|>", re.IGNORECASE)), attr(
+ 'op', [SimpleDateValue, SimpleValue]
+ )
class GreaterEqualOp(UnaryRule):
@@ -660,11 +782,16 @@ class GreaterEqualOp(UnaryRule):
Supports queries like date >= 10-2000 or topcite 200+.
"""
+
grammar = [
(omit(Literal(">=")), attr('op', [SimpleDateValue, SimpleValue])),
- # Accept a number or numbers that are separated with (/ or -) followed by a "-" which should be
- # followed by \s or ) or end of input so that you don't accept a value like 1-e.
- (attr('op', re.compile(r"\d+([/-]\d+)*(?=\+)")), omit(re.compile(r'\+(?=\s|\)|$)'))),
+ # Accept a number or numbers that are separated with (/ or -)
+ # followed by a "+" which should be followed by \s or ) or
+ # end of input so that you don't accept a value like 1+e.
+ (
+ attr('op', re.compile(r"\d+([/-]\d+)*(?=\+)")),
+ omit(re.compile(r'\+(?=\s|\)|$)')),
+ ),
]
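
The two regexes in the second alternative work as a pair: the number must be immediately followed by a '+', and that '+' must end the token. Checking them directly on a few illustrative values:

    import re

    op_regex = re.compile(r"\d+([/-]\d+)*(?=\+)")
    plus_regex = re.compile(r"\+(?=\s|\)|$)")

    for value in ["200+", "2000-10+", "1+e", "100+x"]:
        match = op_regex.match(value)
        ok = bool(match) and bool(plus_regex.match(value[match.end():]))
        print(value, '->', 'accepted' if ok else 'rejected')
    # 200+ and 2000-10+ are accepted; 1+e and 100+x are rejected.
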
@@ -673,7 +800,10 @@ class LessThanOp(UnaryRule):
Supports queries like author-count < 100 or date before 1984.
"""
- grammar = omit(re.compile(r"before|<", re.IGNORECASE)), attr('op', [SimpleDateValue, SimpleValue])
+
+ grammar = omit(re.compile(r"before|<", re.IGNORECASE)), attr(
+ 'op', [SimpleDateValue, SimpleValue]
+ )
class LessEqualOp(UnaryRule):
@@ -684,8 +814,9 @@ class LessEqualOp(UnaryRule):
grammar = [
(omit(Literal("<=")), attr("op", [SimpleDateValue, SimpleValue])),
- # Accept a number or numbers that are separated with (/ or -) followed by a "-" which should be
- # followed by \s or ) or end of input so that you don't accept a value like 1-e.
+ # Accept a number or numbers that are separated with (/ or -) followed
+ # by a "-" which should be followed by \s or ) or end of input
+ # so that you don't accept a value like 1-e.
(
attr("op", re.compile(r"\d+([/-]\d+)*(?=-)")),
omit(re.compile(r"-(?=\s|\)|$)")),
@@ -696,16 +827,17 @@ class LessEqualOp(UnaryRule):
class RangeOp(BinaryRule):
"""Range operator mixing any type of values.
- E.g. muon decay year:1983->1992
- author:"Ellis, J"->"Ellis, Qqq"
- author:"Ellis, J"->Ellis, M
+ E.g. muon decay year:1983->1992
+ author:"Ellis, J"->"Ellis, Qqq"
+ author:"Ellis, J"->Ellis, M
The non symmetrical type of values will be handled at a later phase.
"""
- grammar = \
- attr('left', [ComplexValue, SimpleRangeValue]), \
- omit(Literal("->")), \
- attr('right', [ComplexValue, SimpleRangeValue])
+
+ grammar = (
+ attr('left', [ComplexValue, SimpleRangeValue]),
+ omit(Literal("->")),
+ attr('right', [ComplexValue, SimpleRangeValue]),
+ )
class Value(UnaryRule):
@@ -713,22 +845,26 @@ class Value(UnaryRule):
Serves as an encapsulation of the listed rules.
"""
- grammar = attr('op', [
- (optional(omit(Literal("="))), RangeOp),
- GreaterEqualOp,
- LessEqualOp,
- GreaterThanOp,
- LessThanOp,
- (
- optional(omit(Literal("="))),
- [
- ComplexValue,
- ParenthesizedSimpleValues,
- SimpleValueBooleanQuery,
- SimpleValue
- ]
- )
- ])
+
+ grammar = attr(
+ 'op',
+ [
+ (optional(omit(Literal("="))), RangeOp),
+ GreaterEqualOp,
+ LessEqualOp,
+ GreaterThanOp,
+ LessThanOp,
+ (
+ optional(omit(Literal("="))),
+ [
+ ComplexValue,
+ ParenthesizedSimpleValues,
+ SimpleValueBooleanQuery,
+ SimpleValue,
+ ],
+ ),
+ ],
+ )
class DateValue(UnaryRule):
@@ -736,45 +872,52 @@ class DateValue(UnaryRule):
Serves as an encapsulation of the listed rules.
"""
- grammar = attr('op', [
- (optional(omit(Literal("="))), RangeOp),
- GreaterEqualOp,
- LessEqualOp,
- GreaterThanOp,
- LessThanOp,
- (
- optional(omit(Literal("="))),
- [
- ComplexValue,
- SimpleValueBooleanQuery,
- SimpleDateValue
- ]
- )
- ])
+
+ grammar = attr(
+ 'op',
+ [
+ (optional(omit(Literal("="))), RangeOp),
+ GreaterEqualOp,
+ LessEqualOp,
+ GreaterThanOp,
+ LessThanOp,
+ (
+ optional(omit(Literal("="))),
+ [ComplexValue, SimpleValueBooleanQuery, SimpleDateValue],
+ ),
+ ],
+ )
+
+
########################
class InvenioKeywordQuery(BinaryRule):
"""Keyword queries with colon separator (i.e. Invenio style).
- There needs to be a distinction between Invenio and SPIRES keyword queries, so as the parser is able to recognize
- any terminal as keyword for the former ones.
+ There needs to be a distinction between Invenio and SPIRES keyword
+ queries, so as the parser is able to recognize any terminal as
+ keyword for the former ones.
- Note:
- E.g. author: ellis, title: boson, or unknown_keyword: foo.
+ Note: E.g. author: ellis, title: boson, or unknown_keyword: foo.
"""
- grammar = attr('left', [[InspireKeyword, InspireDateKeyword], re.compile(r"[^\s:]+")]), \
- omit(':'), \
- attr('right', Value)
+
+ grammar = (
+ attr('left', [[InspireKeyword, InspireDateKeyword], re.compile(r"[^\s:]+")]),
+ omit(':'),
+ attr('right', Value),
+ )
class SpiresKeywordQuery(BinaryRule):
"""Keyword queries with space separator (i.e. Spires style)."""
+
grammar = attr('left', InspireKeyword), attr('right', Value)
class SpiresDateKeywordQuery(BinaryRule):
"""Keyword queries with pace separator (i.e. Spires style)."""
+
grammar = attr('left', InspireDateKeyword), attr('right', DateValue)
@@ -783,13 +926,17 @@ class SimpleQuery(UnaryRule):
These are comprised of metadata queries, keywords and value queries.
"""
- grammar = attr('op', [
- InvenioKeywordQuery,
- SpiresDateKeywordQuery,
- SpiresKeywordQuery,
- Value,
- DateValue,
- ])
+
+ grammar = attr(
+ 'op',
+ [
+ InvenioKeywordQuery,
+ SpiresDateKeywordQuery,
+ SpiresKeywordQuery,
+ Value,
+ DateValue,
+ ],
+ )
class Statement(UnaryRule):
@@ -797,26 +944,30 @@ class Statement(UnaryRule):
Supports queries chaining, see its grammar for more information.
"""
+
pass
class Expression(UnaryRule):
"""A generic query expression.
- Serves as a more restrictive rule than Statement.
- This is useful for eliminating left recursion in the grammar (requirement for PEGs) when used in binary queries as
- left hand side production rule.
+ Serves as a more restrictive rule than Statement. This is useful for
+ eliminating left recursion in the grammar (requirement for PEGs)
+ when used in binary queries as left hand side production rule.
"""
+
pass
class NotQuery(UnaryRule):
"""Negation query."""
+
grammar = omit(Not), attr('op', Expression)
class ParenthesizedQuery(UnaryRule):
"""Parenthesized query for denoting precedence."""
+
grammar = omit(Literal('(')), attr('op', Statement), omit(Literal(')'))
@@ -825,37 +976,45 @@ class NestedKeywordQuery(BinaryRule):
E.g. citedby:author:hui and refersto:author:witten
"""
+
pass
-Expression.grammar = attr('op', [
- NotQuery,
- NestedKeywordQuery,
- ParenthesizedQuery,
- SimpleQuery,
-])
+Expression.grammar = attr(
+ 'op',
+ [
+ NotQuery,
+ NestedKeywordQuery,
+ ParenthesizedQuery,
+ SimpleQuery,
+ ],
+)
-NestedKeywordQuery.grammar = \
- attr('left', [
- # Most specific regex must be higher.
- re.compile(r'citedbyexcludingselfcites', re.IGNORECASE),
- re.compile(r'citedbyx', re.IGNORECASE),
- re.compile(r'citedby', re.IGNORECASE),
- re.compile(r'referstoexcludingselfcites', re.IGNORECASE),
- re.compile(r'referstox', re.IGNORECASE),
- re.compile(r'refersto', re.IGNORECASE),
- ]), \
- optional(omit(":")), \
- attr('right', Expression)
+NestedKeywordQuery.grammar = (
+ attr(
+ 'left',
+ [
+ # Most specific regex must be higher.
+ re.compile(r'citedbyexcludingselfcites', re.IGNORECASE),
+ re.compile(r'citedbyx', re.IGNORECASE),
+ re.compile(r'citedby', re.IGNORECASE),
+ re.compile(r'referstoexcludingselfcites', re.IGNORECASE),
+ re.compile(r'referstox', re.IGNORECASE),
+ re.compile(r'refersto', re.IGNORECASE),
+ ],
+ ),
+ optional(omit(":")),
+ attr('right', Expression),
+)
class BooleanQuery(BooleanRule):
- """Represents boolean query as a binary rule.
+ """Represents boolean query as a binary rule."""
- """
grammar = Expression, [And, Or, None], Statement
+
# ########################
@@ -864,7 +1023,9 @@ class BooleanQuery(BooleanRule):
class MalformedQueryWords(ListRule):
- """Represents queries that weren't recognized by the main parsing branch of Statements."""
+ """Represents queries that weren't recognized by the main parsing branch of
+ Statements."""
+
grammar = some(re.compile(r"[^\s]+", re.UNICODE))
def __init__(self, children):
@@ -884,13 +1045,15 @@ def __repr__(self):
class Query(ListRule):
"""The entry-point for the grammar.
- Find keyword is ignored as the current grammar is an augmentation of SPIRES and Invenio style syntaxes.
- It only serves for backward compatibility with SPIRES syntax.
+ Find keyword is ignored as the current grammar is an augmentation of
+ SPIRES and Invenio style syntaxes. It only serves for backward
+ compatibility with SPIRES syntax.
"""
+
grammar = [
(
omit(optional(re.compile(r"(find|fin|fi|f)\s", re.IGNORECASE))),
- (Statement, maybe_some(MalformedQueryWords))
+ (Statement, maybe_some(MalformedQueryWords)),
),
MalformedQueryWords,
EmptyQuery,
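
A side note on the entry point: the omitted optional regex is what makes the SPIRES `find` prefix (and its `fin`/`fi`/`f` shorthands) invisible to the rest of the grammar, while leaving words that merely start with "f" alone:

    import re

    find_prefix = re.compile(r"(find|fin|fi|f)\s", re.IGNORECASE)

    for q in ["find a ellis", "F a ellis", "fulltext:boson"]:
        match = find_prefix.match(q)
        # Strip the prefix exactly as omit(optional(...)) does above.
        print(repr(q[match.end():] if match else q))
    # -> 'a ellis', 'a ellis', 'fulltext:boson'
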
diff --git a/inspire_query_parser/parsing_driver.py b/inspire_query_parser/parsing_driver.py
index 4c10299..f122211 100644
--- a/inspire_query_parser/parsing_driver.py
+++ b/inspire_query_parser/parsing_driver.py
@@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
-
"""This module provides the public API of INSPIRE query parser."""
from __future__ import absolute_import, print_function, unicode_literals
@@ -31,32 +30,36 @@
from inspire_query_parser.parser import Query
from inspire_query_parser.stateful_pypeg_parser import StatefulParser
from inspire_query_parser.utils.format_parse_tree import emit_tree_format
-from inspire_query_parser.visitors.elastic_search_visitor import \
- ElasticSearchVisitor
-from inspire_query_parser.visitors.restructuring_visitor import \
- RestructuringVisitor
+from inspire_query_parser.visitors.elastic_search_visitor import ElasticSearchVisitor
+from inspire_query_parser.visitors.restructuring_visitor import RestructuringVisitor
logger = logging.getLogger(__name__)
def parse_query(query_str):
- """
- Drives the whole logic, by parsing, restructuring and finally, generating an ElasticSearch query.
+ """Drives the whole logic, by parsing, restructuring and finally,
+ generating an ElasticSearch query.
- Args:
- query_str (six.text_types): the given query to be translated to an ElasticSearch query
+ Args:
+     query_str (six.text_types): the given query to be translated
+         to an ElasticSearch query
- Returns:
- six.text_types: Return an ElasticSearch query.
+ Returns:
+     six.text_types: Return an ElasticSearch query.
- Notes:
- In case there's an error, an ElasticSearch `multi_match` query is generated with its `query` value, being the
- query_str argument.
+ Notes:
+     In case there's an error, an ElasticSearch `multi_match` query
+     is generated with its `query` value being the query_str argument.
"""
+
def _generate_match_all_fields_query():
# Strip colon character (special character for ES)
stripped_query_str = ' '.join(query_str.replace(':', ' ').split())
- return {'multi_match': {'query': stripped_query_str, 'fields': ['_all'], 'zero_terms_query': 'all'}}
+ return {
+ 'multi_match': {
+ 'query': stripped_query_str,
+ 'fields': ['_all'],
+ 'zero_terms_query': 'all',
+ }
+ }
if not isinstance(query_str, six.text_type):
query_str = six.text_type(query_str.decode('utf-8'))
@@ -71,8 +74,13 @@ def _generate_match_all_fields_query():
unrecognized_text, parse_tree = parser.parse(query_str, Query)
if unrecognized_text: # Usually, should never happen.
- msg = 'Parser returned unrecognized text: "' + unrecognized_text + \
- '" for query: "' + query_str + '".'
+ msg = (
+ 'Parser returned unrecognized text: "'
+ + unrecognized_text
+ + '" for query: "'
+ + query_str
+ + '".'
+ )
if query_str == unrecognized_text and parse_tree is None:
# Didn't recognize anything.
@@ -83,18 +91,26 @@ def _generate_match_all_fields_query():
logger.warn(msg)
except SyntaxError as e:
- logger.warn('Parser syntax error (' + six.text_type(e) + ') with query: "' + query_str +
- '". Continuing with a match_all with the given query.')
+ logger.warn(
+ 'Parser syntax error ('
+ + six.text_type(e)
+ + ') with query: "'
+ + query_str
+ + '". Continuing with a match_all with the given query.'
+ )
return _generate_match_all_fields_query()
- # Try-Catch-all exceptions for visitors, so that search functionality never fails for the user.
+ # Try-Catch-all exceptions for visitors, so that search functionality
+ # never fails for the user.
try:
restructured_parse_tree = parse_tree.accept(rst_visitor)
logger.debug('Parse tree: \n' + emit_tree_format(restructured_parse_tree))
except Exception as e:
logger.exception(
- RestructuringVisitor.__name__ + " crashed" + (": " + six.text_type(e) + ".") if six.text_type(e) else '.'
+            RestructuringVisitor.__name__
+            + " crashed"
+            + ((": " + six.text_type(e) + ".") if six.text_type(e) else '.')
)
return _generate_match_all_fields_query()
@@ -102,12 +118,15 @@ def _generate_match_all_fields_query():
es_query = restructured_parse_tree.accept(es_visitor)
except Exception as e:
logger.exception(
- ElasticSearchVisitor.__name__ + " crashed" + (": " + six.text_type(e) + ".") if six.text_type(e) else '.'
+            ElasticSearchVisitor.__name__
+            + " crashed"
+            + ((": " + six.text_type(e) + ".") if six.text_type(e) else '.')
)
return _generate_match_all_fields_query()
if not es_query:
- # Case where an empty query was generated (i.e. date query with malformed date, e.g. "d < 200").
+ # Case where an empty query was generated (i.e. date query with malformed
+ # date, e.g. "d < 200").
return _generate_match_all_fields_query()
return es_query
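A minimal usage sketch of the public driver above (assuming the package is installed; the query string is illustrative):

    from inspire_query_parser.parsing_driver import parse_query

    # A SPIRES-style query is translated into an ElasticSearch query dict.
    es_query = parse_query(u"a ellis and date > 2015")
    # On any parse or visitor failure, the same call falls back to the
    # `multi_match` query over '_all' built by
    # _generate_match_all_fields_query, so callers always receive a dict.
    print(es_query)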
diff --git a/inspire_query_parser/stateful_pypeg_parser.py b/inspire_query_parser/stateful_pypeg_parser.py
index 02fbe9c..d89ac48 100644
--- a/inspire_query_parser/stateful_pypeg_parser.py
+++ b/inspire_query_parser/stateful_pypeg_parser.py
@@ -26,19 +26,22 @@
class StatefulParser(Parser):
"""Defines a stateful parser for encapsulating parsing flags functionality.
- Attributes:
- _parsing_parenthesized_terminal (bool):
- Signifies whether the parser is trying to identify a parenthesized terminal. Used for disabling the
- terminals parsing related check "stop on DSL keyword", for allowing to parse symbols such as "+", "-" which
- are also DSL keywords ('and' and 'not' respectively).
+    Attributes:
+        _parsing_parenthesized_terminal (bool): Signifies whether the
+            parser is trying to identify a parenthesized terminal. Used
+            for disabling the terminals parsing related check "stop on
+            DSL keyword", to allow parsing symbols such as "+", "-",
+            which are also DSL keywords ('and' and 'not' respectively).
- _parsing_parenthesized_simple_values_expression (bool):
- Signifies whether we are parsing a parenthesized simple values expression. Used for disabling the simple
- values parsing related check "stop on INSPIRE keyword", for allowing parsing more expressions and not
- restrict the input accepted by the parser.
+        _parsing_parenthesized_simple_values_expression (bool):
+            Signifies whether we are parsing a parenthesized simple
+            values expression. Used for disabling the simple values
+            parsing related check "stop on INSPIRE keyword", to allow
+            parsing more expressions without restricting the input
+            accepted by the parser.
- _parsing_texkey_expression (bool):
- Signifies whether we are parsing a `texkey` expression which has special value in which we must accept ':'.
+        _parsing_texkey_expression (bool): Signifies whether we are
+            parsing a `texkey` expression, which has a special value in
+            which we must accept ':'.
"""
def __init__(self):
diff --git a/inspire_query_parser/utils/format_parse_tree.py b/inspire_query_parser/utils/format_parse_tree.py
index dde7db0..4784716 100644
--- a/inspire_query_parser/utils/format_parse_tree.py
+++ b/inspire_query_parser/utils/format_parse_tree.py
@@ -24,22 +24,20 @@
import six
+from inspire_query_parser.ast import BinaryOp, Leaf, ListOp, UnaryOp
from inspire_query_parser.parser import BooleanRule
-from ..ast import BinaryOp, Leaf, ListOp, UnaryOp
-
INDENTATION = 4
def emit_tree_format(tree, verbose=False):
"""Returns a tree representation of a parse tree.
- Arguments:
- tree: the parse tree whose tree representation is to be generated
- verbose (bool): if True prints the parse tree to be formatted
+    Arguments:
+        tree: the parse tree whose tree representation is to be generated
+        verbose (bool): if True, prints the parse tree to be formatted
- Returns:
- str: tree-like representation of the parse tree
+    Returns:
+        str: tree-like representation of the parse tree
"""
if verbose:
print("Converting: " + repr(tree))
@@ -65,14 +63,19 @@ def __recursive_formatter(node, level=-INDENTATION):
new_level = INDENTATION + level
if isinstance(node, Leaf):
- value = "" if not repr(node.value) else node.__class__.__name__ \
- + " {" + (node.value if node.value else "") + "}"
+ value = (
+ ""
+ if not repr(node.value)
+ else node.__class__.__name__
+ + " {"
+ + (node.value if node.value else "")
+ + "}"
+ )
ret_str = __emit_symbol_at_level_str(value, new_level) if value != "" else ""
elif isinstance(node, six.text_type):
- value = "" if not repr(node) or repr(node) == "None" \
- else "Text {" + node + "}"
+ value = "" if not repr(node) or repr(node) == "None" else "Text {" + node + "}"
ret_str = __emit_symbol_at_level_str(value, new_level) if value != "" else ""
@@ -88,7 +91,7 @@ def __recursive_formatter(node, level=-INDENTATION):
if isinstance(node, BooleanRule):
ret_str = __emit_symbol_at_level_str(
node.__class__.__name__ + " {" + str(node.bool_op) + "}",
- new_level
+ new_level,
)
except AttributeError:
pass
diff --git a/inspire_query_parser/utils/visitor_utils.py b/inspire_query_parser/utils/visitor_utils.py
index 97902a3..a648e39 100644
--- a/inspire_query_parser/utils/visitor_utils.py
+++ b/inspire_query_parser/utils/visitor_utils.py
@@ -22,35 +22,39 @@
from __future__ import absolute_import, unicode_literals
-from datetime import date
-
+import contextlib
import json
-from dateutil.relativedelta import relativedelta
-from dateutil.parser import parse
import re
-from unidecode import unidecode
-
-from inspire_utils.name import ParsedName
+from datetime import date
+from dateutil.parser import parse
+from dateutil.relativedelta import relativedelta
from inspire_utils.date import PartialDate
+from inspire_utils.name import ParsedName
+from unidecode import unidecode
from inspire_query_parser.ast import GenericValue
-from inspire_query_parser.config import (DATE_LAST_MONTH_REGEX_PATTERN,
- DATE_SPECIFIERS_COLLECTION,
- DATE_THIS_MONTH_REGEX_PATTERN,
- DATE_TODAY_REGEX_PATTERN,
- DATE_YESTERDAY_REGEX_PATTERN)
-
-
-NAME_INITIAL_FOLLOWED_BY_FIRSTNAME_WITHOUT_SPACE = re.compile(r"(\.[a-z])", re.IGNORECASE)
-QUERY_STRING_QUERY_SPECIAL_CHARACTERS = re.compile(r'\/|\+|\-|\=|\&\&|\|\||\>|\<|\!|\(|\)|\{|\}|\[|\]|\^|\"|\~|\?|\:|\\')
+from inspire_query_parser.config import (
+ DATE_LAST_MONTH_REGEX_PATTERN,
+ DATE_SPECIFIERS_COLLECTION,
+ DATE_THIS_MONTH_REGEX_PATTERN,
+ DATE_TODAY_REGEX_PATTERN,
+ DATE_YESTERDAY_REGEX_PATTERN,
+)
+
+NAME_INITIAL_FOLLOWED_BY_FIRSTNAME_WITHOUT_SPACE = re.compile(
+ r"(\.[a-z])", re.IGNORECASE
+)
+QUERY_STRING_QUERY_SPECIAL_CHARACTERS = re.compile(
+ r'\/|\+|\-|\=|\&\&|\|\||\>|\<|\!|\(|\)|\{|\}|\[|\]|\^|\"|\~|\?|\:|\\'
+)
def retokenize_first_names(names):
"""Handle corner cases where the intial and firstname has no space.
- Example:
- For queries ``J.David`` we be split into ``J`` and ``David``.
+    Example:
+        A query like ``J.David`` will be split into ``J`` and ``David``.
"""
names_filtered = []
for name in names:
@@ -70,24 +74,24 @@ def is_initial_of_a_name(name_part):
def author_name_contains_fullnames(author_name):
- """Recognizes whether the name contains full name parts and not initials or only lastname.
+ """Recognizes whether the name contains full name parts and not initials or
+ only lastname.
- Returns:
- bool: True if name has only full name parts, e.g. 'Ellis John', False otherwise. So for example, False is
- returned for 'Ellis, J.' or 'Ellis'.
+    Returns:
+        bool: True if name has only full name parts, e.g. 'Ellis John',
+        False otherwise. So for example, False is returned for
+        'Ellis, J.' or 'Ellis'.
"""
parsed_name = ParsedName(author_name)
- if len(parsed_name) == 1:
- return False
- elif any([is_initial_of_a_name(name_part) for name_part in parsed_name]):
- return False
-
- return True
+ return not (
+ len(parsed_name) == 1
+ or any([is_initial_of_a_name(name_part) for name_part in parsed_name])
+ )
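A short sketch of the predicate above; the expected results follow directly from its docstring:

    from inspire_query_parser.utils.visitor_utils import (
        author_name_contains_fullnames,
    )

    author_name_contains_fullnames(u'Ellis John')  # True: only full name parts
    author_name_contains_fullnames(u'Ellis, J.')   # False: contains an initial
    author_name_contains_fullnames(u'Ellis')       # False: lastname only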
def _name_variation_has_only_initials(name):
"""Detects whether the name variation consists only from initials."""
+
def _is_initial(name_variation):
return len(name_variation) == 1 or u'.' in name_variation
@@ -99,42 +103,45 @@ def _is_initial(name_variation):
def generate_minimal_name_variations(author_name):
"""Generate a small number of name variations.
- Notes:
- Unidecodes the name, so that we use its transliterated version, since this is how the field is being indexed.
-
- For names with more than one part, {lastname} x {non lastnames, non lastnames initial} variations.
- Additionally, it generates the swapped version of those, for supporting queries like ``Mele Salvatore`` which
- ``ParsedName`` parses as lastname: Salvatore and firstname: Mele. So in those cases, we need to generate both
- ``Mele, Salvatore`` and ``Mele, S``.
-
- Wherever, the '-' is replaced by ' ', it's done because it's the way the name variations are being index, thus
- we want our minimal name variations to be generated identically. This has to be done after the creation of
- ParsedName, otherwise the name is parsed differently. E.g. 'Caro-Estevez' as is, it's a lastname, if we replace
- the '-' with ' ', then it's a firstname and lastname.
+    Notes:
+        Unidecodes the name, so that we use its transliterated version,
+        since this is how the field is being indexed.
+
+        For names with more than one part, generates {lastname} x
+        {non lastnames, non lastnames initial} variations. Additionally,
+        it generates the swapped version of those, for supporting
+        queries like ``Mele Salvatore``, which ``ParsedName`` parses as
+        lastname: Salvatore and firstname: Mele. So in those cases, we
+        need to generate both ``Mele, Salvatore`` and ``Mele, S``.
+
+        Wherever the '-' is replaced by ' ', it's done because that's
+        the way the name variations are being indexed, thus we want our
+        minimal name variations to be generated identically. This has
+        to be done after the creation of ParsedName, otherwise the name
+        is parsed differently. E.g. 'Caro-Estevez' as is, is a
+        lastname; if we replace the '-' with ' ', then it's a firstname
+        and a lastname.
"""
parsed_name = ParsedName.loads(unidecode(author_name))
if len(parsed_name) > 1:
lastnames = parsed_name.last.replace('-', ' ')
- non_lastnames = ' '.join(
- parsed_name.first_list + parsed_name.suffix_list
- )
+ non_lastnames = ' '.join(parsed_name.first_list + parsed_name.suffix_list)
# Strip extra whitespace added if any of middle_list and suffix_list are empty.
non_lastnames = non_lastnames.strip().replace('-', ' ')
# Adding into a set first, so as to drop identical name variations.
- return list({
- name_variation.lower()
- for name_variation
- in [
- lastnames + ' ' + non_lastnames,
- lastnames + ' ' + non_lastnames[0],
- non_lastnames + ' ' + lastnames,
- non_lastnames + ' ' + lastnames[0],
- ]
- if not _name_variation_has_only_initials(name_variation)
- })
+ return list(
+ {
+ name_variation.lower()
+ for name_variation in [
+ lastnames + ' ' + non_lastnames,
+ lastnames + ' ' + non_lastnames[0],
+ non_lastnames + ' ' + lastnames,
+ non_lastnames + ' ' + lastnames[0],
+ ]
+ if not _name_variation_has_only_initials(name_variation)
+ }
+ )
else:
return [parsed_name.dumps().replace('-', ' ').lower()]
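A hedged sketch of the generator above; the exact set depends on how ``ParsedName`` splits the input, but per the docstring a two-part name yields lowercased {lastname} x {non-lastname} combinations plus their swaps, minus initials-only variations:

    from inspire_query_parser.utils.visitor_utils import (
        generate_minimal_name_variations,
    )

    variations = generate_minimal_name_variations(u'Mele Salvatore')
    # Expected to include variations such as 'mele salvatore' and
    # 'salvatore mele', plus the initial forms that survive the
    # initials-only filter.
    print(sorted(variations))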
@@ -142,7 +149,8 @@ def generate_minimal_name_variations(author_name):
# #### Date specifiers related utils ####
ANY_PREFIX_AND_A_NUMBER = re.compile('(.+)(\d+)')
-# ES query constants that provide rounding of dates on query time, according to the date "resolution" the user gave.
+# ES query constants that provide rounding of dates on query time, according to the
+# date "resolution" the user gave.
# More here: https://www.elastic.co/guide/en/elasticsearch/reference/6.1/common-options.html#date-math
ES_DATE_MATH_ROUNDING_YEAR = "||/y"
ES_DATE_MATH_ROUNDING_MONTH = "||/M"
@@ -152,7 +160,9 @@ def generate_minimal_name_variations(author_name):
def _compile_date_regexes(date_specifier_patterns):
date_specifier_regexes = {}
for date_specifier in date_specifier_patterns:
- date_specifier_regexes[date_specifier] = re.compile(date_specifier, re.IGNORECASE)
+ date_specifier_regexes[date_specifier] = re.compile(
+ date_specifier, re.IGNORECASE
+ )
return date_specifier_regexes
@@ -163,35 +173,40 @@ def _compile_date_regexes(date_specifier_patterns):
def register_date_conversion_handler(date_specifier_patterns):
"""Decorator for registering handlers that convert text dates to dates.
- Args:
- date_specifier_patterns (str): the date specifier (in regex pattern format) for which the handler is registered
+    Args:
+        date_specifier_patterns (str): the date specifier (in regex
+            pattern format) for which the handler is registered.
"""
def _decorator(func):
global DATE_SPECIFIERS_CONVERSION_HANDLERS
- DATE_SPECIFIERS_CONVERSION_HANDLERS[DATE_SPECIFIERS_REGEXES[date_specifier_patterns]] = func
+ DATE_SPECIFIERS_CONVERSION_HANDLERS[
+ DATE_SPECIFIERS_REGEXES[date_specifier_patterns]
+ ] = func
return func
return _decorator
DATE_SPECIFIERS_CONVERSION_HANDLERS = {}
-"""Mapping that depending on the date-specifier (key), returns the handler that converts the textual date to date."""
+"""Mapping that depending on the date-specifier (key), returns the handler that
+converts the textual date to date."""
def _extract_number_from_text(text):
number = 0 # fallback in case extracting the number fails
number_match = ANY_PREFIX_AND_A_NUMBER.match(text)
if number_match:
- try:
+ with contextlib.suppress(ValueError):
number = int(number_match.group(2))
- except ValueError:
- pass
return number
def _convert_date_to_string(start_date, relative_delta=None):
- return str(start_date - relative_delta) if relative_delta is not None else str(start_date)
+ return (
+ str(start_date - relative_delta)
+ if relative_delta is not None
+ else str(start_date)
+ )
@register_date_conversion_handler(DATE_TODAY_REGEX_PATTERN)
@@ -199,7 +214,8 @@ def convert_today_date_specifier(relative_date_specifier_suffix):
start_date = date.today()
relative_delta = (
relativedelta(days=_extract_number_from_text(relative_date_specifier_suffix))
- if relative_date_specifier_suffix else None
+ if relative_date_specifier_suffix
+ else None
)
return _convert_date_to_string(start_date, relative_delta)
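A sketch of the handler above; the output depends on the current date, and the suffix format is illustrative:

    from inspire_query_parser.utils.visitor_utils import (
        convert_today_date_specifier,
    )

    convert_today_date_specifier('')      # e.g. '2024-08-06' (today's date)
    convert_today_date_specifier('- 2')   # today minus two days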
@@ -210,7 +226,8 @@ def convert_yesterday_date_specifier(relative_date_specifier_suffix):
start_date = date.today() - relativedelta(days=1)
relative_delta = (
relativedelta(days=_extract_number_from_text(relative_date_specifier_suffix))
- if relative_date_specifier_suffix else None
+ if relative_date_specifier_suffix
+ else None
)
return _convert_date_to_string(start_date, relative_delta)
@@ -221,7 +238,8 @@ def convert_this_month_date(relative_date_specifier_suffix):
start_date = date.today()
relative_delta = (
relativedelta(months=_extract_number_from_text(relative_date_specifier_suffix))
- if relative_date_specifier_suffix else None
+ if relative_date_specifier_suffix
+ else None
)
return _convert_date_to_string(start_date, relative_delta)
@@ -232,7 +250,8 @@ def convert_last_month_date(relative_date_specifier_suffix):
start_date = date.today() - relativedelta(months=1)
relative_delta = (
relativedelta(months=_extract_number_from_text(relative_date_specifier_suffix))
- if relative_date_specifier_suffix else None
+ if relative_date_specifier_suffix
+ else None
)
return _convert_date_to_string(start_date, relative_delta)
@@ -244,22 +263,22 @@ def convert_last_month_date(relative_date_specifier_suffix):
"""Contains all the dates that contain always only a year date."""
ES_RANGE_EQ_OPERATOR = 'eq'
-"""Additional (internal to the parser) range operator, for handling date equality queries as ranges."""
+"""Additional (internal to the parser) range operator, for handling date
+equality queries as ranges."""
def _truncate_wildcard_from_date(date_value):
"""Truncate wildcard from date parts.
- Returns:
- (str) The truncated date.
+    Returns:
+        (str) The truncated date.
- Raises:
- ValueError, on either unsupported date separator (currently only ' ' and '-' are supported), or if there's a
- wildcard in the year.
+    Raises:
+        ValueError, on either an unsupported date separator (currently
+        only ' ' and '-' are supported), or if there's a wildcard in
+        the year.
- Notes:
- Either whole date part is wildcard, in which we ignore it and do a range query on the
- remaining parts, or some numbers are wildcards, where again, we ignore this part.
+    Notes:
+        Either the whole date part is a wildcard, in which case we
+        ignore it and do a range query on the remaining parts, or some
+        numbers are wildcards, where again, we ignore this part.
"""
if ' ' in date_value:
date_parts = date_value.split(' ')
@@ -278,17 +297,18 @@ def _truncate_wildcard_from_date(date_value):
def _truncate_date_value_according_on_date_field(field, date_value):
"""Truncates date value (to year only) according to the given date field.
- Args:
- field (unicode): The field for which the date value will be used to query on.
- date_value (str): The date value that is going to be truncated to its year.
+    Args:
+        field (unicode): The field for which the date value will be
+            used to query on.
+        date_value (str): The date value that is going to be truncated
+            to its year.
- Returns:
- PartialDate: The possibly truncated date, on success. None, otherwise.
+    Returns:
+        PartialDate: The possibly truncated date, on success. None,
+        otherwise.
- Notes:
- In case the fieldname is in `ES_MAPPING_HEP_DATE_ONLY_YEAR`, then the date is normalized and then only its year
- value is used. This is needed for ElasticSearch to be able to do comparisons on dates that have only year, which
- fails if being queried with a date with more .
+    Notes:
+        In case the fieldname is in `ES_MAPPING_HEP_DATE_ONLY_YEAR`,
+        the date is normalized and then only its year value is used.
+        This is needed for ElasticSearch to be able to do comparisons
+        on dates that have only a year, which fails if queried with a
+        date of more precision.
"""
try:
partial_date = PartialDate.parse(date_value)
@@ -306,11 +326,10 @@ def _truncate_date_value_according_on_date_field(field, date_value):
def _get_next_date_from_partial_date(partial_date):
"""Calculates the next date from the given partial date.
- Args:
- partial_date (inspire_utils.date.PartialDate): The partial date whose next date should be calculated.
+    Args:
+        partial_date (inspire_utils.date.PartialDate): The partial date
+            whose next date should be calculated.
- Returns:
- PartialDate: The next date from the given partial date.
+    Returns:
+        PartialDate: The next date from the given partial date.
"""
relativedelta_arg = 'years'
@@ -323,27 +342,29 @@ def _get_next_date_from_partial_date(partial_date):
return PartialDate.from_parts(
next_date.year,
next_date.month if partial_date.month else None,
- next_date.day if partial_date.day else None
+ next_date.day if partial_date.day else None,
)
def _get_proper_elastic_search_date_rounding_format(partial_date):
- """Returns the proper ES date math unit according to the "resolution" of the partial_date.
-
- Args:
- partial_date (PartialDate): The partial date for which the date math unit is.
-
- Returns:
- (str): The ES date math unit format.
-
- Notes:
- This is needed for supporting range queries on dates, i.e. rounding them up or down according to
- the ES range operator.
- For example, without this, a query like 'date > 2010-11', would return documents with date '2010-11-15', due to
- the date value of the query being interpreted by ES as '2010-11-01 01:00:00'. By using the suffixes for rounding
- up or down, the date value of the query is interpreted as '2010-11-30T23:59:59.999', thus not returning the
- document with date '2010-11-15', as the user would expect. See:
- https://www.elastic.co/guide/en/elasticsearch/reference/6.1/query-dsl-range-query.html#_date_math_and_rounding
+ """Returns the proper ES date math unit according to the "resolution" of
+ the partial_date.
+
+    Args:
+        partial_date (PartialDate): The partial date for which the date
+            math unit is.
+
+    Returns:
+        (str): The ES date math unit format.
+
+    Notes:
+        This is needed for supporting range queries on dates, i.e.
+        rounding them up or down according to the ES range operator.
+        For example, without this, a query like 'date > 2010-11' would
+        return documents with date '2010-11-15', due to the date value
+        of the query being interpreted by ES as '2010-11-01 01:00:00'.
+        By using the suffixes for rounding up or down, the date value
+        of the query is interpreted as '2010-11-30T23:59:59.999', thus
+        not returning the document with date '2010-11-15', as the user
+        would expect. See:
+        https://www.elastic.co/guide/en/elasticsearch/reference/6.1/query-dsl-range-query.html#_date_math_and_rounding
"""
es_date_math_unit = ES_DATE_MATH_ROUNDING_YEAR
@@ -355,17 +376,21 @@ def _get_proper_elastic_search_date_rounding_format(partial_date):
return es_date_math_unit
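A sketch of the rounding selection above, assuming ``PartialDate`` from inspire_utils behaves as used elsewhere in this diff:

    from inspire_utils.date import PartialDate
    from inspire_query_parser.utils.visitor_utils import (
        _get_proper_elastic_search_date_rounding_format,
    )

    # A year-only date rounds on the year; adding a month rounds on the month.
    _get_proper_elastic_search_date_rounding_format(
        PartialDate.from_parts(2010, None, None)
    )  # '||/y'
    _get_proper_elastic_search_date_rounding_format(
        PartialDate.from_parts(2010, 11, None)
    )  # '||/M'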
-def update_date_value_in_operator_value_pairs_for_fieldname(field, operator_value_pairs):
- """Updates (operator, date value) pairs by normalizing the date value according to the given field.
+def update_date_value_in_operator_value_pairs_for_fieldname(
+ field, operator_value_pairs
+):
+ """Updates (operator, date value) pairs by normalizing the date value
+ according to the given field.
- Args:
- field (unicode): The fieldname for which the operator-value pairs are being generated.
- operator_value_pairs (dict): ES range operator {'gt', 'gte', 'lt', 'lte'} along with a value.
- Additionally, if the operator is ``ES_RANGE_EQ_OPERATOR``, then it is indicated that the method should
- generate both a lower and an upper bound operator-value pairs, with the given date_value.
+    Args:
+        field (unicode): The fieldname for which the operator-value
+            pairs are being generated.
+        operator_value_pairs (dict): ES range operator {'gt', 'gte',
+            'lt', 'lte'} along with a value. Additionally, if the
+            operator is ``ES_RANGE_EQ_OPERATOR``, then it is indicated
+            that the method should generate both a lower and an upper
+            bound operator-value pair, with the given date_value.
- Notes:
- On a ``ValueError`` an empty operator_value_pairs is returned.
+    Notes:
+        On a ``ValueError``, an empty operator_value_pairs is returned.
"""
updated_operator_value_pairs = {}
for operator, value in operator_value_pairs.items():
@@ -374,15 +399,24 @@ def update_date_value_in_operator_value_pairs_for_fieldname(field, operator_valu
return {}
if operator == ES_RANGE_EQ_OPERATOR:
- updated_operator_value_pairs['gte'] = \
- modified_date.dumps() + _get_proper_elastic_search_date_rounding_format(modified_date)
+ updated_operator_value_pairs[
+ 'gte'
+ ] = modified_date.dumps() + _get_proper_elastic_search_date_rounding_format(
+ modified_date
+ )
next_date = _get_next_date_from_partial_date(modified_date)
- updated_operator_value_pairs['lt'] = \
- next_date.dumps() + _get_proper_elastic_search_date_rounding_format(next_date)
+ updated_operator_value_pairs[
+ 'lt'
+ ] = next_date.dumps() + _get_proper_elastic_search_date_rounding_format(
+ next_date
+ )
else:
- updated_operator_value_pairs[operator] = \
- modified_date.dumps() + _get_proper_elastic_search_date_rounding_format(modified_date)
+ updated_operator_value_pairs[
+ operator
+ ] = modified_date.dumps() + _get_proper_elastic_search_date_rounding_format(
+ modified_date
+ )
return updated_operator_value_pairs
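A hedged sketch of the equality-as-range expansion above; the fieldname is illustrative, and the exact bounds depend on how the date is normalized for that field:

    from inspire_query_parser.utils.visitor_utils import (
        ES_RANGE_EQ_OPERATOR,
        update_date_value_in_operator_value_pairs_for_fieldname,
    )

    pairs = update_date_value_in_operator_value_pairs_for_fieldname(
        u'earliest_date', {ES_RANGE_EQ_OPERATOR: u'2010-11'}
    )
    # The 'eq' pair is expanded into a rounded lower and upper bound,
    # roughly: {'gte': '2010-11||/M', 'lt': '2010-12||/M'}.
    print(pairs)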
@@ -391,22 +425,21 @@ def update_date_value_in_operator_value_pairs_for_fieldname(field, operator_valu
def generate_match_query(field, value, with_operator_and):
"""Helper for generating a match query.
- Args:
- field (six.text_type): The ES field to be queried.
- value (six.text_type/bool): The value of the query (bool for the case of type-code query ["core: true"]).
- with_operator_and (bool): Flag that signifies whether to generate the explicit notation of the query, along
- with '"operator": "and"', so that all tokens of the query value are required to match.
+    Args:
+        field (six.text_type): The ES field to be queried.
+        value (six.text_type/bool): The value of the query (bool for
+            the case of the type-code query ["core: true"]).
+        with_operator_and (bool): Flag that signifies whether to
+            generate the explicit notation of the query, along with
+            '"operator": "and"', so that all tokens of the query value
+            are required to match.
- Notes:
- If value is of instance bool, then the shortened version of the match query is generated, at all times.
+    Notes:
+        If value is an instance of bool, then the shortened version of
+        the match query is generated, at all times.
"""
parsed_value = None
- try:
+    # Catch all possible exceptions;
+    # we are not interested in them if they appear.
+ with contextlib.suppress(ValueError, TypeError, AttributeError):
parsed_value = json.loads(value.lower())
- except (ValueError, TypeError, AttributeError):
- # Catch all possible exceptions
- # we are not interested if they will appear
- pass
if isinstance(value, bool):
return {'match': {field: value}}
@@ -414,14 +447,7 @@ def generate_match_query(field, value, with_operator_and):
return {'match': {field: value.lower()}}
if with_operator_and:
- return {
- 'match': {
- field: {
- 'query': value,
- 'operator': 'and'
- }
- }
- }
+ return {'match': {field: {'query': value, 'operator': 'and'}}}
return {'match': {field: value}}
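The three output shapes of the helper above, written out as a sketch (field names are illustrative):

    from inspire_query_parser.utils.visitor_utils import generate_match_query

    generate_match_query('abstract', u'gravity', with_operator_and=False)
    # -> {'match': {'abstract': u'gravity'}}

    generate_match_query('abstract', u'gravitational waves', with_operator_and=True)
    # -> {'match': {'abstract': {'query': u'gravitational waves',
    #                            'operator': 'and'}}}

    # Boolean-looking values always take the shortened form, lowercased:
    generate_match_query('core', u'True', with_operator_and=True)
    # -> {'match': {'core': u'true'}}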
@@ -429,35 +455,32 @@ def generate_match_query(field, value, with_operator_and):
def generate_nested_query(path, queries):
"""Generates nested query.
- Returns:
- (dict): The nested query if queries is not falsy, otherwise an empty dict.
+    Returns:
+        (dict): The nested query if queries is not falsy, otherwise an
+        empty dict.
"""
if not queries:
return {}
- return {
- 'nested': {
- 'path': path,
- 'query': queries
- }
- }
+ return {'nested': {'path': path, 'query': queries}}
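A short sketch of the nested wrapper above, combined with the match helper (the path and field are illustrative):

    from inspire_query_parser.utils.visitor_utils import (
        generate_match_query,
        generate_nested_query,
    )

    inner = generate_match_query('authors.full_name', u'Smith', with_operator_and=False)
    generate_nested_query('authors', inner)
    # -> {'nested': {'path': 'authors',
    #                'query': {'match': {'authors.full_name': u'Smith'}}}}

    generate_nested_query('authors', {})  # -> {} (falsy queries short-circuit)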
-def wrap_queries_in_bool_clauses_if_more_than_one(queries,
- use_must_clause,
- preserve_bool_semantics_if_one_clause=False):
+def wrap_queries_in_bool_clauses_if_more_than_one(
+ queries, use_must_clause, preserve_bool_semantics_if_one_clause=False
+):
"""Helper for wrapping a list of queries into a bool.{must, should} clause.
- Args:
- queries (list): List of queries to be wrapped in a bool.{must, should} clause.
- use_must_clause (bool): Flag that signifies whether to use 'must' or 'should' clause.
- preserve_bool_semantics_if_one_clause (bool): Flag that signifies whether to generate a bool query even if
- there's only one clause. This happens to generate boolean query semantics. Usually not the case, but
- useful for boolean queries support.
-
- Returns:
- (dict): If len(queries) > 1, the bool clause, otherwise if len(queries) == 1, will return the query itself,
- while finally, if len(queries) == 0, then an empty dictionary is returned.
+    Args:
+        queries (list): List of queries to be wrapped in a
+            bool.{must, should} clause.
+        use_must_clause (bool): Flag that signifies whether to use a
+            'must' or a 'should' clause.
+        preserve_bool_semantics_if_one_clause (bool): Flag that
+            signifies whether to generate a bool query even if there's
+            only one clause. This happens to generate boolean query
+            semantics. Usually not the case, but useful for boolean
+            queries support.
+
+    Returns:
+        (dict): If len(queries) > 1, the bool clause; if
+        len(queries) == 1, the query itself; and finally, if
+        len(queries) == 0, an empty dictionary.
"""
if not queries:
return {}
@@ -467,22 +490,16 @@ def wrap_queries_in_bool_clauses_if_more_than_one(queries,
if len(queries) == 1 and not preserve_bool_semantics_if_one_clause:
return queries[0]
- return {
- 'bool': {
- ('must' if use_must_clause else 'should'): queries
- }
- }
+ return {'bool': {('must' if use_must_clause else 'should'): queries}}
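A sketch of the wrapping behaviour above for the three list sizes:

    from inspire_query_parser.utils.visitor_utils import (
        wrap_queries_in_bool_clauses_if_more_than_one,
    )

    q1 = {'match': {'title': u'neutrino'}}
    q2 = {'match': {'title': u'mass'}}

    wrap_queries_in_bool_clauses_if_more_than_one([q1, q2], use_must_clause=True)
    # -> {'bool': {'must': [q1, q2]}}

    wrap_queries_in_bool_clauses_if_more_than_one([q1], use_must_clause=True)
    # -> q1 (a single query is returned as-is)

    wrap_queries_in_bool_clauses_if_more_than_one(
        [q1], use_must_clause=False, preserve_bool_semantics_if_one_clause=True
    )
    # -> {'bool': {'should': [q1]}}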
def wrap_query_in_nested_if_field_is_nested(query, field, nested_fields):
- """Helper for wrapping a query into a nested if the fields within the query are nested
-
- Args:
- query : The query to be wrapped.
- field : The field that is being queried.
- nested_fields : List of fields which are nested.
- Returns:
- (dict): The nested query
+ """Helper for wrapping a query into a nested if the fields within the query
+ are nested.
+
+ Args: query : The query to be wrapped. field : The field
+ that is being queried. nested_fields : List of fields which are
+ nested. Returns: (dict): The nested query
"""
if not field:
return query
@@ -500,12 +517,13 @@ def wrap_query_in_nested_if_field_is_nested(query, field, nested_fields):
def escape_query_string_special_characters(value):
- """
- Helper to escape reserved characters in query_string query.
+ """Helper to escape reserved characters in query_string query.
+
According to the documentation, failing to escape these special
characters correctly could lead to a syntax error which prevents
your query from running.
"""
- value = re.sub(QUERY_STRING_QUERY_SPECIAL_CHARACTERS,
- lambda char: "\\" + char.group(), value)
+ value = re.sub(
+ QUERY_STRING_QUERY_SPECIAL_CHARACTERS, lambda char: "\\" + char.group(), value
+ )
return value
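A sketch of the escaping helper above; per the regex, every reserved character is prefixed with a backslash:

    from inspire_query_parser.utils.visitor_utils import (
        escape_query_string_special_characters,
    )

    escape_query_string_special_characters(u'title:(g-2)')
    # -> u'title\\:\\(g\\-2\\)'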
diff --git a/inspire_query_parser/visitors/elastic_search_visitor.py b/inspire_query_parser/visitors/elastic_search_visitor.py
index 3f6cf42..cedc6b0 100644
--- a/inspire_query_parser/visitors/elastic_search_visitor.py
+++ b/inspire_query_parser/visitors/elastic_search_visitor.py
@@ -19,24 +19,22 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
-
-"""
-This module encapsulates the ElasticSearch visitor logic, that receives the output of the parser and restructuring
-visitor and converts it to an ElasticSearch query.
-"""
+"""This module encapsulates the ElasticSearch visitor logic, that receives the
+output of the parser and restructuring visitor and converts it to an
+ElasticSearch query."""
from __future__ import absolute_import, unicode_literals
import logging
-from pypeg2 import whitespace
import re
-import six
from unicodedata import normalize
+import six
from inspire_schemas.utils import convert_old_publication_info_to_new
from inspire_utils.helpers import force_list
-from inspire_utils.name import normalize_name, ParsedName
+from inspire_utils.name import ParsedName, normalize_name
from inspire_utils.query import wrap_queries_in_bool_clauses_if_more_than_one
+from pypeg2 import whitespace
from inspire_query_parser import ast
from inspire_query_parser.config import (
@@ -47,11 +45,11 @@
ES_RANGE_EQ_OPERATOR,
_truncate_date_value_according_on_date_field,
_truncate_wildcard_from_date,
+ escape_query_string_special_characters,
generate_match_query,
generate_nested_query,
update_date_value_in_operator_value_pairs_for_fieldname,
wrap_query_in_nested_if_field_is_nested,
- escape_query_string_special_characters
)
from inspire_query_parser.visitors.visitor_impl import Visitor
@@ -65,9 +63,11 @@ class FieldVariations(object):
class ElasticSearchVisitor(Visitor):
"""Converts a parse tree to an ElasticSearch query.
- Notes:
- The ElasticSearch query follows the 2.4 version DSL specification.
+
+    Notes:
+        The ElasticSearch query follows the 2.4 version DSL
+        specification.
"""
+
# ##### Configuration #####
# ## Journal queries ##
JOURNAL_FIELDS_PREFIX = 'publication_info'
@@ -78,7 +78,9 @@ class ElasticSearchVisitor(Visitor):
JOURNAL_ART_ID = 'artid'
JOURNAL_YEAR = 'year'
JOURNAL_FIELDS_MAPPING = {
- JOURNAL_TITLE: '.'.join((JOURNAL_FIELDS_PREFIX, JOURNAL_TITLE_FOR_OLD_PUBLICATION_INFO)),
+ JOURNAL_TITLE: '.'.join(
+ (JOURNAL_FIELDS_PREFIX, JOURNAL_TITLE_FOR_OLD_PUBLICATION_INFO)
+ ),
JOURNAL_VOLUME: '.'.join((JOURNAL_FIELDS_PREFIX, JOURNAL_VOLUME)),
JOURNAL_PAGE_START: '.'.join((JOURNAL_FIELDS_PREFIX, JOURNAL_PAGE_START)),
JOURNAL_ART_ID: '.'.join((JOURNAL_FIELDS_PREFIX, JOURNAL_ART_ID)),
@@ -115,9 +117,7 @@ class ElasticSearchVisitor(Visitor):
'eprint': 'arxiv_eprints.value.raw',
'exact-author': 'authors.full_name_unicode_normalized',
'irn': 'external_system_identifiers.value.raw',
- 'journal': [
- *JOURNAL_FIELDS_MAPPING.values()
- ],
+ 'journal': [*JOURNAL_FIELDS_MAPPING.values()],
'keyword': 'keywords.value',
'refersto': 'references.record.$ref',
'reportnumber': 'report_numbers.value.fuzzy',
@@ -136,13 +136,14 @@ class ElasticSearchVisitor(Visitor):
'fulltext': 'documents.attachment.content',
'citedby': {
'path': 'references.record.$ref.raw',
- 'search_path': 'self.$ref.raw'
- }
+ 'search_path': 'self.$ref.raw',
+ },
}
"""Mapping from keywords to ElasticSearch fields.
- Note:
- If a keyword should query multiple fields, then it's value in the mapping should be a list. This will generate
- a ``multi_match`` query. Otherwise a ``match`` query is generated.
+
+    Note:
+        If a keyword should query multiple fields, then its value in
+        the mapping should be a list. This will generate a
+        ``multi_match`` query. Otherwise a ``match`` query is generated.
"""
TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING = {
'b': ('document_type', 'book'),
@@ -164,8 +165,9 @@ class ElasticSearchVisitor(Visitor):
'proceedings': ('document_type', 'proceedings'),
}
"""Mapping from type-code query values to field and value pairs.
- Note:
- These are going to be used for querying (instead of the given value).
+
+    Note:
+        These are going to be used for querying (instead of the given
+        value).
"""
AUTHORS_NAME_VARIATIONS_FIELD = 'authors.name_variations'
@@ -187,68 +189,91 @@ class ElasticSearchVisitor(Visitor):
# #### Helpers ####
def _get_author_or_first_author_keyword_from_fieldnames(self, fieldnames=None):
- """Returns author or first_author keywords if their fields are part of the fieldnames. Defaults to author"""
- return 'first_author' if fieldnames and self.KEYWORD_TO_ES_FIELDNAME['first_author'] in fieldnames else 'author'
+ """Returns author or first_author keywords if their fields are part of
+ the fieldnames.
+
+        Defaults to author.
+ """
+ return (
+ 'first_author'
+ if fieldnames and self.KEYWORD_TO_ES_FIELDNAME['first_author'] in fieldnames
+ else 'author'
+ )
def _generate_nested_author_query(self, query, fieldnames=None):
- """Generates nested query with path for authors or first_author"""
- nested_path = self.FIRST_AUTHOR_NESTED_QUERY_PATH \
- if fieldnames and self.KEYWORD_TO_ES_FIELDNAME['first_author'] in fieldnames \
+ """Generates nested query with path for authors or first_author."""
+ nested_path = (
+ self.FIRST_AUTHOR_NESTED_QUERY_PATH
+ if fieldnames and self.KEYWORD_TO_ES_FIELDNAME['first_author'] in fieldnames
else self.AUTHORS_NESTED_QUERY_PATH
+ )
return generate_nested_query(nested_path, query)
def _are_fieldnames_author_or_first_author(self, fieldnames):
if isinstance(fieldnames, list):
- return self.KEYWORD_TO_ES_FIELDNAME['author'] in fieldnames or self.KEYWORD_TO_ES_FIELDNAME[
- 'first_author'] in fieldnames
- return self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames or self.KEYWORD_TO_ES_FIELDNAME[
- 'first_author'] == fieldnames
+ return (
+ self.KEYWORD_TO_ES_FIELDNAME['author'] in fieldnames
+ or self.KEYWORD_TO_ES_FIELDNAME['first_author'] in fieldnames
+ )
+ return (
+ self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldnames
+ or self.KEYWORD_TO_ES_FIELDNAME['first_author'] == fieldnames
+ )
- def _generate_fieldnames_if_bai_query(self, fieldnames, node_value, bai_field_variation,
- query_bai_field_if_dots_in_name):
+ def _generate_fieldnames_if_bai_query(
+ self,
+ fieldnames,
+ node_value,
+ bai_field_variation,
+ query_bai_field_if_dots_in_name,
+ ):
"""Generates new fieldnames in case of BAI query.
- Args:
- fieldnames : names of the fields of the node.
- node_value (six.text_type): The node's value (i.e. author name).
- bai_field_variation (six.text_type): Which field variation to query ('search' or 'raw').
- query_bai_field_if_dots_in_name (bool): Whether to query BAI field (in addition to author's name field)
- if dots exist in the name and name contains no whitespace.
- Returns:
- list: Fieldnames to query on, in case of BAI query or None, otherwise.
- Raises:
- ValueError, if ``field_variation`` is not one of ('search', 'raw').
+
+        Args:
+            fieldnames: names of the fields of the node.
+            node_value (six.text_type): The node's value (i.e. author
+                name).
+            bai_field_variation (six.text_type): Which field variation
+                to query ('search' or 'raw').
+            query_bai_field_if_dots_in_name (bool): Whether to query
+                the BAI field (in addition to the author's name field)
+                if dots exist in the name and the name contains no
+                whitespace.
+
+        Returns:
+            list: Fieldnames to query on, in case of a BAI query, or
+            None otherwise.
+
+        Raises:
+            ValueError, if ``field_variation`` is not one of
+            ('search', 'raw').
"""
if bai_field_variation not in (FieldVariations.search, FieldVariations.raw):
- raise ValueError('Non supported field variation "{}".'.format(bai_field_variation))
+ raise ValueError(
+ 'Non supported field variation "{}".'.format(bai_field_variation)
+ )
keyword = self._get_author_or_first_author_keyword_from_fieldnames(fieldnames)
normalized_author_name = normalize_name(node_value).strip('.')
bai_fieldname = self.KEYWORD_TO_ES_FIELDNAME['{}_bai'.format(keyword)]
- if self.KEYWORD_TO_ES_FIELDNAME[keyword] and \
- self.BAI_REGEX.match(node_value):
+ if self.KEYWORD_TO_ES_FIELDNAME[keyword] and self.BAI_REGEX.match(node_value):
return [bai_fieldname + '.' + bai_field_variation]
- elif not whitespace.search(normalized_author_name) and \
- query_bai_field_if_dots_in_name and \
- self.KEYWORD_TO_ES_FIELDNAME[keyword] and \
- '.' in normalized_author_name:
+ elif (
+ not whitespace.search(normalized_author_name)
+ and query_bai_field_if_dots_in_name
+ and self.KEYWORD_TO_ES_FIELDNAME[keyword]
+ and '.' in normalized_author_name
+ ):
# Case of partial BAI, e.g. ``J.Smith``.
- return [bai_fieldname + '.' + bai_field_variation] + \
- force_list(self.KEYWORD_TO_ES_FIELDNAME[keyword])
+ return [bai_fieldname + '.' + bai_field_variation] + force_list(
+ self.KEYWORD_TO_ES_FIELDNAME[keyword]
+ )
return None
def _generate_author_query(self, fieldnames, author_name):
"""Generates a query handling specifically authors.
- Notes:
- There are three main cases:
- 1) ``a Smith``
- This will just generate a ``match`` query on ``last_name``
- 2) ``a John Smith``
- This will just generate a ``match`` query on ``last_name`` and a ``prefix`` query on ``first_name``
- and a ``match`` query on the initial ``J``. This will return results from ``Smith, John`` and ``Smith, J``
- but not from ``Smith, Jane``.
- 3) ``a J Smith``
- This will just generate a ``match`` query on ``last_name`` and a match query on ``first_name.initials``.
- Please note, cases such as ``J.D.`` have been properly handled by the tokenizer.
+
+        Notes:
+            There are three main cases:
+
+            1) ``a Smith``
+               This will just generate a ``match`` query on
+               ``last_name``.
+            2) ``a John Smith``
+               This will generate a ``match`` query on ``last_name``, a
+               ``prefix`` query on ``first_name`` and a ``match`` query
+               on the initial ``J``. This will return results from
+               ``Smith, John`` and ``Smith, J``, but not from
+               ``Smith, Jane``.
+            3) ``a J Smith``
+               This will generate a ``match`` query on ``last_name``
+               and a ``match`` query on ``first_name.initials``.
+
+            Please note, cases such as ``J.D.`` have been properly
+            handled by the tokenizer.
"""
parsed_name = ParsedName(author_name)
keyword = self._get_author_or_first_author_keyword_from_fieldnames(fieldnames)
@@ -258,35 +283,37 @@ def _generate_author_query(self, fieldnames, author_name):
def _generate_exact_author_query(self, author_name_or_bai):
"""Generates a term query handling authors and BAIs.
- Notes:
- If given value is a BAI, search for the provided value in the raw field variation of
- `self.AUTHORS_BAI_FIELD`.
- Otherwise, the value will be procesed in the same way as the indexed value (i.e. lowercased and normalized
- (inspire_utils.normalize_name and then NFKC normalization).
- E.g. Searching for 'Smith, J.' is the same as searching for: 'Smith, J', 'smith, j.', 'smith j', 'j smith',
- 'j. smith', 'J Smith', 'J. Smith'.
+
+        Notes:
+            If the given value is a BAI, search for the provided value
+            in the raw field variation of `self.AUTHORS_BAI_FIELD`.
+            Otherwise, the value will be processed in the same way as
+            the indexed value (i.e. lowercased and normalized with
+            inspire_utils.normalize_name and then NFKC normalization).
+            E.g. searching for 'Smith, J.' is the same as searching
+            for: 'Smith, J', 'smith, j.', 'smith j', 'j smith',
+            'j. smith', 'J Smith', 'J. Smith'.
"""
if self.BAI_REGEX.match(author_name_or_bai):
bai = author_name_or_bai.lower()
query = self._generate_term_query(
- '.'.join((self.AUTHORS_BAI_FIELD, FieldVariations.search)),
- bai
+ '.'.join((self.AUTHORS_BAI_FIELD, FieldVariations.search)), bai
)
else:
author_name = normalize('NFKC', normalize_name(author_name_or_bai)).lower()
query = self._generate_term_query(
- self.KEYWORD_TO_ES_FIELDNAME['exact-author'],
- author_name
+ self.KEYWORD_TO_ES_FIELDNAME['exact-author'], author_name
)
return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)
def _generate_date_with_wildcard_query(self, date_value):
"""Helper for generating a date keyword query containing a wildcard.
- Returns:
- (dict): The date query containing the wildcard or an empty dict in case the date value is malformed.
- The policy followed here is quite conservative on what it accepts as valid input. Look into
- :meth:`inspire_query_parser.utils.visitor_utils._truncate_wildcard_from_date` for more information.
+
+        Returns:
+            (dict): The date query containing the wildcard, or an empty
+            dict in case the date value is malformed. The policy
+            followed here is quite conservative on what it accepts as
+            valid input. Look into
+            :meth:`inspire_query_parser.utils.visitor_utils._truncate_wildcard_from_date`
+            for more information.
"""
if date_value.endswith(ast.GenericValue.WILDCARD_TOKEN):
try:
@@ -295,38 +322,46 @@ def _generate_date_with_wildcard_query(self, date_value):
# Drop date query.
return {}
- return self._generate_range_queries(self.KEYWORD_TO_ES_FIELDNAME['date'],
- {ES_RANGE_EQ_OPERATOR: date_value})
+ return self._generate_range_queries(
+ self.KEYWORD_TO_ES_FIELDNAME['date'], {ES_RANGE_EQ_OPERATOR: date_value}
+ )
else:
# Drop date query with wildcard not as suffix, e.g. 2000-1*-31
return {}
def _generate_queries_for_title_symbols(self, title_field, query_value):
- """Generate queries for any symbols in the title against the whitespace tokenized field of titles.
- Returns:
- (dict): The query or queries for the whitespace tokenized field of titles. If none such tokens exist, then
- returns an empty dict.
- Notes:
- Splits the value stream into tokens according to whitespace.
- Heuristically identifies the ones that contain symbol-indicating-characters (examples of those tokens are
- "g-2", "SU(2)").
+ """Generate queries for any symbols in the title against the whitespace
+ tokenized field of titles.
+
+        Returns:
+            (dict): The query or queries for the whitespace-tokenized
+            field of titles. If no such tokens exist, an empty dict is
+            returned.
+
+        Notes:
+            Splits the value stream into tokens according to
+            whitespace. Heuristically identifies the ones that contain
+            symbol-indicating characters (examples of those tokens are
+            "g-2", "SU(2)").
"""
values_tokenized_by_whitespace = query_value.split()
symbol_queries = []
for value in values_tokenized_by_whitespace:
- # Heuristic: If there's a symbol-indicating-character in the value, it signifies terms that should be
+ # Heuristic: If there's a symbol-indicating-character in the value,
+ # it signifies terms that should be
# queried against the whitespace-tokenized title.
- if any(character in value for character in self.TITLE_SYMBOL_INDICATING_CHARACTER):
+ if any(
+ character in value
+ for character in self.TITLE_SYMBOL_INDICATING_CHARACTER
+ ):
symbol_queries.append(
generate_match_query(
'.'.join([title_field, FieldVariations.search]),
value,
- with_operator_and=False
+ with_operator_and=False,
)
)
- return wrap_queries_in_bool_clauses_if_more_than_one(symbol_queries, use_must_clause=True)
+ return wrap_queries_in_bool_clauses_if_more_than_one(
+ symbol_queries, use_must_clause=True
+ )
def _generate_title_queries(self, value):
title_field = self.KEYWORD_TO_ES_FIELDNAME['title']
@@ -335,18 +370,22 @@ def _generate_title_queries(self, value):
symbol_queries = self._generate_queries_for_title_symbols(title_field, value)
return wrap_queries_in_bool_clauses_if_more_than_one(
[element for element in (q, symbol_queries) if element],
- use_must_clause=True
+ use_must_clause=True,
)
def _generate_type_code_query(self, value):
"""Generate type-code queries.
- Notes:
- If the value of the type-code query exists in `TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING, then we
- query the specified field, along with the given value according to the mapping.
- See: https://github.com/inspirehep/inspire-query-parser/issues/79
- Otherwise, we query both ``document_type`` and ``publication_info``.
+
+        Notes:
+            If the value of the type-code query exists in
+            ``TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING``, then
+            we query the specified field, along with the given value
+            according to the mapping. See:
+            https://github.com/inspirehep/inspire-query-parser/issues/79
+            Otherwise, we query both ``document_type`` and
+            ``publication_type``.
"""
- mapping_for_value = self.TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING.get(value.lower(), None)
+ mapping_for_value = self.TYPECODE_VALUE_TO_FIELD_AND_VALUE_PAIRS_MAPPING.get(
+ value.lower(), None
+ )
if mapping_for_value:
return generate_match_query(*mapping_for_value, with_operator_and=True)
@@ -355,9 +394,13 @@ def _generate_type_code_query(self, value):
'bool': {
'minimum_should_match': 1,
'should': [
- generate_match_query('document_type', value, with_operator_and=True),
- generate_match_query('publication_type', value, with_operator_and=True),
- ]
+ generate_match_query(
+ 'document_type', value, with_operator_and=True
+ ),
+ generate_match_query(
+ 'publication_type', value, with_operator_and=True
+ ),
+ ],
}
}
@@ -367,7 +410,9 @@ def _generate_query_string_query(self, value, fieldnames, analyze_wildcard):
field_specifier, field_specifier_value = 'default_field', '_all'
else:
field_specifier = 'fields'
- field_specifier_value = fieldnames if isinstance(fieldnames, list) else [fieldnames]
+ field_specifier_value = (
+ fieldnames if isinstance(fieldnames, list) else [fieldnames]
+ )
# Prefix queries can only be used on keyword, text and wildcard
# fields, so in journal '*' searches, fields with type date need to be removed
if 'publication_info.year' in field_specifier_value:
@@ -376,7 +421,7 @@ def _generate_query_string_query(self, value, fieldnames, analyze_wildcard):
'query_string': {
'query': escape_query_string_special_characters(value),
field_specifier: field_specifier_value,
- 'default_operator': "AND"
+ 'default_operator': "AND",
}
}
if analyze_wildcard:
@@ -387,20 +432,9 @@ def _generate_query_string_query(self, value, fieldnames, analyze_wildcard):
# TODO Move it to visitor utils and write tests for it.
def _generate_term_query(self, fieldname, value, boost=None):
if not boost:
- return {
- 'term': {
- fieldname: value
- }
- }
+ return {'term': {fieldname: value}}
- return {
- 'term': {
- fieldname: {
- 'value': value,
- 'boost': boost
- }
- }
- }
+ return {'term': {fieldname: {'value': value, 'boost': boost}}}
def _generate_boolean_query(self, node):
condition_a = node.left.accept(self)
@@ -410,42 +444,48 @@ def _generate_boolean_query(self, node):
return wrap_queries_in_bool_clauses_if_more_than_one(
bool_body,
use_must_clause=isinstance(node, ast.AndOp),
- preserve_bool_semantics_if_one_clause=True
+ preserve_bool_semantics_if_one_clause=True,
)
def _generate_range_queries(self, fieldnames, operator_value_pairs):
"""Generates ElasticSearch range queries.
- Args:
- fieldnames (list): The fieldnames on which the search is the range query is targeted on,
- operator_value_pairs (dict): Contains (range_operator, value) pairs.
- The range_operator should be one of those supported by ElasticSearch (e.g. 'gt', 'lt', 'ge', 'le').
- The value should be of type int or string.
- Notes:
- A bool should query with multiple range sub-queries is generated so that even if one of the multiple fields
- is missing from a document, ElasticSearch will be able to match some records.
- In the case of a 'date' keyword query, it updates date values after normalizing them by using
- :meth:`inspire_query_parser.utils.visitor_utils.update_date_value_in_operator_value_pairs_for_fieldname`.
- Additionally, in the aforementioned case, if a malformed date has been given, then the the method will
- return an empty dictionary.
+
+        Args:
+            fieldnames (list): The fieldnames on which the range query
+                is targeted.
+            operator_value_pairs (dict): Contains (range_operator,
+                value) pairs. The range_operator should be one of those
+                supported by ElasticSearch (e.g. 'gt', 'lt', 'ge',
+                'le'). The value should be of type int or string.
+
+        Notes:
+            A bool should query with multiple range sub-queries is
+            generated, so that even if one of the multiple fields is
+            missing from a document, ElasticSearch will be able to
+            match some records. In the case of a 'date' keyword query,
+            it updates date values after normalizing them by using
+            :meth:`inspire_query_parser.utils.visitor_utils.update_date_value_in_operator_value_pairs_for_fieldname`.
+            Additionally, in the aforementioned case, if a malformed
+            date has been given, then the method will return an empty
+            dictionary.
"""
if self.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames or all(
- field in [self.KEYWORD_TO_ES_FIELDNAME['date-added'],
- self.KEYWORD_TO_ES_FIELDNAME['date-updated'],
- self.KEYWORD_TO_ES_FIELDNAME['date-earliest']] for field in fieldnames
+ field
+ in [
+ self.KEYWORD_TO_ES_FIELDNAME['date-added'],
+ self.KEYWORD_TO_ES_FIELDNAME['date-updated'],
+ self.KEYWORD_TO_ES_FIELDNAME['date-earliest'],
+ ]
+ for field in fieldnames
):
range_queries = []
for fieldname in fieldnames:
- updated_operator_value_pairs = update_date_value_in_operator_value_pairs_for_fieldname(
- fieldname, operator_value_pairs
+ updated_operator_value_pairs = (
+ update_date_value_in_operator_value_pairs_for_fieldname(
+ fieldname, operator_value_pairs
+ )
)
if not updated_operator_value_pairs:
break # Malformed date
else:
- range_query = {
- 'range': {
- fieldname: updated_operator_value_pairs
- }
- }
+ range_query = {'range': {fieldname: updated_operator_value_pairs}}
range_queries.append(
generate_nested_query(self.DATE_NESTED_QUERY_PATH, range_query)
@@ -453,22 +493,30 @@ def _generate_range_queries(self, fieldnames, operator_value_pairs):
else range_query
)
elif 'publication_info.year' in fieldnames:
- range_queries = [generate_nested_query(self.DATE_NESTED_QUERY_PATH,
- {'range': {fieldname: operator_value_pairs}})
- for fieldname in fieldnames]
+ range_queries = [
+ generate_nested_query(
+ self.DATE_NESTED_QUERY_PATH,
+ {'range': {fieldname: operator_value_pairs}},
+ )
+ for fieldname in fieldnames
+ ]
else:
- range_queries = [{'range': {fieldname: operator_value_pairs}}
- for fieldname in fieldnames]
+ range_queries = [
+ {'range': {fieldname: operator_value_pairs}} for fieldname in fieldnames
+ ]
- return wrap_queries_in_bool_clauses_if_more_than_one(range_queries, use_must_clause=False)
+ return wrap_queries_in_bool_clauses_if_more_than_one(
+ range_queries, use_must_clause=False
+ )
@staticmethod
def _generate_malformed_query(data):
"""Generates a query on the ``_all`` field with all the query content.
- Args:
- data (six.text_type or list): The query in the format of ``six.text_type`` (when used from parsing driver)
- or ``list`` when used from withing the ES visitor.
+
+        Args:
+            data (six.text_type or list): The query in the format of
+                ``six.text_type`` (when used from the parsing driver)
+                or ``list`` (when used from within the ES visitor).
"""
if isinstance(data, six.text_type):
# Remove colon character (special character for ES)
@@ -476,73 +524,77 @@ def _generate_malformed_query(data):
else:
query_str = ' '.join([word.strip(':') for word in data.children])
- return {
- 'simple_query_string': {
- 'fields': ['_all'],
- 'query': query_str
- }
- }
+ return {'simple_query_string': {'fields': ['_all'], 'query': query_str}}
- def _preprocess_journal_query_value(self, third_journal_field, old_publication_info_values):
- """Transforms the given journal query value (old publication info) to the new one.
- Args:
- third_journal_field (six.text_type): The final field to be used for populating the old publication info.
- old_publication_info_values (six.text_type): The old publication info. It must be one of {only title, title
- & volume, title & volume & artid/page_start}.
- Returns:
- (dict) The new publication info.
+ def _preprocess_journal_query_value(
+ self, third_journal_field, old_publication_info_values
+ ):
+ """Transforms the given journal query value (old publication info) to
+ the new one.
+
+        Args:
+            third_journal_field (six.text_type): The final field to be
+                used for populating the old publication info.
+            old_publication_info_values (six.text_type): The old
+                publication info. It must be one of {only title,
+                title & volume, title & volume & artid/page_start}.
+
+        Returns:
+            (dict) The new publication info.
"""
- # Prepare old publication info for :meth:`inspire_schemas.utils.convert_old_publication_info_to_new`.
+ # Prepare old publication info for
+ # :meth:`inspire_schemas.utils.convert_old_publication_info_to_new`.
publication_info_keys = [
self.JOURNAL_TITLE_FOR_OLD_PUBLICATION_INFO,
self.JOURNAL_VOLUME,
- third_journal_field
+ third_journal_field,
]
values_list = [
- value.strip()
- for value
- in old_publication_info_values.split(',')
- if value
+ value.strip() for value in old_publication_info_values.split(',') if value
]
old_publication_info = [
{
key: value
- for key, value
- in zip(publication_info_keys, values_list)
+ for key, value in zip(publication_info_keys, values_list)
if value
}
]
- # We are always assuming that the returned list will not be empty. In the situation of a journal query with no
+ # We are always assuming that the returned list will not be empty.
+ # In the situation of a journal query with no
# value, a malformed query will be generated instead.
- new_publication_info = convert_old_publication_info_to_new(old_publication_info)[0]
+ new_publication_info = convert_old_publication_info_to_new(
+ old_publication_info
+ )[0]
return new_publication_info
def _generate_journal_queries(self, value):
"""Generates ElasticSearch nested query(s).
- Args:
- value (string): Contains the journal_title, journal_volume and artid or start_page separated by a comma.
- This value should be of type string.
- Notes:
- The value contains at least one of the 3 mentioned items, in this order and at most 3.
- The 3rd is either the artid or the page_start and it will query the corresponding ES field for this item.
- The values are then split on comma and stripped of spaces before being saved in a values list in order to
- be assigned to corresponding fields.
+
+        Args:
+            value (string): Contains the journal_title, journal_volume
+                and artid or start_page, separated by a comma. This
+                value should be of type string.
+
+        Notes:
+            The value contains at least one of the 3 mentioned items,
+            in this order, and at most 3. The 3rd is either the artid
+            or the page_start, and it will query the corresponding ES
+            field for this item. The values are then split on comma
+            and stripped of spaces before being saved in a values list,
+            in order to be assigned to the corresponding fields.
"""
# Abstract away which is the third field, we care only for its existence.
third_journal_field = self.JOURNAL_PAGE_START
- new_publication_info = self._preprocess_journal_query_value(third_journal_field, value)
+ new_publication_info = self._preprocess_journal_query_value(
+ third_journal_field, value
+ )
- # We always expect a journal title, otherwise query would be considered malformed, and thus this method would
+ # We always expect a journal title, otherwise the query would
+ # be considered malformed, and thus this method would
# not have been called.
journal_title_query = generate_match_query(
- self.JOURNAL_TITLE,
- new_publication_info[self.JOURNAL_TITLE_FOR_OLD_PUBLICATION_INFO],
- with_operator_and=False
- )
+ self.JOURNAL_TITLE,
+ new_publication_info[self.JOURNAL_TITLE_FOR_OLD_PUBLICATION_INFO],
+ with_operator_and=False,
+ )
queries_for_each_field = []
if self.JOURNAL_VOLUME in new_publication_info:
@@ -550,7 +602,7 @@ def _generate_journal_queries(self, value):
generate_match_query(
self.JOURNAL_FIELDS_MAPPING[self.JOURNAL_VOLUME],
new_publication_info[self.JOURNAL_VOLUME],
- with_operator_and=False
+ with_operator_and=False,
)
)
@@ -559,7 +611,7 @@ def _generate_journal_queries(self, value):
generate_match_query(
self.JOURNAL_FIELDS_MAPPING[self.JOURNAL_YEAR],
new_publication_info[self.JOURNAL_YEAR],
- with_operator_and=False
+ with_operator_and=False,
)
)
@@ -569,33 +621,32 @@ def _generate_journal_queries(self, value):
generate_match_query(
self.JOURNAL_FIELDS_MAPPING[third_field],
artid_or_page_start,
- with_operator_and=False
+ with_operator_and=False,
)
- for third_field
- in (self.JOURNAL_PAGE_START, self.JOURNAL_ART_ID)
+ for third_field in (self.JOURNAL_PAGE_START, self.JOURNAL_ART_ID)
]
queries_for_each_field.append(
- wrap_queries_in_bool_clauses_if_more_than_one(match_queries, use_must_clause=False)
+ wrap_queries_in_bool_clauses_if_more_than_one(
+ match_queries, use_must_clause=False
+ )
)
nested_query = generate_nested_query(
self.JOURNAL_FIELDS_PREFIX,
- wrap_queries_in_bool_clauses_if_more_than_one(queries_for_each_field, use_must_clause=True)
+ wrap_queries_in_bool_clauses_if_more_than_one(
+ queries_for_each_field, use_must_clause=True
+ ),
)
journal_queries = [journal_title_query, nested_query]
- return wrap_queries_in_bool_clauses_if_more_than_one(journal_queries, use_must_clause=True)
+ return wrap_queries_in_bool_clauses_if_more_than_one(
+ journal_queries, use_must_clause=True
+ )
def _generate_terms_lookup(self, path, search_path, value):
return {
- "terms": {
- search_path : {
- "index" : "records-hep",
- "id" : value,
- "path" : path
- }
- }
+ "terms": {search_path: {"index": "records-hep", "id": value, "path": path}}
}
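For reference, the exact document this helper emits can be seen in the tests
further down in this patch; for a citedby lookup it has the shape:

    # terms lookup for `citedby` with record id "123456" (from the tests below)
    citedby_lookup = {
        "terms": {
            "self.$ref.raw": {
                "index": "records-hep",
                "id": "123456",
                "path": "references.record.$ref.raw",
            }
        }
    }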
# ################
@@ -626,11 +677,7 @@ def visit_query_with_malformed_part(self, node):
return query
def visit_not_op(self, node):
- return {
- 'bool': {
- 'must_not': [node.op.accept(self)]
- }
- }
+ return {'bool': {'must_not': [node.op.accept(self)]}}
def visit_and_op(self, node):
return self._generate_boolean_query(node)
@@ -639,25 +686,37 @@ def visit_or_op(self, node):
return self._generate_boolean_query(node)
def visit_keyword_op(self, node):
- # For this visitor, the decision on which type of ElasticSearch query to generate, relies mainly on the leaves.
- # Thus, the fieldname is propagated to them, so that they generate query type, depending on their type.
+ # For this visitor, the decision on which type of ElasticSearch
+ # query to generate relies mainly on the leaves.
+ # Thus, the fieldname is propagated to them, so that they
+ # generate the query type depending on their type.
fieldname = node.left.accept(self)
return node.right.accept(self, fieldname)
def visit_range_op(self, node, fieldnames):
- return self._generate_range_queries(force_list(fieldnames), {'gte': node.left.value, 'lte': node.right.value})
+ return self._generate_range_queries(
+ force_list(fieldnames), {'gte': node.left.value, 'lte': node.right.value}
+ )
def visit_greater_than_op(self, node, fieldnames):
- return self._generate_range_queries(force_list(fieldnames), {'gt': node.op.value})
+ return self._generate_range_queries(
+ force_list(fieldnames), {'gt': node.op.value}
+ )
def visit_greater_equal_than_op(self, node, fieldnames):
- return self._generate_range_queries(force_list(fieldnames), {'gte': node.op.value})
+ return self._generate_range_queries(
+ force_list(fieldnames), {'gte': node.op.value}
+ )
def visit_less_than_op(self, node, fieldnames):
- return self._generate_range_queries(force_list(fieldnames), {'lt': node.op.value})
+ return self._generate_range_queries(
+ force_list(fieldnames), {'lt': node.op.value}
+ )
def visit_less_equal_than_op(self, node, fieldnames):
- return self._generate_range_queries(force_list(fieldnames), {'lte': node.op.value})
+ return self._generate_range_queries(
+ force_list(fieldnames), {'lte': node.op.value}
+ )
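All five operator visitors above funnel into _generate_range_queries with an
operator dict; a minimal sketch of the resulting ES body, with the field name
assumed for illustration:

    # "2000->2005" via visit_range_op passes {'gte': '2000', 'lte': '2005'};
    # visit_greater_than_op passes {'gt': value}, and so on. For a single
    # hypothetical date field this becomes:
    query = {'range': {'earliest_date': {'gte': '2000', 'lte': '2005'}}}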
def visit_nested_keyword_op(self, node): # TODO Cannot be completed as of yet.
# FIXME: quick and dirty implementation of refersto:recid:
@@ -668,42 +727,43 @@ def visit_nested_keyword_op(self, node): # TODO Cannot be completed as of yet.
return self._generate_terms_lookup(
self.KEYWORD_TO_ES_FIELDNAME['citedby']['path'],
self.KEYWORD_TO_ES_FIELDNAME['citedby']['search_path'],
- record_id
+ record_id,
)
- if node.left.value == 'refersto':
- if right.left.value == 'control_number':
- recid = right.right.value
- citing_records_query = generate_match_query(
- self.KEYWORD_TO_ES_FIELDNAME['refersto'],
- recid,
- with_operator_and=False
- )
- records_with_collection_literature_query = generate_match_query(
- '_collections',
- 'Literature',
- with_operator_and=False
- )
- superseded_records_query = generate_match_query(
- self.RECORD_RELATION_FIELD,
- 'successor',
- with_operator_and=False
- )
- self_citation = generate_match_query(
- "control_number",
- recid,
- with_operator_and=False
- )
- return {
- 'bool': {
- 'must': [citing_records_query, records_with_collection_literature_query],
- 'must_not': [superseded_records_query, self_citation]
- }
+ if node.left.value == 'refersto' and right.left.value == 'control_number':
+ recid = right.right.value
+ citing_records_query = generate_match_query(
+ self.KEYWORD_TO_ES_FIELDNAME['refersto'],
+ recid,
+ with_operator_and=False,
+ )
+ records_with_collection_literature_query = generate_match_query(
+ '_collections', 'Literature', with_operator_and=False
+ )
+ superseded_records_query = generate_match_query(
+ self.RECORD_RELATION_FIELD, 'successor', with_operator_and=False
+ )
+ self_citation = generate_match_query(
+ "control_number", recid, with_operator_and=False
+ )
+ return {
+ 'bool': {
+ 'must': [
+ citing_records_query,
+ records_with_collection_literature_query,
+ ],
+ 'must_not': [superseded_records_query, self_citation],
}
+ }
if right.left.value == 'author':
- return generate_match_query("referenced_authors_bais", right.right.value, with_operator_and=False)
+ return generate_match_query(
+ "referenced_authors_bais",
+ right.right.value,
+ with_operator_and=False,
+ )
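A sketch of the final query the refersto/control_number branch above emits for
`refersto:recid:123` (the two placeholder variables stand in for class
constants not shown in this hunk; only `_collections` and `control_number` are
literal):

    refersto_field = '...'   # placeholder for KEYWORD_TO_ES_FIELDNAME['refersto']
    relation_field = '...'   # placeholder for self.RECORD_RELATION_FIELD
    query = {
        'bool': {
            'must': [
                {'match': {refersto_field: '123'}},
                {'match': {'_collections': 'Literature'}},
            ],
            'must_not': [
                {'match': {relation_field: 'successor'}},
                {'match': {'control_number': '123'}},
            ],
        }
    }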
def visit_keyword(self, node):
- # If no keyword is found, return the original node value (case of an unknown keyword).
+ # If no keyword is found, return the original node value
+ # (case of an unknown keyword).
return self.KEYWORD_TO_ES_FIELDNAME.get(node.value, node.value)
def handle_value_wildcard(self, node, fieldnames=None):
@@ -714,35 +774,36 @@ def handle_value_wildcard(self, node, fieldnames=None):
fieldnames,
node.value,
bai_field_variation=FieldVariations.search,
- query_bai_field_if_dots_in_name=True
+ query_bai_field_if_dots_in_name=True,
)
query = self._generate_query_string_query(
node.value,
fieldnames=bai_fieldnames or fieldnames,
- analyze_wildcard=True
+ analyze_wildcard=True,
)
return self._generate_nested_author_query(query, fieldnames)
query = self._generate_query_string_query(
- node.value,
- fieldnames=fieldnames,
- analyze_wildcard=True
+ node.value, fieldnames=fieldnames, analyze_wildcard=True
+ )
+ return wrap_query_in_nested_if_field_is_nested(
+ query, fieldnames, self.NESTED_FIELDS
)
- return wrap_query_in_nested_if_field_is_nested(query, fieldnames, self.NESTED_FIELDS)
def handle_author_query(self, node, fieldnames=None):
bai_fieldnames = self._generate_fieldnames_if_bai_query(
fieldnames,
node.value,
bai_field_variation=FieldVariations.search,
- query_bai_field_if_dots_in_name=True
+ query_bai_field_if_dots_in_name=True,
)
if bai_fieldnames:
if len(bai_fieldnames) == 1:
query = {"match": {bai_fieldnames[0]: node.value}}
return self._generate_nested_author_query(query, fieldnames)
- # Not an exact BAI pattern match, but node's value looks like BAI (no spaces and dots),
- # e.g. `S.Mele`. In this case generate a partial match query.
+ # Not an exact BAI pattern match, but the node's value looks
+ # like a BAI (contains dots and no spaces), e.g. `S.Mele`. In
+ # this case generate a partial match query.
return self.visit_partial_match_value(node, bai_fieldnames)
return self._generate_author_query(fieldnames, node.value)
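Dispatch sketch for the three author paths above (example values hypothetical;
the tests below exercise all three with `a S.Mele`, `a gava,e.`, `a S. Mele`):

    # 'A.Einstein.1' -> exact BAI: {'match': {<bai field>: 'A.Einstein.1'}}
    #                   wrapped in the nested authors query
    # 'S.Mele'       -> BAI-looking: visit_partial_match_value(...)
    # 'gava,e.'      -> neither: self._generate_author_query(...)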
@@ -753,11 +814,18 @@ def visit_value(self, node, fieldnames=None):
if node.contains_wildcard:
return self.handle_value_wildcard(node, fieldnames=fieldnames)
- if fieldnames in [self.KEYWORD_TO_ES_FIELDNAME['date'], self.KEYWORD_TO_ES_FIELDNAME['date-added'],
- self.KEYWORD_TO_ES_FIELDNAME['date-updated'], self.KEYWORD_TO_ES_FIELDNAME['date-earliest']]:
- # Date queries with simple values are transformed into range queries, among the given and the exact
+ if fieldnames in [
+ self.KEYWORD_TO_ES_FIELDNAME['date'],
+ self.KEYWORD_TO_ES_FIELDNAME['date-added'],
+ self.KEYWORD_TO_ES_FIELDNAME['date-updated'],
+ self.KEYWORD_TO_ES_FIELDNAME['date-earliest'],
+ ]:
+ # Date queries with simple values are transformed into range queries,
+ # among the given and the exact
# next date, according to the granularity of the given date.
- return self._generate_range_queries(force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value})
+ return self._generate_range_queries(
+ force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value}
+ )
if isinstance(fieldnames, list):
if self.KEYWORD_TO_ES_FIELDNAME['journal'] == fieldnames:
return self._generate_journal_queries(node.value)
@@ -765,14 +833,17 @@ def visit_value(self, node, fieldnames=None):
if self.KEYWORD_TO_ES_FIELDNAME['affiliation-id'] == fieldnames:
match_queries = [
wrap_query_in_nested_if_field_is_nested(
- generate_match_query(field, node.value, with_operator_and=False),
+ generate_match_query(
+ field, node.value, with_operator_and=False
+ ),
field,
self.NESTED_FIELDS,
)
for field in fieldnames
]
return wrap_queries_in_bool_clauses_if_more_than_one(
- match_queries, use_must_clause=False)
+ match_queries, use_must_clause=False
+ )
return {
'multi_match': {
@@ -800,34 +871,46 @@ def visit_value(self, node, fieldnames=None):
query = generate_match_query(
self.KEYWORD_TO_ES_FIELDNAME['affiliation'],
node.value,
- with_operator_and=True
+ with_operator_and=True,
)
return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)
elif self.KEYWORD_TO_ES_FIELDNAME['eprint'] == fieldnames:
- return generate_match_query(fieldnames, re.sub('ar[xX]iv:', "", node.value), with_operator_and=True)
+ return generate_match_query(
+ fieldnames,
+ re.sub('ar[xX]iv:', "", node.value),
+ with_operator_and=True,
+ )
elif self.KEYWORD_TO_ES_FIELDNAME['texkey'] == fieldnames:
- return generate_match_query('texkeys.raw', node.value, with_operator_and=False)
+ return generate_match_query(
+ 'texkeys.raw', node.value, with_operator_and=False
+ )
elif fieldnames not in self.KEYWORD_TO_ES_FIELDNAME.values():
colon_value = ':'.join([fieldnames, node.value])
- given_field_query = generate_match_query(fieldnames, node.value, with_operator_and=True)
+ given_field_query = generate_match_query(
+ fieldnames, node.value, with_operator_and=True
+ )
if self.TEXKEY_REGEX.match(colon_value):
- return generate_match_query('texkeys.raw', colon_value, with_operator_and=False)
- _all_field_query = generate_match_query('_all', colon_value, with_operator_and=True)
+ return generate_match_query(
+ 'texkeys.raw', colon_value, with_operator_and=False
+ )
+ _all_field_query = generate_match_query(
+ '_all', colon_value, with_operator_and=True
+ )
query = wrap_queries_in_bool_clauses_if_more_than_one(
- [given_field_query, _all_field_query], use_must_clause=False)
- return wrap_query_in_nested_if_field_is_nested(query, fieldnames, self.NESTED_FIELDS)
+ [given_field_query, _all_field_query], use_must_clause=False
+ )
+ return wrap_query_in_nested_if_field_is_nested(
+ query, fieldnames, self.NESTED_FIELDS
+ )
return generate_match_query(fieldnames, node.value, with_operator_and=True)
def visit_exact_match_value(self, node, fieldnames=None):
"""Generates a term query (exact search in ElasticSearch)."""
- if not fieldnames:
- fieldnames = ['_all']
- else:
- fieldnames = force_list(fieldnames)
+ fieldnames = ['_all'] if not fieldnames else force_list(fieldnames)
if self.KEYWORD_TO_ES_FIELDNAME['exact-author'] == fieldnames[0]:
return self._generate_exact_author_query(node.value)
@@ -842,14 +925,19 @@ def visit_exact_match_value(self, node, fieldnames=None):
fieldnames,
node.value,
bai_field_variation=FieldVariations.raw,
- query_bai_field_if_dots_in_name=False
+ query_bai_field_if_dots_in_name=False,
)
if self.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames:
exact_match_queries = []
for field in fieldnames:
- term_query = \
- {'term': {field: _truncate_date_value_according_on_date_field(field, node.value).dumps()}}
+ term_query = {
+ 'term': {
+ field: _truncate_date_value_according_on_date_field(
+ field, node.value
+ ).dumps()
+ }
+ }
exact_match_queries.append(
generate_nested_query(self.DATE_NESTED_QUERY_PATH, term_query)
@@ -858,25 +946,40 @@ def visit_exact_match_value(self, node, fieldnames=None):
)
elif self._are_fieldnames_author_or_first_author(fieldnames):
exact_match_queries = [
- self._generate_nested_author_query({'match_phrase': {field: node.value}}, fieldnames)
+ self._generate_nested_author_query(
+ {'match_phrase': {field: node.value}}, fieldnames
+ )
for field in (bai_fieldnames or fieldnames)
]
else:
- exact_match_queries = [{'match_phrase': {field: node.value}} for field in (bai_fieldnames or fieldnames)]
- query = wrap_queries_in_bool_clauses_if_more_than_one(exact_match_queries, use_must_clause=False)
- return wrap_query_in_nested_if_field_is_nested(query, fieldnames[0], self.NESTED_FIELDS)
+ exact_match_queries = [
+ {'match_phrase': {field: node.value}}
+ for field in (bai_fieldnames or fieldnames)
+ ]
+ query = wrap_queries_in_bool_clauses_if_more_than_one(
+ exact_match_queries, use_must_clause=False
+ )
+ return wrap_query_in_nested_if_field_is_nested(
+ query, fieldnames[0], self.NESTED_FIELDS
+ )
- return wrap_queries_in_bool_clauses_if_more_than_one(exact_match_queries, use_must_clause=False)
+ return wrap_queries_in_bool_clauses_if_more_than_one(
+ exact_match_queries, use_must_clause=False
+ )
def visit_partial_match_value(self, node, fieldnames=None):
- """Generates a query which looks for a substring of the node's value in the given fieldname."""
+ """Generates a query which looks for a substring of the node's value in
+ the given fieldname."""
if self.KEYWORD_TO_ES_FIELDNAME['date'] == fieldnames:
- # Date queries with partial values are transformed into range queries, among the given and the exact
+ # Date queries with partial values are transformed into range queries,
+ # among the given and the exact
# next date, according to the granularity of the given date.
if node.contains_wildcard:
return self._generate_date_with_wildcard_query(node.value)
- return self._generate_range_queries(force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value})
+ return self._generate_range_queries(
+ force_list(fieldnames), {ES_RANGE_EQ_OPERATOR: node.value}
+ )
if self.KEYWORD_TO_ES_FIELDNAME['exact-author'] == fieldnames:
return self._generate_exact_author_query(node.value)
@@ -888,32 +991,37 @@ def visit_partial_match_value(self, node, fieldnames=None):
return self._generate_journal_queries(node.value)
# Add wildcard token as prefix and suffix.
- value = \
- ('' if node.value.startswith(ast.GenericValue.WILDCARD_TOKEN) else '*') + \
- node.value + \
- ('' if node.value.endswith(ast.GenericValue.WILDCARD_TOKEN) else '*')
+ value = (
+ ('' if node.value.startswith(ast.GenericValue.WILDCARD_TOKEN) else '*')
+ + node.value
+ + ('' if node.value.endswith(ast.GenericValue.WILDCARD_TOKEN) else '*')
+ )
if self._are_fieldnames_author_or_first_author(fieldnames):
bai_fieldnames = self._generate_fieldnames_if_bai_query(
fieldnames,
node.value,
bai_field_variation=FieldVariations.search,
- query_bai_field_if_dots_in_name=True
+ query_bai_field_if_dots_in_name=True,
+ )
+ query = self._generate_query_string_query(
+ value, fieldnames=bai_fieldnames or fieldnames, analyze_wildcard=True
)
- query = self._generate_query_string_query(value, fieldnames=bai_fieldnames or fieldnames, analyze_wildcard=True)
return self._generate_nested_author_query(query, fieldnames)
- query = self._generate_query_string_query(value, fieldnames, analyze_wildcard=True)
- return wrap_query_in_nested_if_field_is_nested(query, fieldnames, self.NESTED_FIELDS)
+ query = self._generate_query_string_query(
+ value, fieldnames, analyze_wildcard=True
+ )
+ return wrap_query_in_nested_if_field_is_nested(
+ query, fieldnames, self.NESTED_FIELDS
+ )
def visit_regex_value(self, node, fieldname="_all"):
- query = {
- 'regexp': {
- fieldname: node.value
- }
- }
+ query = {'regexp': {fieldname: node.value}}
if self.KEYWORD_TO_ES_FIELDNAME['author'] == fieldname:
return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)
- return wrap_query_in_nested_if_field_is_nested(query, fieldname, self.NESTED_FIELDS)
+ return wrap_query_in_nested_if_field_is_nested(
+ query, fieldname, self.NESTED_FIELDS
+ )
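For completeness, a sketch of what visit_regex_value yields for an author
regex, following the nested shape seen in the tests below (the author field
name is an assumption):

    # `author /^mele/` (illustrative)
    regex_query = {
        'nested': {
            'path': 'authors',
            'query': {'regexp': {'authors.full_name': '^mele'}},
        }
    }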
diff --git a/inspire_query_parser/visitors/restructuring_visitor.py b/inspire_query_parser/visitors/restructuring_visitor.py
index bdb2225..d0effc2 100644
--- a/inspire_query_parser/visitors/restructuring_visitor.py
+++ b/inspire_query_parser/visitors/restructuring_visitor.py
@@ -19,10 +19,9 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
-
-"""
-This module encapsulates the restructuring visitor logic, that receives the output of the parser and converts it to a
-more compact and restructured parse tree.
+"""This module encapsulates the restructuring visitor logic, that receives the
+output of the parser and converts it to a more compact and restructured parse
+tree.
Additionally, the date specifier conversion handlers logic is defined.
"""
@@ -32,46 +31,64 @@
import logging
from inspire_query_parser import ast
-from inspire_query_parser.ast import (AndOp, ExactMatchValue, Keyword,
- KeywordOp, NotOp, OrOp,
- PartialMatchValue,
- QueryWithMalformedPart, RegexValue, ValueOp)
-from inspire_query_parser.parser import (And, ComplexValue,
- SimpleValueBooleanQuery)
-from inspire_query_parser.utils.visitor_utils import \
- DATE_SPECIFIERS_CONVERSION_HANDLERS
+from inspire_query_parser.ast import (
+ AndOp,
+ ExactMatchValue,
+ Keyword,
+ KeywordOp,
+ NotOp,
+ OrOp,
+ PartialMatchValue,
+ QueryWithMalformedPart,
+ RegexValue,
+ ValueOp,
+)
+from inspire_query_parser.parser import And, ComplexValue, SimpleValueBooleanQuery
+from inspire_query_parser.utils.visitor_utils import DATE_SPECIFIERS_CONVERSION_HANDLERS
from inspire_query_parser.visitors.visitor_impl import Visitor
logger = logging.getLogger(__name__)
def _restructure_if_volume_follows_journal(left, right):
- """Remove volume node if it follows a journal logically in the tree hierarchy.
+ """Remove volume node if it follows a journal logically in the tree
+ hierarchy.
- Args:
- left (ast.ASTElement): The journal KeywordOp node.
- right (ast.ASTElement): The rest of the tree to be restructured.
+ Args:
+ left (ast.ASTElement): The journal KeywordOp node.
+ right (ast.ASTElement): The rest of the tree to be restructured.
- Return:
- (ast.ASTElement): The restructured tree, with the volume node removed.
+ Return:
+ (ast.ASTElement): The restructured tree, with the volume node
+ removed.
- Notes:
- This happens to support queries like "journal Phys.Rev. and vol d85". Appends the value of KeywordOp with
- Keyword 'volume' and discards 'volume' KeywordOp node from the tree.
+ Notes:
+ This happens to support queries like "journal Phys.Rev. and
+ vol d85". Appends the value of the KeywordOp with Keyword
+ 'volume' and discards the 'volume' KeywordOp node from the tree.
"""
+
def _get_volume_keyword_op_and_remaining_subtree(right_subtree):
- if isinstance(right_subtree, NotOp) and isinstance(right_subtree.op, KeywordOp) \
- and right_subtree.op.left == Keyword('volume'):
+ if (
+ isinstance(right_subtree, NotOp)
+ and isinstance(right_subtree.op, KeywordOp)
+ and right_subtree.op.left == Keyword('volume')
+ ):
return None, None
- elif isinstance(right_subtree, AndOp) and isinstance(right_subtree.left, NotOp) \
- and isinstance(right_subtree.left.op, KeywordOp) and right_subtree.left.op.left == Keyword('volume'):
+ elif (
+ isinstance(right_subtree, AndOp)
+ and isinstance(right_subtree.left, NotOp)
+ and isinstance(right_subtree.left.op, KeywordOp)
+ and right_subtree.left.op.left == Keyword('volume')
+ ):
return None, right_subtree.right
- elif isinstance(right_subtree, KeywordOp) and right_subtree.left == Keyword('volume'):
+ elif isinstance(right_subtree, KeywordOp) and right_subtree.left == Keyword(
+ 'volume'
+ ):
return right_subtree, None
- elif isinstance(right_subtree, AndOp) and right_subtree.left.left == Keyword('volume'):
+ elif isinstance(right_subtree, AndOp) and right_subtree.left.left == Keyword(
+ 'volume'
+ ):
return right_subtree.left, right_subtree.right
journal_value = left.right.value
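A sketch of the restructuring this helper enables, using the AST classes
imported above (the joined value form is an assumption for illustration):

    # "journal Phys.Rev. and vol d85" arrives as
    before = AndOp(
        KeywordOp(Keyword('journal'), ast.Value('Phys.Rev.')),
        KeywordOp(Keyword('volume'), ast.Value('d85')),
    )
    # and leaves as a single journal query with the volume folded in:
    after = KeywordOp(Keyword('journal'), ast.Value('Phys.Rev.,d85'))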
@@ -88,14 +105,21 @@ def _get_volume_keyword_op_and_remaining_subtree(right_subtree):
def _convert_simple_value_boolean_query_to_and_boolean_queries(tree, keyword):
- """Chain SimpleValueBooleanQuery values into chained AndOp queries with the given current Keyword."""
+ """Chain SimpleValueBooleanQuery values into chained AndOp queries with the
+ given current Keyword."""
def _create_operator_node(value_node):
"""Creates a KeywordOp or a ValueOp node."""
base_node = value_node.op if isinstance(value_node, NotOp) else value_node
- updated_base_node = KeywordOp(keyword, base_node) if keyword else ValueOp(base_node)
+ updated_base_node = (
+ KeywordOp(keyword, base_node) if keyword else ValueOp(base_node)
+ )
- return NotOp(updated_base_node) if isinstance(value_node, NotOp) else updated_base_node
+ return (
+ NotOp(updated_base_node)
+ if isinstance(value_node, NotOp)
+ else updated_base_node
+ )
def _get_bool_op_type(bool_op):
return AndOp if isinstance(bool_op, And) else OrOp
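Chaining sketch for a boolean of simple values paired with a keyword (e.g.
`title foo and bar`): each value gets its own operator node via
_create_operator_node, negations keep their NotOp wrapper, and the boolean
type follows _get_bool_op_type:

    # illustrative result
    chained = AndOp(
        KeywordOp(Keyword('title'), ast.Value('foo')),
        KeywordOp(Keyword('title'), ast.Value('bar')),
    )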
@@ -119,11 +143,12 @@ def _get_bool_op_type(bool_op):
class RestructuringVisitor(Visitor):
- """Converts the output of the parser to a more compact and restructured parse tree.
+ """Converts the output of the parser to a more compact and restructured
+ parse tree.
- Notes:
- Compaction, as in removing intermediate nodes, such as Statement, Expression, etc. and restructure, as in,
- breaking down a :class:`SimpleValueBooleanQuery` to chained boolean queries.
+ Notes:
+ Compaction, as in removing intermediate nodes such as Statement,
+ Expression, etc., and restructuring, as in breaking down a
+ :class:`SimpleValueBooleanQuery` to chained boolean queries.
"""
def _create_not_op(self, node):
@@ -134,9 +159,10 @@ def visit_query(self, node):
if len(result) == 1:
result = result[0]
- if isinstance(result, (ast.Value, ast.ExactMatchValue)) \
- or isinstance(result, ast.PartialMatchValue) \
- or isinstance(result, ast.RegexValue):
+ if isinstance(
+ result,
+ (ast.Value, ast.ExactMatchValue, ast.PartialMatchValue, ast.RegexValue),
+ ):
# The only Values that can be standalone queries are the above.
return ast.ValueOp(result)
else:
@@ -162,26 +188,34 @@ def visit_boolean_query(self, node):
left = node.left.accept(self)
right = node.right.accept(self)
- is_journal_keyword_op = isinstance(left, KeywordOp) and left.left == Keyword('journal')
+ is_journal_keyword_op = isinstance(left, KeywordOp) and left.left == Keyword(
+ 'journal'
+ )
if is_journal_keyword_op:
- journal_and_volume_conjunction = _restructure_if_volume_follows_journal(left, right)
+ journal_and_volume_conjunction = _restructure_if_volume_follows_journal(
+ left, right
+ )
if journal_and_volume_conjunction:
return journal_and_volume_conjunction
- return AndOp(left, right) if isinstance(node.bool_op, And) else OrOp(left, right)
+ return (
+ AndOp(left, right) if isinstance(node.bool_op, And) else OrOp(left, right)
+ )
def visit_simple_value_boolean_query(self, node):
- """
- Visits only the children of :class:`SimpleValueBooleanQuery` without substituting the actual node type.
-
- Notes:
- Defer conversion from :class:`SimpleValueBooleanQuery` to AndOp or OrOp.
- This transformation needs to occur higher in the tree, so that we don't lose the information that this is a
- boolean query among terminals and thus the associative rule needs to be applied if we reached here from a
- keyword query, or a conversion from :class:`SimpleValueBooleanQuery` to :class:`AndOp` or :class:`OrOp`,
- otherwise.
+ """Visits only the children of :class:`SimpleValueBooleanQuery` without
+ substituting the actual node type.
+
+ Notes:
+ Defer conversion from :class:`SimpleValueBooleanQuery` to AndOp
+ or OrOp. This transformation needs to occur higher in the tree,
+ so that we don't lose the information that this is a boolean
+ query among terminals, and thus the associative rule needs to
+ be applied if we reached here from a keyword query, or a
+ conversion from :class:`SimpleValueBooleanQuery` to
+ :class:`AndOp` or :class:`OrOp`, otherwise.
"""
node.left, node.right = node.left.accept(self), node.right.accept(self)
return node
@@ -192,10 +226,14 @@ def visit_simple_value_negation(self, node):
def visit_simple_query(self, node):
node = node.op.accept(self)
if isinstance(node, SimpleValueBooleanQuery):
- # Case in which the node is a simple value boolean query not paired with a keyword query. e.g. 'foo and bar'
- return _convert_simple_value_boolean_query_to_and_boolean_queries(node, None)
+ # Case in which the node is a simple value boolean query not
+ # paired with a keyword query. e.g. 'foo and bar'
+ return _convert_simple_value_boolean_query_to_and_boolean_queries(
+ node, None
+ )
elif isinstance(node, ast.Value):
- # Case in which the node is a SimpleQuery(Value(...)) e.g. for a value query "Ellis"
+ # Case in which the node is a SimpleQuery(Value(...)) e.g. for
+ # a value query "Ellis"
return ast.ValueOp(node)
return node
@@ -206,45 +244,55 @@ def visit_not_query(self, node):
def visit_spires_keyword_query(self, node):
"""Transform a :class:`SpiresKeywordQuery` into a :class:`KeywordOp`.
- Notes:
- In case the value being a :class:`SimpleValueBooleanQuery`, the subtree is transformed to chained
- :class:`AndOp` queries containing :class:`KeywordOp`, whose keyword is the keyword of the current node and
- values, all the :class:`SimpleValueBooleanQuery` values (either :class:`SimpleValues` or
- :class:`SimpleValueNegation`.)
+ Notes:
+ In case the value is a :class:`SimpleValueBooleanQuery`, the
+ subtree is transformed to chained :class:`AndOp` queries
+ containing :class:`KeywordOp`, whose keyword is the keyword of
+ the current node and whose values are all the
+ :class:`SimpleValueBooleanQuery` values (either
+ :class:`SimpleValues` or :class:`SimpleValueNegation`).
"""
keyword = node.left.accept(self)
value = node.right.accept(self)
if isinstance(value, SimpleValueBooleanQuery):
- return _convert_simple_value_boolean_query_to_and_boolean_queries(value, keyword)
+ return _convert_simple_value_boolean_query_to_and_boolean_queries(
+ value, keyword
+ )
return KeywordOp(keyword, value)
def visit_spires_date_keyword_query(self, node):
"""Transform a :class:`SpiresKeywordQuery` into a :class:`KeywordOp`.
- Notes:
- In case the value being a :class:`SimpleValueBooleanQuery`, the subtree is transformed to chained
- :class:`AndOp` queries containing :class:`KeywordOp`, whose keyword is the keyword of the current node and
- values, all the :class:`SimpleValueBooleanQuery` values (either :class:`SimpleValues` or
- :class:`SimpleValueNegation`.)
+ Notes:
+ In case the value is a :class:`SimpleValueBooleanQuery`, the
+ subtree is transformed to chained :class:`AndOp` queries
+ containing :class:`KeywordOp`, whose keyword is the keyword of
+ the current node and whose values are all the
+ :class:`SimpleValueBooleanQuery` values (either
+ :class:`SimpleValues` or :class:`SimpleValueNegation`).
"""
keyword = node.left.accept(self)
value = node.right.accept(self)
if isinstance(value, SimpleValueBooleanQuery):
- return _convert_simple_value_boolean_query_to_and_boolean_queries(value, keyword)
+ return _convert_simple_value_boolean_query_to_and_boolean_queries(
+ value, keyword
+ )
return KeywordOp(keyword, value)
def visit_invenio_keyword_query(self, node):
"""Transform an :class:`InvenioKeywordQuery` into a :class:`KeywordOp`.
- Notes:
- In case the value being a :class:`SimpleValueBooleanQuery`, the subtree is transformed to chained
- :class:`AndOp` queries containing :class:`KeywordOp`, whose keyword is the keyword of the current node and
- values, all the :class:`SimpleValueBooleanQuery` values (either :class:`SimpleValues` or
- :class:`SimpleValueNegation`.)
+ Notes:
+ In case the value is a :class:`SimpleValueBooleanQuery`, the
+ subtree is transformed to chained :class:`AndOp` queries
+ containing :class:`KeywordOp`, whose keyword is the keyword of
+ the current node and whose values are all the
+ :class:`SimpleValueBooleanQuery` values (either
+ :class:`SimpleValues` or :class:`SimpleValueNegation`).
"""
try:
keyword = node.left.accept(self)
@@ -255,7 +303,9 @@ def visit_invenio_keyword_query(self, node):
value = node.right.accept(self)
if isinstance(value, SimpleValueBooleanQuery):
- return _convert_simple_value_boolean_query_to_and_boolean_queries(value, keyword)
+ return _convert_simple_value_boolean_query_to_and_boolean_queries(
+ value, keyword
+ )
return KeywordOp(keyword, value)
@@ -274,7 +324,8 @@ def visit_greater_than_op(self, node):
def visit_greater_equal_op(self, node):
try:
value = node.op.accept(self)
- except AttributeError: # Case of "100+" format, where 100 is text (and not a SimpleValue).
+ # Case of "100+" format, where 100 is text (and not a SimpleValue).
+ except AttributeError:
value = ast.Value(node.op)
return ast.GreaterEqualThanOp(value)
@@ -284,7 +335,8 @@ def visit_less_than_op(self, node):
def visit_less_equal_op(self, node):
try:
value = node.op.accept(self)
- except AttributeError: # Case of "100-" format where 100 is text (and not a SimpleValue).
+ # Case of "100-" format where 100 is text (and not a SimpleValue).
+ except AttributeError:
value = ast.Value(node.op)
return ast.LessEqualThanOp(value)
@@ -299,19 +351,21 @@ def visit_empty_query(self, node):
return ast.EmptyQuery(None)
def visit_complex_value(self, node):
- """Convert :class:`ComplexValue` to one of ExactMatch, PartialMatch and Regex Value nodes."""
+ """Convert :class:`ComplexValue` to one of ExactMatch, PartialMatch and
+ Regex Value nodes."""
if node.value.startswith(ComplexValue.EXACT_VALUE_TOKEN):
value = node.value.strip(ComplexValue.EXACT_VALUE_TOKEN)
return ExactMatchValue(value)
elif node.value.startswith(ComplexValue.PARTIAL_VALUE_TOKEN):
value = node.value.strip(ComplexValue.PARTIAL_VALUE_TOKEN)
- return PartialMatchValue(value, True if ast.GenericValue.WILDCARD_TOKEN in value else False)
+ return PartialMatchValue(value, ast.GenericValue.WILDCARD_TOKEN in value)
elif node.value.startswith(ComplexValue.REGEX_VALUE_TOKEN):
return RegexValue(node.value.strip(ComplexValue.REGEX_VALUE_TOKEN))
else:
- # Covering the case where ComplexValue supports more than ExactMatch, PartialMatch and Regex values.
+ # Covering the case where ComplexValue supports more than ExactMatch,
+ # PartialMatch and Regex values.
msg = self.__class__.__name__ + ': Unrecognized complex value'
try:
msg += ' lookahead token: "' + node.value[0] + '"'
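The three branches map complex values onto AST nodes roughly as follows (the
token characters match the exact/partial quoting exercised in the tests below,
e.g. `tc "p"` versus `tc 'p'`):

    # '"boson"'  -> ExactMatchValue('boson')
    # "'boson'"  -> PartialMatchValue('boson', False)   # no wildcard inside
    # '/boson/'  -> RegexValue('boson')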
@@ -323,15 +377,22 @@ def visit_complex_value(self, node):
def visit_simple_value(self, node):
# In case of date specifiers convert relative or text date to normal date.
- for regexp, date_conversion_handler in DATE_SPECIFIERS_CONVERSION_HANDLERS.items():
+ for (
+ regexp,
+ date_conversion_handler,
+ ) in DATE_SPECIFIERS_CONVERSION_HANDLERS.items():
date_value = node.value
regexp_match = regexp.match(node.value)
if regexp_match:
- relative_date_specifier_suffix = date_value.split(regexp_match.group())[1]
- return ast.Value(str(date_conversion_handler(relative_date_specifier_suffix)))
+ relative_date_specifier_suffix = date_value.split(regexp_match.group())[
+ 1
+ ]
+ return ast.Value(
+ str(date_conversion_handler(relative_date_specifier_suffix))
+ )
# Normal text value
- return ast.Value(node.value, True if ast.GenericValue.WILDCARD_TOKEN in node.value else False)
+ return ast.Value(node.value, ast.GenericValue.WILDCARD_TOKEN in node.value)
def visit_simple_range_value(self, node):
return ast.Value(node.value)
@@ -340,12 +401,19 @@ def visit_date_value(self, node):
return node.op.accept(self)
def visit_simple_date_value(self, node):
- for regexp, date_conversion_handler in DATE_SPECIFIERS_CONVERSION_HANDLERS.items():
+ for (
+ regexp,
+ date_conversion_handler,
+ ) in DATE_SPECIFIERS_CONVERSION_HANDLERS.items():
date_value = node.value
regexp_match = regexp.match(node.value)
if regexp_match:
- relative_date_specifier_suffix = date_value.split(regexp_match.group())[1]
- return ast.Value(str(date_conversion_handler(relative_date_specifier_suffix)))
+ relative_date_specifier_suffix = date_value.split(regexp_match.group())[
+ 1
+ ]
+ return ast.Value(
+ str(date_conversion_handler(relative_date_specifier_suffix))
+ )
# Normal text value
- return ast.Value(node.value, True if ast.GenericValue.WILDCARD_TOKEN in node.value else False)
+ return ast.Value(node.value, ast.GenericValue.WILDCARD_TOKEN in node.value)
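Both simple-value visitors share the same date-specifier handling; a sketch
with an assumed 'today' specifier:

    # for node.value == 'today - 2', a specifier regexp matches 'today', so
    date_value = 'today - 2'
    suffix = date_value.split('today')[1]  # ' - 2'
    # ast.Value(str(handler(suffix))) then carries a concrete date string
    # (e.g. the date two days before today, per the handler's semantics).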
diff --git a/inspire_query_parser/visitors/visitor_impl.py b/inspire_query_parser/visitors/visitor_impl.py
index b504045..8b52bc9 100644
--- a/inspire_query_parser/visitors/visitor_impl.py
+++ b/inspire_query_parser/visitors/visitor_impl.py
@@ -19,10 +19,7 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
-
-"""
-Encapsulates visitor pattern logic.
-"""
+"""Encapsulates visitor pattern logic."""
from __future__ import absolute_import, unicode_literals
diff --git a/ruff.toml b/ruff.toml
new file mode 100644
index 0000000..07adf74
--- /dev/null
+++ b/ruff.toml
@@ -0,0 +1,28 @@
+target-version = "py311"
+
+[lint]
+select = [
+ # pycodestyle
+ "E",
+ # Pyflakes
+ "F",
+ # flake8-bugbear
+ "B",
+ # flake8-simplify
+ "SIM",
+ # isort
+ "I",
+ # flake8-tidy-imports
+ "TID",
+ # flake8-pytest-style
+ "PT",
+]
+ignore = ["B904", "B905"]
+
+[lint.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[lint.pycodestyle]
+ignore-overlong-task-comments = true
+
+[lint.pydocstyle]
+convention = "google"
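With this configuration, the lint checks formerly run via flake8 (dropped from
run-tests.sh below) are covered by ruff; locally that would be, for example:

    ruff check --fix inspire_query_parser tests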
diff --git a/run-tests.sh b/run-tests.sh
index 93f195e..0258522 100755
--- a/run-tests.sh
+++ b/run-tests.sh
@@ -22,5 +22,4 @@
set -e
-flake8 inspire_query_parser tests
py.test tests
diff --git a/setup.py b/setup.py
index 9f5d753..f46b12b 100644
--- a/setup.py
+++ b/setup.py
@@ -22,27 +22,28 @@
"""A PEG-based query parser for INSPIRE."""
-import os
from setuptools import find_packages, setup
-
URL = 'https://github.com/inspirehep/inspire-query-parser'
-readme = open('README.rst').read()
+with open("README.rst") as f:
+ readme = f.read()
setup_requires = [
'autosemver==0.5.5',
]
-install_requires = [
- 'inspire-schemas~=61.0',
- 'inspire-utils~=3.0,>=3.0.0',
- 'pypeg2~=2.0,>=2.15.2',
- 'python-dateutil~=2.0,>=2.6.1',
- 'six~=1.0,>=1.11.0',
- 'datefinder~=0.7.1'
-],
+install_requires = [
+ 'inspire-schemas~=61.0',
+ 'inspire-utils~=3.0,>=3.0.0',
+ 'pypeg2~=2.0,>=2.15.2',
+ 'python-dateutil~=2.0,>=2.6.1',
+ 'six~=1.0,>=1.11.0',
+ 'datefinder~=0.7.1',
+]
docs_require = []
@@ -53,13 +54,18 @@
'pytest~=3.0,>=3.2.2',
]
+dev_require = [
+ "pre-commit==3.5.0",
+]
+
extras_require = {
'docs': docs_require,
'tests': tests_require,
+ 'dev': dev_require,
}
extras_require['all'] = []
-for name, reqs in extras_require.items():
+for _name, reqs in extras_require.items():
extras_require['all'].extend(reqs)
packages = find_packages(exclude=['docs'])
diff --git a/tests/conftest.py b/tests/conftest.py
index 3ef134e..31faee6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -38,15 +38,20 @@
def pytest_assertrepr_compare(op, left, right):
if (
- isinstance(left, Query) and isinstance(right, Query) or
- isinstance(left, KeywordOp) and isinstance(right, KeywordOp) or
- isinstance(left, AndOp) and isinstance(right, AndOp) or
- isinstance(left, OrOp) and isinstance(right, OrOp)
+ isinstance(left, Query)
+ and isinstance(right, Query)
+ or isinstance(left, KeywordOp)
+ and isinstance(right, KeywordOp)
+ or isinstance(left, AndOp)
+ and isinstance(right, AndOp)
+ or isinstance(left, OrOp)
+ and isinstance(right, OrOp)
) and op == "==":
left_parse_tree = emit_tree_format(left).splitlines()
right_parse_tree = emit_tree_format(right).splitlines()
- return \
- ['that given parse trees are equal:'] \
- + left_parse_tree \
- + ['', "──────── == ────────", ''] \
+ return (
+ ['that given parse trees are equal:']
+ + left_parse_tree
+ + ['', "──────── == ────────", '']
+ right_parse_tree
+ )
diff --git a/tests/helpers/test_utils.py b/tests/helpers/test_utils.py
index ee220e3..44f4859 100644
--- a/tests/helpers/test_utils.py
+++ b/tests/helpers/test_utils.py
@@ -47,18 +47,25 @@ def parametrize(test_configurations):
if not isinstance(test_configurations, dict):
__tracebackhide__ = True
- pytest.fail('In parametrize test configurations parameter must be a dictionary.')
+ pytest.fail(
+ 'In parametrize, the test configurations parameter must be a dictionary.'
+ )
ordered_tests_config = OrderedDict(sorted(viewitems(test_configurations)))
for test_name, test_configuration in iteritems(ordered_tests_config):
- ordered_tests_config[test_name] = OrderedDict(sorted(viewitems(test_configuration)))
+ ordered_tests_config[test_name] = OrderedDict(
+ sorted(viewitems(test_configuration))
+ )
# Extract arg_names from a test configuration
arg_names = list(iterkeys(next(itervalues(ordered_tests_config))))
# Generate list of arg_values
- arg_values = [ordered_tests_config[test_config].values() for test_config in ordered_tests_config]
+ arg_values = [
+ ordered_tests_config[test_config].values()
+ for test_config in ordered_tests_config
+ ]
# Generate ids list
ids = list(iterkeys(ordered_tests_config))
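A minimal usage sketch of this helper (keys are hypothetical; test_parser.py
below uses the same pattern with query_str/unrecognized_text/result):

    from test_utils import parametrize

    @parametrize({
        'Lowercase value': {'query_str': 'foo', 'expected': 'foo'},
        'Uppercase value': {'query_str': 'BAR', 'expected': 'bar'},
    })
    def test_example(query_str, expected):
        assert query_str.lower() == expected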
diff --git a/tests/test_elastic_search_visitor.py b/tests/test_elastic_search_visitor.py
index 73d7138..a848f17 100644
--- a/tests/test_elastic_search_visitor.py
+++ b/tests/test_elastic_search_visitor.py
@@ -29,10 +29,8 @@
from inspire_query_parser import parse_query, parser
from inspire_query_parser.config import ES_MUST_QUERY, ES_SHOULD_QUERY
from inspire_query_parser.stateful_pypeg_parser import StatefulParser
-from inspire_query_parser.visitors.elastic_search_visitor import \
- ElasticSearchVisitor
-from inspire_query_parser.visitors.restructuring_visitor import \
- RestructuringVisitor
+from inspire_query_parser.visitors.elastic_search_visitor import ElasticSearchVisitor
+from inspire_query_parser.visitors.restructuring_visitor import RestructuringVisitor
def _parse_query(query_str):
@@ -302,7 +300,7 @@ def test_elastic_search_visitor_find_journal_title_and_old_style_vol_simple_valu
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_find_journal_title_and_vol_and_artid_or_start_page_simple_value():
+def test_elastic_search_visitor_find_journal_title_and_vol_and_artid_or_start_page_simple_value():  # noqa: E501
query_str = "j Phys.Lett.B,351,123"
expected_es_query = {
"bool": {
@@ -324,7 +322,9 @@ def test_elastic_search_visitor_find_journal_title_and_vol_and_artid_or_start_pa
"should": [
{
"match": {
- "publication_info.page_start": "123"
+ "publication_info.page_start": (
+ "123"
+ )
}
},
{
@@ -774,11 +774,16 @@ def test_elastic_search_visitor_wildcard_journal_search():
'query': {
'query_string': {
'query': 'Phys.Rev.*',
- 'fields': ['publication_info.journal_title','publication_info.journal_volume', 'publication_info.page_start', 'publication_info.artid'],
+ 'fields': [
+ 'publication_info.journal_title',
+ 'publication_info.journal_volume',
+ 'publication_info.page_start',
+ 'publication_info.artid',
+ ],
'default_operator': 'AND',
'analyze_wildcard': True,
}
- }
+ },
}
}
generated_es_query = _parse_query(query_str)
@@ -862,7 +867,7 @@ def test_elastic_search_visitor_with_malformed_query():
"inspire_query_parser.visitors.elastic_search_visitor.DEFAULT_ES_OPERATOR_FOR_MALFORMED_QUERIES",
ES_MUST_QUERY,
)
-def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malformed_query_op_as_must():
+def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malformed_query_op_as_must():  # noqa: E501
query_str = "subject astrophysics and: author:"
expected_es_query = {
"bool": {
@@ -888,7 +893,7 @@ def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malfo
"inspire_query_parser.visitors.elastic_search_visitor.DEFAULT_ES_OPERATOR_FOR_MALFORMED_QUERIES",
ES_SHOULD_QUERY,
)
-def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malformed_query_op_as_should():
+def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malformed_query_op_as_should():  # noqa: E501
query_str = "subject astrophysics and author:"
expected_es_query = {
"bool": {
@@ -912,7 +917,7 @@ def test_elastic_search_visitor_with_query_with_malformed_part_and_default_malfo
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_only_year_fields():
+def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_only_year_fields():  # noqa: E501
query_str = "date 2000-10"
expected_es_query = {
"bool": {
@@ -958,7 +963,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_o
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_rollover_year():
+def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_rollover_year():  # noqa: E501
query_str = "date 2017-12"
expected_es_query = {
"bool": {
@@ -1004,7 +1009,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_r
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_rollover_month():
+def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_rollover_month():  # noqa: E501
query_str = "date 2017-10-31"
expected_es_query = {
"bool": {
@@ -1062,7 +1067,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_simple_value_handles_r
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_day():
+def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_day():  # noqa: E501
query_str = "date 2000-10-*"
expected_es_query = {
"bool": {
@@ -1108,7 +1113,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_month():
+def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_month():  # noqa: E501
query_str = "date 2015-*"
expected_es_query = {
"bool": {
@@ -1138,7 +1143,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_as_month_part():
+def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_as_month_part():  # noqa: E501
query_str = "date 2015-1*"
expected_es_query = {
"bool": {
@@ -1168,7 +1173,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_with_one_query_date_multi_field_and_wildcard_infix_generates_to_all_field():
+def test_elastic_search_visitor_with_one_query_date_multi_field_and_wildcard_infix_generates_to_all_field():  # noqa: E501
query_str = "date: 2017-*-12"
expected_es_query = {
"multi_match": {
@@ -1182,7 +1187,7 @@ def test_elastic_search_visitor_with_one_query_date_multi_field_and_wildcard_inf
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_with_two_queries_date_multi_field_and_wildcard_infix_drops_date():
+def test_elastic_search_visitor_with_two_queries_date_multi_field_and_wildcard_infix_drops_date():  # noqa: E501
query_str = "date: 2017-*-12 and title collider"
expected_es_query = {
"bool": {
@@ -1203,7 +1208,7 @@ def test_elastic_search_visitor_with_two_queries_date_multi_field_and_wildcard_i
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_year_drops_date_query():
+def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_year_drops_date_query():  # noqa: E501
query_str = "date 201* and title collider"
expected_es_query = {
"bool": {
@@ -1224,7 +1229,7 @@ def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_month_drops_date_query():
+def test_elastic_search_visitor_with_date_multi_field_and_wildcard_value_suffix_in_month_drops_date_query():  # noqa: E501
query_str = "date 2000-*-01 and title collider"
expected_es_query = {
"bool": {
@@ -1407,8 +1412,9 @@ def test_elastic_search_visitor_with_date_multi_field_and_range_op():
def test_elastic_search_visitor_with_date_multi_field_range_within_same_year():
- # This kind of query works fine (regarding the ``publication_info.year``), since the range operator is including
- # its bounds, otherwise we would get no records.
+ # This kind of query works fine (regarding ``publication_info.year``),
+ # since the range operator includes its bounds;
+ # otherwise we would get no records.
query_str = "date 2000-01->2000-04"
expected_es_query = {
"bool": {
@@ -1726,7 +1732,7 @@ def test_elastic_search_visitor_handles_first_author_bai_exact_value():
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_handles_partial_match_value_with_bai_value_and_partial_bai_value():
+def test_elastic_search_visitor_handles_partial_match_value_with_bai_value_and_partial_bai_value():  # noqa: E501
query_str = "a 'A.Einstein.1' and a 'S.Mele'"
expected_es_query = {
"bool": {
@@ -1813,7 +1819,7 @@ def test_elastic_search_visitor_handles_wildcard_simple_and_partial_bai_like_que
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_author_name_contains_dot_and_no_spaces():
+def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_author_name_contains_dot_and_no_spaces():  # noqa: E501
query_str = "a S.Mele"
expected_es_query = {
"nested": {
@@ -1833,7 +1839,7 @@ def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_author_n
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_first_author_name_contains_dot_and_no_spaces():
+def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_first_author_name_contains_dot_and_no_spaces():  # noqa: E501
query_str = "fa S.Mele"
expected_es_query = {
"nested": {
@@ -1856,14 +1862,14 @@ def test_elastic_search_visitor_queries_also_bai_field_with_wildcard_if_first_au
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_comma_and_dot():
+def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_comma_and_dot():  # noqa: E501
query_str = "a gava,e."
generated_es_query = _parse_query(query_str)
assert ElasticSearchVisitor.AUTHORS_BAI_FIELD not in str(generated_es_query)
-def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_comma_and_dot():
+def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_comma_and_dot():  # noqa: E501
query_str = "fa gava,e."
generated_es_query = _parse_query(query_str)
@@ -1872,14 +1878,14 @@ def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_cont
)
-def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_trailing_dot():
+def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_trailing_dot():  # noqa: E501
query_str = "a mele."
generated_es_query = _parse_query(query_str)
assert ElasticSearchVisitor.AUTHORS_BAI_FIELD not in str(generated_es_query)
-def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_trailing_dot():
+def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_trailing_dot():  # noqa: E501
query_str = "fa mele."
generated_es_query = _parse_query(query_str)
@@ -1888,14 +1894,14 @@ def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_cont
)
-def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_prefix_dot():
+def test_elastic_search_visitor_queries_does_not_query_bai_field_if_name_contains_prefix_dot():  # noqa: E501
query_str = "a .mele"
generated_es_query = _parse_query(query_str)
assert ElasticSearchVisitor.AUTHORS_BAI_FIELD not in str(generated_es_query)
-def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_prefix_dot():
+def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_contains_prefix_dot():  # noqa: E501
query_str = "fa .mele"
generated_es_query = _parse_query(query_str)
@@ -1904,7 +1910,7 @@ def test_elastic_search_visitor_fa_queries_does_not_query_bai_field_if_name_cont
)
-def test_elastic_search_visitor_does_not_query_bai_field_if_name_contains_dot_and_spaces():
+def test_elastic_search_visitor_does_not_query_bai_field_if_name_contains_dot_and_spaces():  # noqa: E501
query_str = "a S. Mele"
bai_field = "authors.ids.value.search"
@@ -1912,7 +1918,7 @@ def test_elastic_search_visitor_does_not_query_bai_field_if_name_contains_dot_an
assert bai_field not in str(generated_es_query)
-def test_elastic_search_visitor_does_not_query_bai_field_if_fa_name_contains_dot_and_spaces():
+def test_elastic_search_visitor_does_not_query_bai_field_if_fa_name_contains_dot_and_spaces():  # noqa: E501
query_str = "fa S. Mele"
bai_field = "first_author.ids.value.search"
@@ -2006,7 +2012,7 @@ def test_elastic_search_visitor_with_word_and_symbol_containing_unicode_characte
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_document_type():
+def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_document_type():  # noqa: E501
query_str = "tc c"
expected_es_query = {
"match": {"document_type": {"query": "conference paper", "operator": "and"}}
@@ -2016,7 +2022,7 @@ def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_doc
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_publication_type():
+def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_publication_type():  # noqa: E501
query_str = "tc i"
expected_es_query = {
"match": {"publication_type": {"query": "introductory", "operator": "and"}}
@@ -2042,7 +2048,7 @@ def test_elastic_search_visitor_type_code_with_known_value_mapping_and_query_ref
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_type_code_with_unknown_value_searches_both_document_and_publication_type_fields():
+def test_elastic_search_visitor_type_code_with_unknown_value_searches_both_document_and_publication_type_fields():  # noqa: E501
query_str = "tc note"
expected_es_query = {
"bool": {
@@ -2058,7 +2064,7 @@ def test_elastic_search_visitor_type_code_with_unknown_value_searches_both_docum
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_type_code_with_known_exact_value_mapping_and_query_refereed():
+def test_elastic_search_visitor_type_code_with_known_exact_value_mapping_and_query_refereed():  # noqa: E501
query_str = 'tc "p"'
expected_es_query = {"match": {"refereed": True}}
@@ -2066,7 +2072,7 @@ def test_elastic_search_visitor_type_code_with_known_exact_value_mapping_and_que
assert generated_es_query == expected_es_query
-def test_elastic_search_visitor_type_code_with_known_partial_value_mapping_and_query_refereed():
+def test_elastic_search_visitor_type_code_with_known_partial_value_mapping_and_query_refereed():  # noqa: E501
query_str = "tc 'p'"
expected_es_query = {"match": {"refereed": True}}
@@ -2406,7 +2412,9 @@ def test_elastic_search_visitor_find_journal_with_year():
"should": [
{
"match": {
- "publication_info.page_start": "112"
+ "publication_info.page_start": (
+ "112"
+ )
}
},
{
@@ -2461,7 +2469,9 @@ def test_regression_query_with_multiple_dots():
{
"match": {
"_all": {
- "query": "references.reference.dois:10.7483/OPENDATA.CMS.ATLAS",
+ "query": (
+ "references.reference.dois:10.7483/OPENDATA.CMS.ATLAS"
+ ),
"operator": "and",
}
}
@@ -2764,7 +2774,9 @@ def test_first_author_query_with_full_name():
{
"match_phrase_prefix": {
"first_author.first_name": {
- "analyzer": "names_analyzer",
+ "analyzer": (
+ "names_analyzer"
+ ),
"query": "John",
}
}
@@ -2772,7 +2784,7 @@ def test_first_author_query_with_full_name():
{
"match": {
"first_author.first_name": {
- "analyzer": "names_initials_analyzer",
+ "analyzer": "names_initials_analyzer", # noqa E501
"operator": "AND",
"query": "John",
}
@@ -3097,7 +3109,9 @@ def test_journal_title_variants_regression():
"should": [
{
"match": {
- "publication_info.page_start": "015"
+ "publication_info.page_start": (
+ "015"
+ )
}
},
{
@@ -3141,12 +3155,9 @@ def test_journal_title_variants_regression_complex_journal_title():
def test_elastic_search_visitor_fulltext():
query_str = "fulltext FCC"
- expected_es_query = {'match':{
- 'documents.attachment.content': {
- 'query': 'FCC',
- 'operator': 'and'
- }
- }}
+ expected_es_query = {
+ 'match': {'documents.attachment.content': {'query': 'FCC', 'operator': 'and'}}
+ }
generated_es_query = _parse_query(query_str)
assert expected_es_query == generated_es_query
@@ -3160,18 +3171,11 @@ def test_elastic_search_visitor_fulltext_and_other_field():
'match': {
'documents.attachment.content': {
'query': 'something',
- 'operator': 'and'
+ 'operator': 'and',
}
}
},
- {
- 'match': {
- 'titles.full_title': {
- 'query': 'boson',
- 'operator': 'and'
- }
- }
- }
+ {'match': {'titles.full_title': {'query': 'boson', 'operator': 'and'}}},
]
}
}
@@ -3186,7 +3190,7 @@ def test_elastic_search_visitor_partial_match_fulltext():
'query': '*this is a test*',
'fields': ['documents.attachment.content'],
'default_operator': 'AND',
- 'analyze_wildcard': True
+ 'analyze_wildcard': True,
}
}
generated_es_query = _parse_query(query_str)
@@ -3200,7 +3204,7 @@ def test_elastic_search_visitor_citedby():
"self.$ref.raw": {
"index": "records-hep",
"id": "123456",
- "path": "references.record.$ref.raw"
+ "path": "references.record.$ref.raw",
}
}
}
@@ -3218,18 +3222,11 @@ def test_elastic_search_visitor_complex_query():
"self.$ref.raw": {
"index": "records-hep",
"id": "123456",
- "path": "references.record.$ref.raw"
+ "path": "references.record.$ref.raw",
}
}
},
- {
- "match": {
- "titles.full_title": {
- "query": "Test",
- "operator": "and"
- }
- }
- }
+ {"match": {"titles.full_title": {"query": "Test", "operator": "and"}}},
]
}
}
@@ -3239,11 +3236,7 @@ def test_elastic_search_visitor_complex_query():
def test_elastic_search_visitor_texkeys_regression():
query_str = "texkey Chen:2014cwa"
- expected_es_query = {
- "match": {
- "texkeys.raw": "Chen:2014cwa"
- }
- }
+ expected_es_query = {"match": {"texkeys.raw": "Chen:2014cwa"}}
generated_es_query = _parse_query(query_str)
assert generated_es_query == expected_es_query
@@ -3253,11 +3246,7 @@ def test_elastic_search_visitor_texkeys_regression_bool_query():
expected_es_query = {
"bool": {
"must": [
- {
- "match": {
- "texkeys.raw": "Chen:2014cwa"
- }
- },
+ {"match": {"texkeys.raw": "Chen:2014cwa"}},
{
"nested": {
"path": "authors",
@@ -3268,15 +3257,15 @@ def test_elastic_search_visitor_texkeys_regression_bool_query():
"match": {
"authors.last_name": {
"query": "Moskovic",
- "operator": "AND"
+ "operator": "AND",
}
}
}
]
}
- }
+ },
}
- }
+ },
]
}
}
diff --git a/tests/test_format_parse_tree.py b/tests/test_format_parse_tree.py
index 7d81915..e093f6d 100644
--- a/tests/test_format_parse_tree.py
+++ b/tests/test_format_parse_tree.py
@@ -22,18 +22,37 @@
from __future__ import absolute_import, unicode_literals
-from inspire_query_parser.parser import (Expression, InvenioKeywordQuery,
- Query, SimpleQuery, SimpleValue,
- Statement, Value)
+from inspire_query_parser.parser import (
+ Expression,
+ InvenioKeywordQuery,
+ Query,
+ SimpleQuery,
+ SimpleValue,
+ Statement,
+ Value,
+)
from inspire_query_parser.utils.format_parse_tree import emit_tree_format
def test_format_parse_tree_handles_unicode_values():
- parse_tree = Query([Statement(Expression(SimpleQuery(Value(SimpleValue('γ-radiation')))))])
+ parse_tree = Query(
+ [Statement(Expression(SimpleQuery(Value(SimpleValue('γ-radiation')))))]
+ )
assert emit_tree_format(parse_tree, verbose=True)
def test_format_parse_tree_handles_unicode_nodes():
- parse_tree = Query([Statement(Expression(SimpleQuery(InvenioKeywordQuery('unicode-keyword-φοο',
- Value(SimpleValue('γ-radiation'))))))])
+ parse_tree = Query(
+ [
+ Statement(
+ Expression(
+ SimpleQuery(
+ InvenioKeywordQuery(
+ 'unicode-keyword-φοο', Value(SimpleValue('γ-radiation'))
+ )
+ )
+ )
+ )
+ ]
+ )
assert emit_tree_format(parse_tree, verbose=True)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 879f0b6..9c0905a 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -22,19 +22,22 @@
from __future__ import print_function, unicode_literals
+from test_utils import parametrize
+
from inspire_query_parser.parser import SimpleValue, SimpleValueUnit
from inspire_query_parser.stateful_pypeg_parser import StatefulParser
-from test_utils import parametrize
# Test parse terminal token
-def test_that_parse_terminal_token_does_accept_keywords_if_parsing_parenthesized_terminal_flag_is_on():
+def test_that_parse_terminal_token_does_accept_keywords_if_parsing_parenthesized_terminal_flag_is_on():  # noqa: E501
query_str = 'and'
parser = StatefulParser()
parser._parsing_parenthesized_terminal = True
- returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token(parser, query_str)
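+    # returns a (remaining unrecognised text, token or SyntaxError) tuple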
+ returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token(
+ parser, query_str
+ )
assert returned_unrecognised_text == ''
assert returned_result == query_str
@@ -44,7 +47,9 @@ def test_that_parse_terminal_token_does_not_accept_token_followed_by_colon():
parser = StatefulParser()
- returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token(parser, query_str)
+ returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token(
+ parser, query_str
+ )
assert isinstance(returned_result, SyntaxError)
assert returned_unrecognised_text == query_str
@@ -54,7 +59,9 @@ def test_that_parse_terminal_token_accepts_non_shortened_inspire_keywords():
parser = StatefulParser()
- returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token(parser, query_str)
+ returned_unrecognised_text, returned_result = SimpleValueUnit.parse_terminal_token(
+ parser, query_str
+ )
assert returned_result == query_str
assert returned_unrecognised_text == ""
@@ -66,58 +73,60 @@ def test_that_parse_terminal_token_accepts_non_shortened_inspire_keywords():
'Date specifiers arithmetic: today': {
'query_str': 'today - 2',
'unrecognized_text': '',
- 'result': SimpleValueUnit('today - 2')
+ 'result': SimpleValueUnit('today - 2'),
},
'Date specifiers arithmetic: yesterday': {
'query_str': 'yesterday - 365',
'unrecognized_text': '',
- 'result': SimpleValueUnit('yesterday - 365')
+ 'result': SimpleValueUnit('yesterday - 365'),
},
'Date specifiers arithmetic: this month': {
'query_str': 'this month - 1',
'unrecognized_text': '',
- 'result': SimpleValueUnit('this month - 1')
+ 'result': SimpleValueUnit('this month - 1'),
},
'Date specifiers arithmetic: last month': {
'query_str': 'last month-1',
'unrecognized_text': '',
- 'result': SimpleValueUnit('last month-1')
+ 'result': SimpleValueUnit('last month-1'),
},
'Date specifier w/o arithmetic (followed by a query)': {
'query_str': 'today - a',
'unrecognized_text': ' - a',
- 'result': SimpleValueUnit('today')
+ 'result': SimpleValueUnit('today'),
},
-
# Basic tokens
'Simple token': {
'query_str': 'foo',
'unrecognized_text': '',
- 'result': SimpleValueUnit('foo')
+ 'result': SimpleValueUnit('foo'),
},
'Unicode token': {
'query_str': 'γ-radiation',
'unrecognized_text': '',
- 'result': SimpleValueUnit('γ-radiation')
+ 'result': SimpleValueUnit('γ-radiation'),
},
# Tokens separated by whitespace, don't get recognized by SimpleValueUnit.
'Many tokens (whitespace separated)': {
'query_str': 'foo bar',
'unrecognized_text': ' bar',
- 'result': SimpleValueUnit('foo')
+ 'result': SimpleValueUnit('foo'),
},
}
)
def test_simple_value_unit_accepted_tokens(query_str, unrecognized_text, result):
parser = StatefulParser()
- returned_unrecognised_text, returned_result = SimpleValueUnit.parse(parser, query_str, None)
- if type(result) != SyntaxError:
+ returned_unrecognised_text, returned_result = SimpleValueUnit.parse(
+ parser, query_str, None
+ )
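+    # isinstance() also matches SyntaxError subclasses, unlike the old type() check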
+ if not isinstance(result, SyntaxError):
assert returned_unrecognised_text == unrecognized_text
assert returned_result == result
else:
assert returned_unrecognised_text == unrecognized_text
- assert isinstance(returned_result, SyntaxError) and result.msg == result.msg
+ assert isinstance(returned_result, SyntaxError)
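+        # compare the actual error message with the expected one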
+        assert returned_result.msg == result.msg
@parametrize(
@@ -125,32 +134,35 @@ def test_simple_value_unit_accepted_tokens(query_str, unrecognized_text, result)
'Multiple whitespace-separated tokens': {
'query_str': 'foo bar',
'unrecognized_text': '',
- 'result': SimpleValue('foo bar')
+ 'result': SimpleValue('foo bar'),
},
'Plaintext with parentheses': {
'query_str': 'foo(a)',
'unrecognized_text': '',
- 'result': SimpleValue('foo(a)')
+ 'result': SimpleValue('foo(a)'),
},
'Plaintext with keywords (or keyword symbols +/-/|) in parentheses': {
'query_str': '(and)',
'unrecognized_text': '',
- 'result': SimpleValue('(and)')
+ 'result': SimpleValue('(and)'),
},
'Plaintext with colons in the first word': {
'query_str': 'foo:bar baz:quux',
'unrecognized_text': 'baz:quux',
- 'result': SimpleValue('foo:bar')
+ 'result': SimpleValue('foo:bar'),
},
}
)
def test_simple_value_accepted_tokens(query_str, unrecognized_text, result):
parser = StatefulParser()
- returned_unrecognised_text, returned_result = SimpleValue.parse(parser, query_str, None)
- if type(result) != SyntaxError:
+ returned_unrecognised_text, returned_result = SimpleValue.parse(
+ parser, query_str, None
+ )
+ if not isinstance(result, SyntaxError):
assert returned_unrecognised_text == unrecognized_text
assert returned_result == result
else:
assert returned_unrecognised_text == unrecognized_text
- assert isinstance(returned_result, SyntaxError) and result.msg == result.msg
+ assert isinstance(returned_result, SyntaxError)
+        assert returned_result.msg == result.msg
diff --git a/tests/test_parser_functionality.py b/tests/test_parser_functionality.py
index d0f9dec..a27c79f 100644
--- a/tests/test_parser_functionality.py
+++ b/tests/test_parser_functionality.py
@@ -24,26 +24,44 @@
import pytest
-from inspire_query_parser.parser import (And, BooleanQuery, ComplexValue,
- DateValue, EmptyQuery, Expression,
- GreaterEqualOp, GreaterThanOp,
- InspireDateKeyword, InspireKeyword,
- InvenioKeywordQuery, LessEqualOp,
- LessThanOp, MalformedQueryWords,
- NestedKeywordQuery, NotQuery, Or,
- ParenthesizedQuery, Query, RangeOp,
- SimpleDateValue, SimpleQuery,
- SimpleRangeValue, SimpleValue,
- SimpleValueBooleanQuery,
- SpiresDateKeywordQuery,
- SpiresKeywordQuery, Statement, Value)
+from inspire_query_parser.parser import (
+ And,
+ BooleanQuery,
+ ComplexValue,
+ DateValue,
+ EmptyQuery,
+ Expression,
+ GreaterEqualOp,
+ GreaterThanOp,
+ InspireDateKeyword,
+ InspireKeyword,
+ InvenioKeywordQuery,
+ LessEqualOp,
+ LessThanOp,
+ MalformedQueryWords,
+ NestedKeywordQuery,
+ NotQuery,
+ Or,
+ ParenthesizedQuery,
+ Query,
+ RangeOp,
+ SimpleDateValue,
+ SimpleQuery,
+ SimpleRangeValue,
+ SimpleValue,
+ SimpleValueBooleanQuery,
+ SpiresDateKeywordQuery,
+ SpiresKeywordQuery,
+ Statement,
+ Value,
+)
from inspire_query_parser.stateful_pypeg_parser import StatefulParser
# TODO Reformat parentheses around parametrize entries
@pytest.mark.parametrize(
- ["query_str", "expected_parse_tree"],
+ ("query_str", "expected_parse_tree"),
{
(
"date nov 2020 12",
@@ -807,7 +825,10 @@
),
),
(
- "author ellis, j. and not (title boson or not (author /^xi$/ and title foo))",
+ (
+ "author ellis, j. and not (title boson or not (author /^xi$/ and title"
+ " foo))"
+ ),
Query(
[
Statement(
@@ -1061,7 +1082,8 @@
]
),
),
- # Parenthesized keyword query values (working also with SPIRES operators - doesn't on legacy)
+ # Parenthesized keyword query values (working also with
+ # SPIRES operators - doesn't on legacy)
(
"author:(title ellis)",
Query(
@@ -1173,7 +1195,10 @@
),
),
(
- "find title Alternative the Phase-II upgrade of the ATLAS Inner Detector or na61/shine",
+ (
+ "find title Alternative the Phase-II upgrade of the ATLAS Inner"
+ " Detector or na61/shine"
+ ),
Query(
[
Statement(
@@ -1184,7 +1209,8 @@
Value(
SimpleValueBooleanQuery(
SimpleValue(
- "Alternative the Phase-II upgrade of the ATLAS Inner Detector"
+ "Alternative the Phase-II upgrade of"
+ " the ATLAS Inner Detector"
),
Or(),
SimpleValue("na61/shine"),
diff --git a/tests/test_parsing_driver.py b/tests/test_parsing_driver.py
index 6a8de30..ec9782f 100644
--- a/tests/test_parsing_driver.py
+++ b/tests/test_parsing_driver.py
@@ -31,10 +31,7 @@ def test_driver_with_simple_query():
query_str = 'subject astrophysics'
expected_es_query = {
"match": {
- "facet_inspire_categories": {
- "query": "astrophysics",
- "operator": "and"
- }
+ "facet_inspire_categories": {"query": "astrophysics", "operator": "and"}
}
}
@@ -50,7 +47,7 @@ def test_driver_with_nothing_recognized(mocked_parser):
'multi_match': {
'query': 'unrecognized query',
'fields': ['_all'],
- 'zero_terms_query': 'all'
+ 'zero_terms_query': 'all',
}
}
@@ -68,7 +65,7 @@ def test_driver_with_syntax_error(mocked_parser):
'multi_match': {
'query': 'query with syntax error',
'fields': ['_all'],
- 'zero_terms_query': 'all'
+ 'zero_terms_query': 'all',
}
}
@@ -83,13 +80,11 @@ def test_driver_with_syntax_error(mocked_parser):
def test_driver_with_rst_visitor_error(mocked_rst_visitor):
query_str = 'foo'
expected_es_query = {
- 'multi_match': {
- 'query': 'foo',
- 'fields': ['_all'],
- 'zero_terms_query': 'all'
- }
+ 'multi_match': {'query': 'foo', 'fields': ['_all'], 'zero_terms_query': 'all'}
}
- mocked_rst_visitor.return_value.visit.side_effect = Exception('Something went wrong with visit_value')
+ mocked_rst_visitor.return_value.visit.side_effect = Exception(
+ 'Something went wrong with visit_value'
+ )
mocked_rst_visitor.__name__ = 'MockedRestructuringVisitor'
es_query = parse_query(query_str)
@@ -101,13 +96,11 @@ def test_driver_with_rst_visitor_error(mocked_rst_visitor):
def test_driver_with_es_visitor_error(mocked_es_visitor):
query_str = 'foo'
expected_es_query = {
- 'multi_match': {
- 'query': 'foo',
- 'fields': ['_all'],
- 'zero_terms_query': 'all'
- }
+ 'multi_match': {'query': 'foo', 'fields': ['_all'], 'zero_terms_query': 'all'}
}
- mocked_es_visitor.return_value.visit.side_effect = Exception('Something went wrong with visit_value')
+ mocked_es_visitor.return_value.visit.side_effect = Exception(
+ 'Something went wrong with visit_value'
+ )
mocked_es_visitor.__name__ = 'MockedElasticSearchVisitor'
es_query = parse_query(query_str)
@@ -121,7 +114,7 @@ def test_driver_with_es_visitor_empty_query_generates_a_query_against_all():
'multi_match': {
'query': 'd < 200',
'fields': ['_all'],
- 'zero_terms_query': 'all'
+ 'zero_terms_query': 'all',
}
}
diff --git a/tests/test_restructuring_visitor.py b/tests/test_restructuring_visitor.py
index 928c571..5ce337a 100644
--- a/tests/test_restructuring_visitor.py
+++ b/tests/test_restructuring_visitor.py
@@ -28,121 +28,141 @@
from dateutil.relativedelta import relativedelta
from inspire_query_parser import parser
-from inspire_query_parser.ast import (AndOp, EmptyQuery, ExactMatchValue,
- GreaterEqualThanOp, GreaterThanOp,
- Keyword, KeywordOp, LessEqualThanOp,
- LessThanOp, MalformedQuery,
- NestedKeywordOp, NotOp, OrOp,
- PartialMatchValue,
- QueryWithMalformedPart, RangeOp,
- RegexValue, Value, ValueOp)
+from inspire_query_parser.ast import (
+ AndOp,
+ EmptyQuery,
+ ExactMatchValue,
+ GreaterEqualThanOp,
+ GreaterThanOp,
+ Keyword,
+ KeywordOp,
+ LessEqualThanOp,
+ LessThanOp,
+ MalformedQuery,
+ NestedKeywordOp,
+ NotOp,
+ OrOp,
+ PartialMatchValue,
+ QueryWithMalformedPart,
+ RangeOp,
+ RegexValue,
+ Value,
+ ValueOp,
+)
from inspire_query_parser.stateful_pypeg_parser import StatefulParser
-from inspire_query_parser.visitors.restructuring_visitor import \
- RestructuringVisitor
+from inspire_query_parser.visitors.restructuring_visitor import RestructuringVisitor
@pytest.mark.parametrize(
- ['query_str', 'expected_parse_tree'],
+ ('query_str', 'expected_parse_tree'),
[
# Find keyword combined with other production rules
- ('FIN author:\'ellis\'', KeywordOp(Keyword('author'), PartialMatchValue('ellis'))),
+ (
+ 'FIN author:\'ellis\'',
+ KeywordOp(Keyword('author'), PartialMatchValue('ellis')),
+ ),
('Find author "ellis"', KeywordOp(Keyword('author'), ExactMatchValue('ellis'))),
('f author ellis', KeywordOp(Keyword('author'), Value('ellis'))),
-
# Invenio like search
(
'author:ellis and title:boson',
AndOp(
KeywordOp(Keyword('author'), Value('ellis')),
- KeywordOp(Keyword('title'), Value('boson'))
- )
- ),
- ('unknown_keyword:\'bar\'', KeywordOp(Keyword('unknown_keyword'), PartialMatchValue('bar'))),
- ('dotted.keyword:\'bar\'', KeywordOp(Keyword('dotted.keyword'), PartialMatchValue('bar'))),
-
+ KeywordOp(Keyword('title'), Value('boson')),
+ ),
+ ),
+ (
+ 'unknown_keyword:\'bar\'',
+ KeywordOp(Keyword('unknown_keyword'), PartialMatchValue('bar')),
+ ),
+ (
+ 'dotted.keyword:\'bar\'',
+ KeywordOp(Keyword('dotted.keyword'), PartialMatchValue('bar')),
+ ),
# Boolean operator testing (And/Or)
(
'author ellis and title \'boson\'',
AndOp(
KeywordOp(Keyword('author'), Value('ellis')),
- KeywordOp(Keyword('title'), PartialMatchValue('boson'))
- )
- ),
+ KeywordOp(Keyword('title'), PartialMatchValue('boson')),
+ ),
+ ),
(
'f a appelquist and date 1983',
AndOp(
KeywordOp(Keyword('author'), Value('appelquist')),
- KeywordOp(Keyword('date'), Value('1983'))
- )
- ),
+ KeywordOp(Keyword('date'), Value('1983')),
+ ),
+ ),
(
'fin a henneaux and citedby a nicolai',
AndOp(
KeywordOp(Keyword('author'), Value('henneaux')),
- NestedKeywordOp(Keyword('citedby'), KeywordOp(Keyword('author'), Value('nicolai'))))
- ),
+ NestedKeywordOp(
+ Keyword('citedby'), KeywordOp(Keyword('author'), Value('nicolai'))
+ ),
+ ),
+ ),
(
'au ellis | title \'boson\'',
OrOp(
KeywordOp(Keyword('author'), Value('ellis')),
- KeywordOp(Keyword('title'), PartialMatchValue('boson'))
- )
- ),
+ KeywordOp(Keyword('title'), PartialMatchValue('boson')),
+ ),
+ ),
(
'-author ellis OR title \'boson\'',
OrOp(
NotOp(KeywordOp(Keyword('author'), Value('ellis'))),
- KeywordOp(Keyword('title'), PartialMatchValue('boson'))
- )
- ),
+ KeywordOp(Keyword('title'), PartialMatchValue('boson')),
+ ),
+ ),
(
'author ellis & title \'boson\'',
AndOp(
KeywordOp(Keyword('author'), Value('ellis')),
- KeywordOp(Keyword('title'), PartialMatchValue('boson'))
- )
- ),
-
+ KeywordOp(Keyword('title'), PartialMatchValue('boson')),
+ ),
+ ),
# Implicit And
(
'author ellis elastic.keyword:\'boson\'',
AndOp(
KeywordOp(Keyword('author'), Value('ellis')),
- KeywordOp(Keyword('elastic.keyword'), PartialMatchValue('boson'))
- )
- ),
+ KeywordOp(Keyword('elastic.keyword'), PartialMatchValue('boson')),
+ ),
+ ),
(
'find cn atlas not tc c',
AndOp(
KeywordOp(Keyword('collaboration'), Value('atlas')),
- NotOp(KeywordOp(Keyword('type-code'), Value('c')))
- )
- ),
+ NotOp(KeywordOp(Keyword('type-code'), Value('c'))),
+ ),
+ ),
(
'author:ellis j title:\'boson\' reference:M.N.1',
AndOp(
KeywordOp(Keyword('author'), Value('ellis j')),
AndOp(
KeywordOp(Keyword('title'), PartialMatchValue('boson')),
- KeywordOp(Keyword('cite'), Value('M.N.1'))
- )
- )
- ),
+ KeywordOp(Keyword('cite'), Value('M.N.1')),
+ ),
+ ),
+ ),
(
'author ellis - title \'boson\'',
AndOp(
KeywordOp(Keyword('author'), Value('ellis')),
- NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson')))
- )
- ),
+ NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson'))),
+ ),
+ ),
(
- 'topcite 2+ and skands',
- AndOp(
- KeywordOp(Keyword('topcite'), GreaterEqualThanOp(Value('2'))),
- ValueOp(Value('skands'))
- )
+ 'topcite 2+ and skands',
+ AndOp(
+ KeywordOp(Keyword('topcite'), GreaterEqualThanOp(Value('2'))),
+ ValueOp(Value('skands')),
+ ),
),
-
# ##### Boolean operators at terminals level ####
(
'author ellis title:boson not higgs',
@@ -150,21 +170,22 @@
KeywordOp(Keyword('author'), Value('ellis')),
AndOp(
KeywordOp(Keyword('title'), Value('boson')),
- NotOp(KeywordOp(Keyword('title'), Value('higgs')))
- )
- )
- ),
-
+ NotOp(KeywordOp(Keyword('title'), Value('higgs'))),
+ ),
+ ),
+ ),
# Negation
(
'ellis and not title \'boson\'',
AndOp(
ValueOp(Value('ellis')),
- NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson')))
- )
- ),
- ('-title \'boson\'', NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson')))),
-
+ NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson'))),
+ ),
+ ),
+ (
+ '-title \'boson\'',
+ NotOp(KeywordOp(Keyword('title'), PartialMatchValue('boson'))),
+ ),
# Nested expressions
(
'author ellis, j. and (title boson or (author /^xi$/ and title foo))',
@@ -174,13 +195,16 @@
KeywordOp(Keyword('title'), Value('boson')),
AndOp(
KeywordOp(Keyword('author'), RegexValue('^xi$')),
- KeywordOp(Keyword('title'), Value('foo'))
- )
- )
- )
- ),
+ KeywordOp(Keyword('title'), Value('foo')),
+ ),
+ ),
+ ),
+ ),
(
- 'author ellis, j. and not (title boson or not (author /^xi$/ and title foo))',
+ (
+ 'author ellis, j. and not (title boson or not (author /^xi$/ and title'
+ ' foo))'
+ ),
AndOp(
KeywordOp(Keyword('author'), Value('ellis, j.')),
NotOp(
@@ -189,14 +213,13 @@
NotOp(
AndOp(
KeywordOp(Keyword('author'), RegexValue('^xi$')),
- KeywordOp(Keyword('title'), Value('foo'))
+ KeywordOp(Keyword('title'), Value('foo')),
)
- )
+ ),
)
- )
- )
- ),
-
+ ),
+ ),
+ ),
# Metadata search
(
'refersto:1347300 and (reference:Ellis or reference "Ellis")',
@@ -204,41 +227,37 @@
NestedKeywordOp(Keyword('refersto'), ValueOp(Value('1347300'))),
OrOp(
KeywordOp(Keyword('cite'), Value('Ellis')),
- KeywordOp(Keyword('cite'), ExactMatchValue('Ellis'))
- )
- )
+ KeywordOp(Keyword('cite'), ExactMatchValue('Ellis')),
+ ),
+ ),
),
(
'exactauthor:M.Vanderhaeghen.1 and ac: 42',
AndOp(
KeywordOp(Keyword('exact-author'), Value('M.Vanderhaeghen.1')),
- KeywordOp(Keyword('author-count'), Value('42'))
- )
+ KeywordOp(Keyword('author-count'), Value('42')),
+ ),
),
-
# Simple phrases
('ellis', ValueOp(Value('ellis'))),
('\'ellis\'', ValueOp(PartialMatchValue('ellis'))),
('(ellis and smith)', AndOp(ValueOp(Value('ellis')), ValueOp(Value('smith')))),
-
- # Parenthesized keyword query values (working also with SPIRES operators - doesn't on legacy)
- (
- 'author:(title ellis)', KeywordOp(Keyword('author'), Value('title ellis'))
- ),
+ # Parenthesized keyword query values (working also with SPIRES operators -
+ # doesn't on legacy)
+ ('author:(title ellis)', KeywordOp(Keyword('author'), Value('title ellis'))),
(
'author (pardo, f AND slavich) OR (author:bernreuther and not date:2017)',
OrOp(
AndOp(
KeywordOp(Keyword('author'), Value('pardo, f')),
- KeywordOp(Keyword('author'), Value('slavich'))
+ KeywordOp(Keyword('author'), Value('slavich')),
),
AndOp(
KeywordOp(Keyword('author'), Value('bernreuther')),
- NotOp(KeywordOp(Keyword('date'), Value('2017')))
- )
- )
- ),
-
+ NotOp(KeywordOp(Keyword('date'), Value('2017'))),
+ ),
+ ),
+ ),
# Non trivial terminals
(
'author smith and not j., ellis or foo',
@@ -246,301 +265,363 @@
KeywordOp(Keyword('author'), Value('smith')),
OrOp(
NotOp(KeywordOp(Keyword('author'), Value('j., ellis'))),
- KeywordOp(Keyword('author'), Value('foo'))
- )
- )
- ),
+ KeywordOp(Keyword('author'), Value('foo')),
+ ),
+ ),
+ ),
(
- 'find title Alternative the Phase-II upgrade of the ATLAS Inner Detector or na61/shine',
+ (
+ 'find title Alternative the Phase-II upgrade of the ATLAS Inner'
+ ' Detector or na61/shine'
+ ),
OrOp(
- KeywordOp(Keyword('title'), Value('Alternative the Phase-II upgrade of the ATLAS Inner Detector')),
- KeywordOp(Keyword('title'), Value('na61/shine'))
- )
- ),
+ KeywordOp(
+ Keyword('title'),
+ Value(
+ 'Alternative the Phase-II upgrade of the ATLAS Inner Detector'
+ ),
+ ),
+ KeywordOp(Keyword('title'), Value('na61/shine')),
+ ),
+ ),
(
'find (j phys.rev. and vol d85) or (j phys.rev.lett.,62,1825)',
OrOp(
KeywordOp(Keyword('journal'), Value('phys.rev.,d85')),
- KeywordOp(Keyword('journal'), Value('phys.rev.lett.,62,1825'))
- )
- ),
+ KeywordOp(Keyword('journal'), Value('phys.rev.lett.,62,1825')),
+ ),
+ ),
(
"title e-10 and -author d'hoker",
AndOp(
KeywordOp(Keyword('title'), Value('e-10')),
- NotOp(KeywordOp(Keyword('author'), Value('d\'hoker')))
- )
- ),
+ NotOp(KeywordOp(Keyword('author'), Value('d\'hoker'))),
+ ),
+ ),
(
'a pang,yi and t SU(2)',
AndOp(
KeywordOp(Keyword('author'), Value('pang,yi')),
- KeywordOp(Keyword('title'), Value('SU(2)'))
- )
- ),
+ KeywordOp(Keyword('title'), Value('SU(2)')),
+ ),
+ ),
(
't e(+)e(-) or e+e- Colliders',
OrOp(
KeywordOp(Keyword('title'), Value('e(+)e(-)')),
- KeywordOp(Keyword('title'), Value('e+e- Colliders'))
- )
+ KeywordOp(Keyword('title'), Value('e+e- Colliders')),
+ ),
+ ),
+ (
+ 'title: Si-28(p(pol.),n(pol.))',
+ KeywordOp(Keyword('title'), Value('Si-28(p(pol.),n(pol.))')),
+ ),
+ (
+ 't Si28(p→,p→′)Si28(6−,T=1)',
+ KeywordOp(Keyword('title'), Value('Si28(p→,p→′)Si28(6−,T=1)')),
+ ),
+ (
+ 't C-12(vec-p,vec-n)N-12 (g.s.,1+)',
+ KeywordOp(Keyword('title'), Value('C-12(vec-p,vec-n)N-12 (g.s.,1+)')),
),
- ('title: Si-28(p(pol.),n(pol.))', KeywordOp(Keyword('title'), Value('Si-28(p(pol.),n(pol.))'))),
- ('t Si28(p→,p→′)Si28(6−,T=1)', KeywordOp(Keyword('title'), Value('Si28(p→,p→′)Si28(6−,T=1)'))),
- ('t C-12(vec-p,vec-n)N-12 (g.s.,1+)', KeywordOp(Keyword('title'), Value('C-12(vec-p,vec-n)N-12 (g.s.,1+)'))),
-
# Regex
- ('author:/^Ellis, (J|John)$/', KeywordOp(Keyword('author'), RegexValue('^Ellis, (J|John)$'))),
- ('title:/dense ([^ $]* )?matter/', KeywordOp(Keyword('title'), RegexValue('dense ([^ $]* )?matter'))),
-
+ (
+ 'author:/^Ellis, (J|John)$/',
+ KeywordOp(Keyword('author'), RegexValue('^Ellis, (J|John)$')),
+ ),
+ (
+ 'title:/dense ([^ $]* )?matter/',
+ KeywordOp(Keyword('title'), RegexValue('dense ([^ $]* )?matter')),
+ ),
# Nestable keywords
(
'referstox:author:s.p.martin.1',
- NestedKeywordOp(Keyword('referstox'), KeywordOp(Keyword('author'), Value('s.p.martin.1')))
- ),
+ NestedKeywordOp(
+ Keyword('referstox'),
+ KeywordOp(Keyword('author'), Value('s.p.martin.1')),
+ ),
+ ),
(
'find a parke, s j and refersto author witten',
AndOp(
KeywordOp(Keyword('author'), Value('parke, s j')),
- NestedKeywordOp(Keyword('refersto'), KeywordOp(Keyword('author'), Value('witten')))
- )
- ),
+ NestedKeywordOp(
+ Keyword('refersto'), KeywordOp(Keyword('author'), Value('witten'))
+ ),
+ ),
+ ),
(
'citedbyx:author:s.p.martin.1',
- NestedKeywordOp(Keyword('citedbyx'), KeywordOp(Keyword('author'), Value('s.p.martin.1')))
- ),
+ NestedKeywordOp(
+ Keyword('citedbyx'), KeywordOp(Keyword('author'), Value('s.p.martin.1'))
+ ),
+ ),
(
'citedby:author:s.p.martin.1',
- NestedKeywordOp(Keyword('citedby'), KeywordOp(Keyword('author'), Value('s.p.martin.1')))
- ),
+ NestedKeywordOp(
+ Keyword('citedby'), KeywordOp(Keyword('author'), Value('s.p.martin.1'))
+ ),
+ ),
(
'-refersto:recid:1374998 and citedby:(A.A.Aguilar.Arevalo.1)',
AndOp(
- NotOp(NestedKeywordOp(Keyword('refersto'), KeywordOp(Keyword('control_number'), Value('1374998')))),
- NestedKeywordOp(Keyword('citedby'), ValueOp(Value('A.A.Aguilar.Arevalo.1')))
- )
- ),
+ NotOp(
+ NestedKeywordOp(
+ Keyword('refersto'),
+ KeywordOp(Keyword('control_number'), Value('1374998')),
+ )
+ ),
+ NestedKeywordOp(
+ Keyword('citedby'), ValueOp(Value('A.A.Aguilar.Arevalo.1'))
+ ),
+ ),
+ ),
(
'citedby:(author A.A.Aguilar.Arevalo.1 and not a ellis)',
NestedKeywordOp(
Keyword('citedby'),
AndOp(
KeywordOp(Keyword('author'), Value('A.A.Aguilar.Arevalo.1')),
- NotOp(KeywordOp(Keyword('author'), Value('ellis')))
- )
- )
+ NotOp(KeywordOp(Keyword('author'), Value('ellis'))),
+ ),
+ ),
),
(
'citedby:refersto:recid:1432705',
NestedKeywordOp(
Keyword('citedby'),
- NestedKeywordOp(Keyword('refersto'), KeywordOp(Keyword('control_number'), Value('1432705')))
- )
- ),
-
+ NestedKeywordOp(
+ Keyword('refersto'),
+ KeywordOp(Keyword('control_number'), Value('1432705')),
+ ),
+ ),
+ ),
# Ranges
(
- 'd 2015->2017 and cited:1->9',
- AndOp(
- KeywordOp(Keyword("date"), RangeOp(Value('2015'), Value('2017'))),
- KeywordOp(Keyword('topcite'), RangeOp(Value('1'), Value('9')))
- )
- ),
-
+ 'd 2015->2017 and cited:1->9',
+ AndOp(
+ KeywordOp(Keyword("date"), RangeOp(Value('2015'), Value('2017'))),
+ KeywordOp(Keyword('topcite'), RangeOp(Value('1'), Value('9'))),
+ ),
+ ),
# Empty query
('', EmptyQuery()),
(' ', EmptyQuery()),
-
# G, GE, LT, LE, E queries
(
- 'date > 2000-10 and date < 2000-12',
- AndOp(
- KeywordOp(Keyword('date'), GreaterThanOp(Value('2000-10'))),
- KeywordOp(Keyword('date'), LessThanOp(Value('2000-12')))
- )
- ),
+ 'date > 2000-10 and date < 2000-12',
+ AndOp(
+ KeywordOp(Keyword('date'), GreaterThanOp(Value('2000-10'))),
+ KeywordOp(Keyword('date'), LessThanOp(Value('2000-12'))),
+ ),
+ ),
(
- 'date after 10/2000 and date before 2000-12',
- AndOp(
- KeywordOp(Keyword('date'), GreaterThanOp(Value('10/2000'))),
- KeywordOp(Keyword('date'), LessThanOp(Value('2000-12')))
- )
- ),
+ 'date after 10/2000 and date before 2000-12',
+ AndOp(
+ KeywordOp(Keyword('date'), GreaterThanOp(Value('10/2000'))),
+ KeywordOp(Keyword('date'), LessThanOp(Value('2000-12'))),
+ ),
+ ),
(
'date >= nov 2000 and d<=2005',
AndOp(
KeywordOp(Keyword('date'), GreaterEqualThanOp(Value('nov 2000'))),
- KeywordOp(Keyword('date'), LessEqualThanOp(Value('2005')))
- )
+ KeywordOp(Keyword('date'), LessEqualThanOp(Value('2005'))),
+ ),
),
(
'date 1978+ + -ac 100+',
AndOp(
- KeywordOp(Keyword('date'), GreaterEqualThanOp(Value('1978'))),
- NotOp(KeywordOp(Keyword('author-count'), GreaterEqualThanOp(Value('100'))))
- )
- ),
+ KeywordOp(Keyword('date'), GreaterEqualThanOp(Value('1978'))),
+ NotOp(
+ KeywordOp(Keyword('author-count'), GreaterEqualThanOp(Value('100')))
+ ),
+ ),
+ ),
(
'f a wimpenny and date = 1987',
AndOp(
KeywordOp(Keyword('author'), Value('wimpenny')),
- KeywordOp(Keyword('date'), Value('1987')))
- ),
-
+ KeywordOp(Keyword('date'), Value('1987')),
+ ),
+ ),
# Date specifiers
(
'date today - 2 and title foo',
AndOp(
- KeywordOp(Keyword('date'), Value(str(date.today() - timedelta(days=2)))),
- KeywordOp(Keyword('title'), Value('foo'))
- )
- ),
+ KeywordOp(
+ Keyword('date'), Value(str(date.today() - timedelta(days=2)))
+ ),
+ KeywordOp(Keyword('title'), Value('foo')),
+ ),
+ ),
(
'date today - 0 and title foo',
AndOp(
KeywordOp(Keyword('date'), Value(str(date.today()))),
- KeywordOp(Keyword('title'), Value('foo'))
- )
- ),
+ KeywordOp(Keyword('title'), Value('foo')),
+ ),
+ ),
(
'date today - title foo',
AndOp(
KeywordOp(Keyword('date'), Value(str(date.today()))),
- NotOp(KeywordOp(Keyword('title'), Value('foo')))
- )
- ),
+ NotOp(KeywordOp(Keyword('title'), Value('foo'))),
+ ),
+ ),
(
'date this month and author ellis',
AndOp(
KeywordOp(Keyword('date'), Value(str(date.today()))),
- KeywordOp(Keyword('author'), Value('ellis'))
- )
- ),
+ KeywordOp(Keyword('author'), Value('ellis')),
+ ),
+ ),
(
'date this month - 3 and author ellis',
AndOp(
- KeywordOp(Keyword('date'), Value(str(date.today() - relativedelta(months=3)))),
- KeywordOp(Keyword('author'), Value('ellis'))
- )
- ),
+ KeywordOp(
+ Keyword('date'), Value(str(date.today() - relativedelta(months=3)))
+ ),
+ KeywordOp(Keyword('author'), Value('ellis')),
+ ),
+ ),
(
'date yesterday - 2 - ac 100',
AndOp(
- KeywordOp(Keyword('date'),
- Value(str(date.today() - relativedelta(days=3)))),
- NotOp(KeywordOp(Keyword('author-count'), Value('100')))
- )
- ),
+ KeywordOp(
+ Keyword('date'), Value(str(date.today() - relativedelta(days=3)))
+ ),
+ NotOp(KeywordOp(Keyword('author-count'), Value('100'))),
+ ),
+ ),
(
pytest.param(
'date last month - 2 + ac < 50',
AndOp(
- KeywordOp(Keyword('date'), Value(str((date.today() - relativedelta(months=3))))),
- KeywordOp(Keyword('author-count'), LessThanOp(Value('50')))
+ KeywordOp(
+ Keyword('date'),
+ Value(str((date.today() - relativedelta(months=3)))),
+ ),
+ KeywordOp(Keyword('author-count'), LessThanOp(Value('50'))),
+ ),
+ marks=pytest.mark.xfail(
+ reason="doesn't work on 31st of the month, see INSPIR-2882"
),
- marks=pytest.mark.xfail(reason="doesn't work on 31st of the month, see INSPIR-2882")
)
- ),
+ ),
(
'du > yesterday - 2',
KeywordOp(
Keyword('date-updated'),
- GreaterThanOp(Value(str((date.today() - relativedelta(days=3)))))
- )
- ),
-
+ GreaterThanOp(Value(str((date.today() - relativedelta(days=3))))),
+ ),
+ ),
# Wildcard queries
(
'find a \'o*aigh\' and t "alge*" and date >2013',
AndOp(
- KeywordOp(Keyword('author'), PartialMatchValue('o*aigh', contains_wildcard=True)),
+ KeywordOp(
+ Keyword('author'),
+ PartialMatchValue('o*aigh', contains_wildcard=True),
+ ),
AndOp(
- KeywordOp(Keyword('title'), ExactMatchValue('alge*'
-
- )),
- KeywordOp(Keyword('date'), GreaterThanOp(Value('2013')))
- )
- )
- ),
+ KeywordOp(Keyword('title'), ExactMatchValue('alge*')),
+ KeywordOp(Keyword('date'), GreaterThanOp(Value('2013'))),
+ ),
+ ),
+ ),
(
'a *alge | a alge* | a o*aigh',
OrOp(
KeywordOp(Keyword('author'), Value('*alge', contains_wildcard=True)),
OrOp(
- KeywordOp(Keyword('author'), Value('alge*', contains_wildcard=True)),
- KeywordOp(Keyword('author'), Value('o*aigh', contains_wildcard=True))
- )
- )
- ),
+ KeywordOp(
+ Keyword('author'), Value('alge*', contains_wildcard=True)
+ ),
+ KeywordOp(
+ Keyword('author'), Value('o*aigh', contains_wildcard=True)
+ ),
+ ),
+ ),
+ ),
(
'find texkey Hirata:1992*',
- KeywordOp(Keyword('texkeys.raw'), Value('Hirata:1992*', contains_wildcard=True))
+ KeywordOp(
+ Keyword('texkeys.raw'), Value('Hirata:1992*', contains_wildcard=True)
+ ),
),
-
# Queries for implicit "and" removal
('title and foo', AndOp(ValueOp(Value('title')), ValueOp(Value('foo')))),
('author takumi doi', KeywordOp(Keyword('author'), Value('takumi doi'))),
(
'title cms and title experiment and date 2008',
AndOp(
- KeywordOp(Keyword('title'), Value('cms')),
- AndOp(
- KeywordOp(Keyword('title'), Value('experiment')),
- KeywordOp(Keyword('date'), Value('2008'))
- )
- )
+ KeywordOp(Keyword('title'), Value('cms')),
+ AndOp(
+ KeywordOp(Keyword('title'), Value('experiment')),
+ KeywordOp(Keyword('date'), Value('2008')),
+ ),
+ ),
),
(
'author:witten title:foo',
AndOp(
KeywordOp(Keyword('author'), Value('witten')),
- KeywordOp(Keyword('title'), Value('foo'))
- )
+ KeywordOp(Keyword('title'), Value('foo')),
+ ),
),
-
# Unrecognized queries
(
'title γ-radiation and and',
QueryWithMalformedPart(
KeywordOp(Keyword('title'), Value('γ-radiation')),
- MalformedQuery(['and', 'and'])
- )
- ),
- ('find j Nucl.Phys.,A531,11', KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A531,11'))),
+ MalformedQuery(['and', 'and']),
+ ),
+ ),
+ (
+ 'find j Nucl.Phys.,A531,11',
+ KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A531,11')),
+ ),
(
'find j Nucl.Phys. and j Nucl.Phys.',
AndOp(
KeywordOp(Keyword('journal'), Value('Nucl.Phys.')),
- KeywordOp(Keyword('journal'), Value('Nucl.Phys.'))
- )
+ KeywordOp(Keyword('journal'), Value('Nucl.Phys.')),
+ ),
),
(
'find j Nucl.Phys. and vol A351 and author ellis',
AndOp(
KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A351')),
- KeywordOp(Keyword('author'), Value('ellis'))
- )
+ KeywordOp(Keyword('author'), Value('ellis')),
+ ),
),
(
- 'find j Nucl.Phys. and vol A351 and author ellis and author smith and ea john',
+ (
+ 'find j Nucl.Phys. and vol A351 and author ellis and author smith and'
+ ' ea john'
+ ),
AndOp(
KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A351')),
AndOp(
KeywordOp(Keyword('author'), Value('ellis')),
AndOp(
KeywordOp(Keyword('author'), Value('smith')),
- KeywordOp(Keyword('exact-author'), Value('john'))
- )
- )
- )
+ KeywordOp(Keyword('exact-author'), Value('john')),
+ ),
+ ),
+ ),
+ ),
+ (
+ 'find j Nucl.Phys. and vol A531',
+ KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A531')),
),
- ('find j Nucl.Phys. and vol A531', KeywordOp(Keyword('journal'), Value('Nucl.Phys.,A531'))),
(
'find j Nucl.Phys. and author ellis',
AndOp(
KeywordOp(Keyword('journal'), Value('Nucl.Phys.')),
- KeywordOp(Keyword('author'), Value('ellis'))
- )
+ KeywordOp(Keyword('author'), Value('ellis')),
+ ),
),
(
'find author ellis and j Nucl.Phys. and vol B351 and title Collider',
@@ -548,21 +629,14 @@
KeywordOp(Keyword('author'), Value('ellis')),
AndOp(
KeywordOp(Keyword('journal'), Value('Nucl.Phys.,B351')),
- KeywordOp(Keyword('title'), Value('Collider'))
- )
- )
+ KeywordOp(Keyword('title'), Value('Collider')),
+ ),
+ ),
),
(
- 'find author ellis and j Nucl.Phys. and vol B351 and title Collider',
- AndOp(
- KeywordOp(Keyword('author'), Value('ellis')),
- AndOp(
- KeywordOp(Keyword('journal'), Value('Nucl.Phys.,B351')),
- KeywordOp(Keyword('title'), Value('Collider'))
- )
- )
+ 'find j Nucl.Phys. and not vol A531',
+ KeywordOp(Keyword('journal'), Value('Nucl.Phys.')),
),
- ('find j Nucl.Phys. and not vol A531', KeywordOp(Keyword('journal'), Value('Nucl.Phys.'))),
# regression with date keyword followed by string not containing date
(
"find da Silva and j Nucl.Phys.",
@@ -577,11 +651,11 @@
KeywordOp(Keyword('journal'), Value('Nucl.Phys.')),
AndOp(
KeywordOp(Keyword('author'), Value('ellis')),
- KeywordOp(Keyword('author'), Value('john'))
- )
- )
- )
- ]
+ KeywordOp(Keyword('author'), Value('john')),
+ ),
+ ),
+ ),
+ ],
)
def test_restructuring_visitor_functionality(query_str, expected_parse_tree):
print("Parsing: " + query_str)
@@ -600,28 +674,27 @@ def test_foo_bar():
restructuring_visitor = RestructuringVisitor()
_, parse_tree = stateful_parser.parse(query_str, parser.Query)
parse_tree = parse_tree.accept(restructuring_visitor)
- expected_parse_tree = AndOp(KeywordOp(Keyword('journal'), Value('Nucl.Phys.')),
- KeywordOp(Keyword('author'), Value('ellis')))
+ expected_parse_tree = AndOp(
+ KeywordOp(Keyword('journal'), Value('Nucl.Phys.')),
+ KeywordOp(Keyword('author'), Value('ellis')),
+ )
assert parse_tree == expected_parse_tree
@pytest.mark.parametrize(
- ['query_str', 'expected_parse_tree'],
+ ('query_str', 'expected_parse_tree'),
[
(
'sungtae cho or 1301.7261',
- OrOp(
- ValueOp(Value('sungtae cho')),
- ValueOp(Value('1301.7261'))
- )
+ OrOp(ValueOp(Value('sungtae cho')), ValueOp(Value('1301.7261'))),
),
(
'raffaele d\'agnolo and not cn cms',
AndOp(
ValueOp(Value('raffaele d\'agnolo')),
- NotOp(KeywordOp(Keyword('collaboration'), Value('cms')))
- )
+ NotOp(KeywordOp(Keyword('collaboration'), Value('cms'))),
+ ),
),
('a kondrashuk', KeywordOp(Keyword('author'), Value('kondrashuk'))),
('a r.j.hill.1', KeywordOp(Keyword('author'), Value('r.j.hill.1'))),
@@ -630,29 +703,31 @@ def test_foo_bar():
OrOp(
KeywordOp(Keyword('author'), Value('fileviez perez,p')),
KeywordOp(Keyword('author'), Value('p. f. perez')),
- )
+ ),
),
(
'a espinosa,jose r and not a rodriguez espinosa',
AndOp(
KeywordOp(Keyword('author'), Value('espinosa,jose r')),
NotOp(KeywordOp(Keyword('author'), Value('rodriguez espinosa'))),
- )
+ ),
),
(
'a nilles,h and not tc I',
AndOp(
KeywordOp(Keyword('author'), Value('nilles,h')),
NotOp(KeywordOp(Keyword('type-code'), Value('I'))),
- )
+ ),
),
(
- 'a rojo,j. or rojo-chacon,j. and not collaboration pierre auger '
- 'and not collaboration auger and not t auger and tc p',
+ (
+ 'a rojo,j. or rojo-chacon,j. and not collaboration pierre auger '
+ 'and not collaboration auger and not t auger and tc p'
+ ),
AndOp(
OrOp(
KeywordOp(Keyword('author'), Value('rojo,j.')),
- KeywordOp(Keyword('author'), Value('rojo-chacon,j.'))
+ KeywordOp(Keyword('author'), Value('rojo-chacon,j.')),
),
AndOp(
NotOp(KeywordOp(Keyword('collaboration'), Value('pierre auger'))),
@@ -660,45 +735,69 @@ def test_foo_bar():
NotOp(KeywordOp(Keyword('collaboration'), Value('auger'))),
AndOp(
NotOp(KeywordOp(Keyword('title'), Value('auger'))),
- KeywordOp(Keyword('type-code'), Value('p'))
- )
- )
- )
- )
+ KeywordOp(Keyword('type-code'), Value('p')),
+ ),
+ ),
+ ),
+ ),
),
- ('ea wu, xing gang', KeywordOp(Keyword('exact-author'), Value('wu, xing gang'))),
- ('abstract: part*', KeywordOp(Keyword('abstract'), Value('part*', contains_wildcard=True))),
(
- "(author:'Hiroshi Okada' OR (author:'H Okada' hep-ph) OR "
- "title: 'Dark matter in supersymmetric U(1(B-L) model' OR "
- "title: 'Non-Abelian discrete symmetry for flavors')",
+ 'ea wu, xing gang',
+ KeywordOp(Keyword('exact-author'), Value('wu, xing gang')),
+ ),
+ (
+ 'abstract: part*',
+ KeywordOp(Keyword('abstract'), Value('part*', contains_wildcard=True)),
+ ),
+ (
+ (
+ "(author:'Hiroshi Okada' OR (author:'H Okada' hep-ph) OR "
+ "title: 'Dark matter in supersymmetric U(1(B-L) model' OR "
+ "title: 'Non-Abelian discrete symmetry for flavors')"
+ ),
OrOp(
KeywordOp(Keyword('author'), PartialMatchValue('Hiroshi Okada')),
OrOp(
AndOp(
KeywordOp(Keyword('author'), PartialMatchValue('H Okada')),
- ValueOp(Value('hep-ph'))
+ ValueOp(Value('hep-ph')),
),
OrOp(
- KeywordOp(Keyword('title'), PartialMatchValue('Dark matter in supersymmetric U(1(B-L) model')),
- KeywordOp(Keyword('title'), PartialMatchValue('Non-Abelian discrete symmetry for flavors')),
- )
- )
- )
+ KeywordOp(
+ Keyword('title'),
+ PartialMatchValue(
+ 'Dark matter in supersymmetric U(1(B-L) model'
+ ),
+ ),
+ KeywordOp(
+ Keyword('title'),
+ PartialMatchValue(
+ 'Non-Abelian discrete symmetry for flavors'
+ ),
+ ),
+ ),
+ ),
+ ),
),
(
'author:"Takayanagi, Tadashi" or hep-th/0010101',
OrOp(
KeywordOp(Keyword('author'), ExactMatchValue('Takayanagi, Tadashi')),
- ValueOp(Value('hep-th/0010101'))
- )
+ ValueOp(Value('hep-th/0010101')),
+ ),
),
('ea:matt visser', KeywordOp(Keyword('exact-author'), Value('matt visser'))),
(
'citedby:recid:902780',
- NestedKeywordOp(Keyword('citedby'), KeywordOp(Keyword('control_number'), Value('902780')))
+ NestedKeywordOp(
+ Keyword('citedby'),
+ KeywordOp(Keyword('control_number'), Value('902780')),
+ ),
+ ),
+ (
+ 'eprint:arxiv:1706.04080',
+ KeywordOp(Keyword('eprint'), Value('arxiv:1706.04080')),
),
- ('eprint:arxiv:1706.04080', KeywordOp(Keyword('eprint'), Value('arxiv:1706.04080'))),
('eprint:1706.04080', KeywordOp(Keyword('eprint'), Value('1706.04080'))),
(
'f a ostapchenko not olinto not haungs',
@@ -706,18 +805,29 @@ def test_foo_bar():
KeywordOp(Keyword('author'), Value('ostapchenko')),
AndOp(
NotOp(KeywordOp(Keyword('author'), Value('olinto'))),
- NotOp(KeywordOp(Keyword('author'), Value('haungs')))
- )
- )
+ NotOp(KeywordOp(Keyword('author'), Value('haungs'))),
+ ),
+ ),
),
('find cc italy', KeywordOp(Keyword('country'), Value('italy'))),
- ('fin date > today', KeywordOp(Keyword('date'), GreaterThanOp(Value(str(date.today()))))),
- ('find r atlas-conf-*', KeywordOp(Keyword('reportnumber'), Value('atlas-conf-*', contains_wildcard=True))),
+ (
+ 'fin date > today',
+ KeywordOp(Keyword('date'), GreaterThanOp(Value(str(date.today())))),
+ ),
+ (
+ 'find r atlas-conf-*',
+ KeywordOp(
+ Keyword('reportnumber'), Value('atlas-conf-*', contains_wildcard=True)
+ ),
+ ),
(
'find caption "Diagram for the fermion flow violating process"',
- KeywordOp(Keyword('caption'), ExactMatchValue('Diagram for the fermion flow violating process'))
- )
- ]
+ KeywordOp(
+ Keyword('caption'),
+ ExactMatchValue('Diagram for the fermion flow violating process'),
+ ),
+ ),
+ ],
)
def test_parsing_output_with_inspire_next_tests(query_str, expected_parse_tree):
print("Parsing: " + query_str)
@@ -730,32 +840,30 @@ def test_parsing_output_with_inspire_next_tests(query_str, expected_parse_tree):
def test_convert_simple_value_boolean_query_to_and_boolean_queries():
- parse_tree = \
- parser.SimpleQuery(
- parser.SpiresKeywordQuery(
- parser.InspireKeyword('author'),
- parser.Value(
+ parse_tree = parser.SimpleQuery(
+ parser.SpiresKeywordQuery(
+ parser.InspireKeyword('author'),
+ parser.Value(
+ parser.SimpleValueBooleanQuery(
+ parser.SimpleValue('foo'),
+ parser.And(),
parser.SimpleValueBooleanQuery(
- parser.SimpleValue('foo'),
- parser.And(),
- parser.SimpleValueBooleanQuery(
- parser.SimpleValue('bar'),
- parser.Or(),
- parser.SimpleValueNegation(parser.SimpleValue('foobar'))
- )
- )
+ parser.SimpleValue('bar'),
+ parser.Or(),
+ parser.SimpleValueNegation(parser.SimpleValue('foobar')),
+ ),
)
- )
+ ),
)
+ )
- expected_parse_tree = \
- AndOp(
- KeywordOp(Keyword('author'), Value('foo')),
- OrOp(
- KeywordOp(Keyword('author'), Value('bar')),
- NotOp(KeywordOp(Keyword('author'), Value('foobar')))
- )
- )
+ expected_parse_tree = AndOp(
+ KeywordOp(Keyword('author'), Value('foo')),
+ OrOp(
+ KeywordOp(Keyword('author'), Value('bar')),
+ NotOp(KeywordOp(Keyword('author'), Value('foobar'))),
+ ),
+ )
restructuring_visitor = RestructuringVisitor()
parse_tree = parse_tree.accept(restructuring_visitor)
diff --git a/tests/test_visitor_utils.py b/tests/test_visitor_utils.py
index 3b72602..b0ce7a1 100644
--- a/tests/test_visitor_utils.py
+++ b/tests/test_visitor_utils.py
@@ -22,7 +22,8 @@
from __future__ import absolute_import, print_function, unicode_literals
-from pytest import raises
+import pytest
+from test_utils import parametrize
from inspire_query_parser.utils.visitor_utils import (
_truncate_wildcard_from_date,
@@ -34,23 +35,18 @@
wrap_query_in_nested_if_field_is_nested,
)
-from test_utils import parametrize
-
-@parametrize({
- 'Name with full name parts': {
- 'name': 'mele salvatore', 'expected_answer': True
- },
- 'Lastname only': {
- 'name': 'mele', 'expected_answer': False
- },
- 'Lastname, initial(Firstname)': {
- 'name': 'mele s', 'expected_answer': False
- },
- 'Lastname, initial(Firstname).': {
- 'name': 'mele s.', 'expected_answer': False
- },
-})
+@parametrize(
+ {
+ 'Name with full name parts': {
+ 'name': 'mele salvatore',
+ 'expected_answer': True,
+ },
+ 'Lastname only': {'name': 'mele', 'expected_answer': False},
+ 'Lastname, initial(Firstname)': {'name': 'mele s', 'expected_answer': False},
+ 'Lastname, initial(Firstname).': {'name': 'mele s.', 'expected_answer': False},
+ }
+)
def test_author_name_contains_fullnames(name, expected_answer):
assert expected_answer == author_name_contains_fullnames(name)
@@ -94,7 +90,7 @@ def test_generate_minimal_name_variations_with_dotted_initial():
assert expected_variations == set(generate_minimal_name_variations(name))
-def test_generate_minimal_name_variations_without_dotted_initial_doesnt_generate_same_variation():
+def test_generate_minimal_name_variations_without_dotted_initial_doesnt_generate_same_variation():  # noqa: E501
name = 'Oz, Y'
expected_variations = {
'oz y',
@@ -108,7 +104,7 @@ def test_generate_minimal_name_variations_without_dotted_initial_doesnt_generate
assert expected_variations == set(result)
-def test_generate_minimal_name_variations_with_initial_strips_multiple_consecutive_whitespace():
+def test_generate_minimal_name_variations_with_initial_strips_multiple_consecutive_whitespace():  # noqa: E501
name = 'oz,y'
expected_variations = {
'oz y',
@@ -132,63 +128,61 @@ def test_generate_minimal_name_variations_with_dashed_lastname():
assert expected_variations == generate_minimal_name_variations(name)
-@parametrize({
- 'Wildcard as whole day': {
- 'date': '2018-01-*', 'expected_date': '2018-01'
- },
- 'Wildcard as part of the day': {
- 'date': '2018-01-1*', 'expected_date': '2018-01'
- },
- 'Wildcard as whole day (space separated)': {
- 'date': '2018 01 *', 'expected_date': '2018-01'
- },
- 'Wildcard as part of the day (space separated)': {
- 'date': '2018 01 1*', 'expected_date': '2018-01'
- },
-
- 'Wildcard as whole month': {
- 'date': '2018-*', 'expected_date': '2018'
- },
- 'Wildcard as part of the month': {
- 'date': '2018-*', 'expected_date': '2018'
- },
- 'Wildcard as whole month (space separated)': {
- 'date': '2018 *', 'expected_date': '2018'
- },
- 'Wildcard as part of the month (space separated)': {
- 'date': '2018 1*', 'expected_date': '2018'
- },
-})
+@parametrize(
+ {
+ 'Wildcard as whole day': {'date': '2018-01-*', 'expected_date': '2018-01'},
+ 'Wildcard as part of the day': {
+ 'date': '2018-01-1*',
+ 'expected_date': '2018-01',
+ },
+ 'Wildcard as whole day (space separated)': {
+ 'date': '2018 01 *',
+ 'expected_date': '2018-01',
+ },
+ 'Wildcard as part of the day (space separated)': {
+ 'date': '2018 01 1*',
+ 'expected_date': '2018-01',
+ },
+ 'Wildcard as whole month': {'date': '2018-*', 'expected_date': '2018'},
+ 'Wildcard as part of the month': {'date': '2018-*', 'expected_date': '2018'},
+ 'Wildcard as whole month (space separated)': {
+ 'date': '2018 *',
+ 'expected_date': '2018',
+ },
+ 'Wildcard as part of the month (space separated)': {
+ 'date': '2018 1*',
+ 'expected_date': '2018',
+ },
+ }
+)
def test_truncate_wildcard_from_date_with_wildcard(date, expected_date):
assert _truncate_wildcard_from_date(date) == expected_date
def test_truncate_wildcard_from_date_throws_on_wildcard_in_year():
date = '201*'
- with raises(ValueError):
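+    # match= is applied as re.search against the exception message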
+ with pytest.raises(ValueError, match='Erroneous date value:'):
_truncate_wildcard_from_date(date)
def test_truncate_wildcard_from_date_throws_with_unsupported_separator():
date = '2018_1*'
- with raises(ValueError):
+ with pytest.raises(ValueError, match='Erroneous date value:'):
_truncate_wildcard_from_date(date)
def test_generate_match_query_with_bool_value():
generated_match_query = generate_match_query('core', True, with_operator_and=True)
- expected_match_query = {
- 'match': {
- 'core': True
- }
- }
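+    # a boolean value produces a bare match query, even with with_operator_and=True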
+ expected_match_query = {'match': {'core': True}}
assert generated_match_query == expected_match_query
def test_generate_match_query_with_operator_and():
- generated_match_query = generate_match_query('author', 'Ellis, John', with_operator_and=True)
+ generated_match_query = generate_match_query(
+ 'author', 'Ellis, John', with_operator_and=True
+ )
expected_match_query = {
'match': {
@@ -203,13 +197,11 @@ def test_generate_match_query_with_operator_and():
def test_generate_match_query_with_operator_and_false():
- generated_match_query = generate_match_query('document_type', 'book', with_operator_and=False)
+ generated_match_query = generate_match_query(
+ 'document_type', 'book', with_operator_and=False
+ )
- expected_match_query = {
- 'match': {
- 'document_type': 'book'
- }
- }
+ expected_match_query = {'match': {'document_type': 'book'}}
assert generated_match_query == expected_match_query
@@ -220,8 +212,9 @@ def test_wrap_queries_in_bool_clauses_if_more_than_one_with_two_queries():
{'match': {'subject': 'hep'}},
]
- generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(queries,
- use_must_clause=True)
+ generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(
+ queries, use_must_clause=True
+ )
expected_bool_clause = {
'bool': {
@@ -235,58 +228,54 @@ def test_wrap_queries_in_bool_clauses_if_more_than_one_with_two_queries():
assert generated_bool_clause == expected_bool_clause
-def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_drops_bool_clause_with_flag_disabled():
+def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_drops_bool_clause_with_flag_disabled():  # noqa: E501
queries = [
{'match': {'title': 'collider'}},
]
- generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(queries,
- use_must_clause=True)
+ generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(
+ queries, use_must_clause=True
+ )
expected_bool_clause = {'match': {'title': 'collider'}}
assert generated_bool_clause == expected_bool_clause
-def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_preserves_bool_clause_with_flag_enabled():
+def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_preserves_bool_clause_with_flag_enabled():  # noqa: E501
queries = [
{'match': {'title': 'collider'}},
]
- generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(queries,
- use_must_clause=True,
- preserve_bool_semantics_if_one_clause=True)
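+    # preserve_bool_semantics_if_one_clause=True keeps the bool/must wrapper
+    # even for a single query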
+ generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(
+ queries, use_must_clause=True, preserve_bool_semantics_if_one_clause=True
+ )
- expected_bool_clause = {
- 'bool': {
- 'must': [
- {'match': {'title': 'collider'}}
- ]
- }
- }
+ expected_bool_clause = {'bool': {'must': [{'match': {'title': 'collider'}}]}}
assert generated_bool_clause == expected_bool_clause
-def test_wrap_queries_in_bool_clauses_if_more_than_one_with_no_query_returns_empty_dict():
+def test_wrap_queries_in_bool_clauses_if_more_than_one_with_no_query_returns_empty_dict():  # noqa: E501
queries = []
- generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(queries,
- use_must_clause=True)
+ generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(
+ queries, use_must_clause=True
+ )
expected_bool_clause = {}
assert generated_bool_clause == expected_bool_clause
-def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_generates_should_clause():
+def test_wrap_queries_in_bool_clauses_if_more_than_one_with_one_query_generates_should_clause():  # noqa: E501
queries = [
{'match': {'title': 'collider'}},
]
- generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(queries,
- use_must_clause=False,
- preserve_bool_semantics_if_one_clause=True)
+ generated_bool_clause = wrap_queries_in_bool_clauses_if_more_than_one(
+ queries, use_must_clause=False, preserve_bool_semantics_if_one_clause=True
+ )
expected_bool_clause = {
'bool': {
@@ -322,7 +311,7 @@ def test_generate_nested_query():
{'match': {'journal.volume': 'D42'}},
]
}
- }
+ },
}
}
@@ -343,17 +332,18 @@ def test_generate_nested_query_returns_empty_dict_on_falsy_query():
def test_wrap_query_in_nested_if_field_is_nested():
query = {'match': {'title.name': 'collider'}}
- generated_query = wrap_query_in_nested_if_field_is_nested(query, 'title.name', ['title'])
+ generated_query = wrap_query_in_nested_if_field_is_nested(
+ query, 'title.name', ['title']
+ )
expected_query = {
- 'nested': {
- 'path': 'title',
- 'query': {'match': {'title.name': 'collider'}}
- }
+ 'nested': {'path': 'title', 'query': {'match': {'title.name': 'collider'}}}
}
assert generated_query == expected_query
- generated_query_2 = wrap_query_in_nested_if_field_is_nested(query, 'title.name', ['authors'])
+ generated_query_2 = wrap_query_in_nested_if_field_is_nested(
+ query, 'title.name', ['authors']
+ )
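+    # 'title.name' does not belong to the 'authors' nested path, so the query
+    # is returned as-is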
assert generated_query_2 == query