From 72e5a8aef302562e03dd79f8fbe851a8297b9439 Mon Sep 17 00:00:00 2001 From: Pierre Gayvallet Date: Thu, 9 Jan 2025 08:04:29 +0100 Subject: [PATCH] [NL-to-ESQL] update internal documentation (#205853) ## Summary Fix https://github.com/elastic/kibana/issues/205606 - Re-generate the internal ES|QL documentation using the generation script (+ human review) - Add more scenario to the NL-to-ESQL evaluation suite - Some prompt engineering - improving the system instructions / functions summary - add more examples to the summary - adapt a few opinionated examples for some specific functions ## Evaluation - average based on 4 runs for each model/branch tuple - the new tests were locally added to main to run against the same suite and properly evaluate the difference | Model | before (main) | after (PR) | delta | | ------------- | ------------- | ------------- | ------------- | | GPT-4o | 90.9 | 97.74 | + 6.84 | | Claude 3.5 Sonnet v2 | 88.58 | 96.49 | +7.91 | | Gemini 1.5-pro-002 | 88.17 | 94.19 | +6.02 | Overall, the prompt engineering somewhat significantly improved the generation efficiency. --- .../evaluation/scenarios/esql/index.spec.ts | 462 +++++++++++++++++- .../scripts/load_esql_docs/load_esql_docs.ts | 6 +- .../inference/scripts/util/kibana_client.ts | 4 +- .../tasks/nl_to_esql/doc_base/suggestions.ts | 5 + .../nl_to_esql/esql_docs/esql-bit_length.txt | 24 + .../nl_to_esql/esql_docs/esql-bucket.txt | 22 +- .../nl_to_esql/esql_docs/esql-byte_length.txt | 22 + .../tasks/nl_to_esql/esql_docs/esql-case.txt | 14 +- .../nl_to_esql/esql_docs/esql-categorize.txt | 30 ++ .../esql_docs/esql-date_extract.txt | 2 +- .../nl_to_esql/esql_docs/esql-date_parse.txt | 5 +- .../tasks/nl_to_esql/esql_docs/esql-eval.txt | 9 + .../tasks/nl_to_esql/esql_docs/esql-exp.txt | 4 +- .../tasks/nl_to_esql/esql_docs/esql-hash.txt | 30 ++ .../tasks/nl_to_esql/esql_docs/esql-hypot.txt | 28 ++ .../tasks/nl_to_esql/esql_docs/esql-keep.txt | 28 +- .../tasks/nl_to_esql/esql_docs/esql-limit.txt | 13 + .../tasks/nl_to_esql/esql_docs/esql-match.txt | 36 ++ .../nl_to_esql/esql_docs/esql-mv_avg.txt | 5 +- .../nl_to_esql/esql_docs/esql-mv_count.txt | 5 +- .../nl_to_esql/esql_docs/esql-mv_first.txt | 5 +- .../nl_to_esql/esql_docs/esql-mv_last.txt | 5 +- .../nl_to_esql/esql_docs/esql-mv_max.txt | 7 +- .../esql-mv_median_absolute_deviation.txt | 24 + .../nl_to_esql/esql_docs/esql-mv_min.txt | 5 +- .../esql_docs/esql-mv_percentile.txt | 26 + .../nl_to_esql/esql_docs/esql-mv_sort.txt | 10 +- .../nl_to_esql/esql_docs/esql-operators.txt | 6 +- .../nl_to_esql/esql_docs/esql-overview.txt | 3 +- .../tasks/nl_to_esql/esql_docs/esql-qstr.txt | 31 ++ .../nl_to_esql/esql_docs/esql-reverse.txt | 29 ++ .../tasks/nl_to_esql/esql_docs/esql-space.txt | 22 + .../nl_to_esql/esql_docs/esql-st_envelope.txt | 24 + .../esql_docs/esql-st_extent_agg.txt | 25 + .../nl_to_esql/esql_docs/esql-st_xmax.txt | 27 + .../nl_to_esql/esql_docs/esql-st_xmin.txt | 31 ++ .../nl_to_esql/esql_docs/esql-st_ymax.txt | 31 ++ .../nl_to_esql/esql_docs/esql-st_ymin.txt | 27 + .../nl_to_esql/esql_docs/esql-starts_with.txt | 2 +- .../tasks/nl_to_esql/esql_docs/esql-stats.txt | 21 +- .../nl_to_esql/esql_docs/esql-std_dev.txt | 29 ++ .../nl_to_esql/esql_docs/esql-syntax.txt | 9 - .../esql_docs/esql-to_dateperiod.txt | 22 + .../nl_to_esql/esql_docs/esql-to_datetime.txt | 5 +- .../esql_docs/esql-to_timeduration.txt | 23 + .../nl_to_esql/esql_docs/esql-values.txt | 1 - .../tasks/nl_to_esql/system_message.txt | 297 ++++++----- 47 files changed, 1307 insertions(+), 194 deletions(-) 
create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-bit_length.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-byte_length.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-categorize.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-hash.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-hypot.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-match.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_median_absolute_deviation.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_percentile.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-qstr.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-reverse.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-space.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_envelope.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_extent_agg.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_xmax.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_xmin.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_ymax.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_ymin.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-std_dev.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_dateperiod.txt create mode 100644 x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_timeduration.txt diff --git a/x-pack/platform/plugins/shared/inference/scripts/evaluation/scenarios/esql/index.spec.ts b/x-pack/platform/plugins/shared/inference/scripts/evaluation/scenarios/esql/index.spec.ts index 49a82db8124e9..f20fe4d6aa89d 100644 --- a/x-pack/platform/plugins/shared/inference/scripts/evaluation/scenarios/esql/index.spec.ts +++ b/x-pack/platform/plugins/shared/inference/scripts/evaluation/scenarios/esql/index.spec.ts @@ -140,7 +140,10 @@ async function evaluateEsqlQuery({ esqlDescription: docBase.getSystemMessage(), }); - const requestedDocumentation = docBase.getDocumentation(usedCommands); + const requestedDocumentation = docBase.getDocumentation(usedCommands, { + generateMissingKeywordDoc: false, + }); + requestedDocumentation.commands_and_functions = docBase.getSystemMessage(); const evaluation = await evaluationClient.evaluate({ input: ` @@ -169,6 +172,447 @@ async function evaluateEsqlQuery({ const buildTestDefinitions = (): Section[] => { const testDefinitions: Section[] = [ + { + title: 'ES|QL commands and functions usage', + tests: [ + { + title: 'using FLOOR and CEIL', + question: ` + The user is visualizing the "paris_distance" index. 
+ + Generate a query returning the 5 users closest to Paris, + and for each of them their id and the distance, rounded down and then rounded up. + + You should use the FLOOR and CEIL functions to answer this question. + + The relevant fields are: + - user_id: keyword + - distance: float - the distance between the user and Paris, in km + Note: there are other fields + `, + expected: `FROM paris_distance + | SORT distance ASC + | LIMIT 5 + | EVAL distance_down = FLOOR(distance), distance_up = CEIL(distance) + | KEEP user_id, distance_down, distance_up`, + }, + { + title: 'using MV_COUNT, MV_MAX, MV_MIN and MV_AVG', + question: ` + The user is visualizing the "sets" index, representing sets of numbers. + Each row is composed of a set_id (identifier, unique per row), and of a **multi-valued** integer + field, "values" + + Return the 5 rows containing the most values, sorted by number of values, + and for each of them, return: + - their id + - the min element + - the max element + - the average of the elements + + The relevant fields of this index are: + - set_id: keyword - the set's unique identifier + - values: multivalued integer field - the set values + Note: there are other fields + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM sets + | EVAL count = MV_COUNT(values) + | SORT count DESC + | LIMIT 5 + | EVAL min = MV_MIN(values), max = MV_MAX(values), avg = MV_AVG(values) + | KEEP set_id, min, max, avg + """ + + The query **MUST** use MV_COUNT, MV_MIN, MV_MAX and MV_AVG and **NOT** use their aggregation equivalents + or STATS BY, given the "values" field is multivalued. Not respecting this particular condition should totally fail the criteria. + `, + ], + }, + { + title: 'using LENGTH, BIT_LENGTH, BYTE_LENGTH', + question: ` + The user is visualizing the "messages" index, storing text messages. + Each row is composed of a "message_id" (keyword, unique identifier), and of "content" + field (text, content of the message).
+ + Return the 10 messages that have the most characters, sorted by number of characters, + and for each of them, return the following: + - id of the message + - length in characters of the message + - length in bytes of the message + - length in bits of the message + + The relevant fields of this index are: + - message_id: keyword - the message's unique identifier + - content: text - content of the message + Note: there are no other fields + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM messages + | EVAL length = LENGTH(content), bytes = BYTE_LENGTH(content), bits = BIT_LENGTH(content) + | SORT length DESC + | LIMIT 10 + """ + + In addition, the query **MUST**: + - use the LENGTH function + - use at least one of BIT_LENGTH and/or BYTE_LENGTH functions + - if only one of BIT_LENGTH or BYTE_LENGTH is used, properly do the conversion (1 byte = 8 bits) + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + { + title: 'using CIDR_MATCH and IP_PREFIX', + question: ` + The user is visualizing the "proxy_logs" index, storing access log entries + + The relevant fields of this index are: + - @timestamp: date - the time of the access + - source_ip: ip - source of the access + - destination_ip: ip - destination of the access + - status: integer - status code of the response + Note: there are no other fields + + Generate a query that shows the number of requests coming from the 192.168.5.0/8 subnet, + grouped by 8-bit (/8) subnetworks of the destination IP and sorted by number of entries. + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM proxy_logs + | WHERE CIDR_MATCH(source_ip, "192.168.5.0/8") + | STATS count = COUNT(*) BY subnet = IP_PREFIX(destination_ip, 8, 0) + | SORT count DESC + """ + + In addition, the query **MUST**: + - use CIDR_MATCH in the WHERE clause + - use IP_PREFIX in a STATS aggregation + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + { + title: 'using GREATEST and LEAST', + question: ` + The user is visualizing the "number_tuple" index, representing a 3-tuple of numbers. + + The relevant fields of this index are: + - bag_id: keyword - a unique identifier + - number_1: the first number of the tuple + - number_2: the second number of the tuple + - number_3: the third number of the tuple + Note: there are no other fields + + Generate a query that shows, for each bag: + - the bag id + - the sum of the 3 numbers + - the highest of the 3 numbers + - the lowest of the 3 numbers + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM number_tuple + | EVAL sum = number_1 + number_2 + number_3, highest = GREATEST(number_1, number_2, number_3), lowest = LEAST(number_1, number_2, number_3) + | KEEP bag_id, sum, highest, lowest + """ + + In addition, the query **MUST**: + - use GREATEST + - use LEAST + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + { + title: 'using MIN, MAX, MEDIAN, PERCENTILE', + question: ` + The user is visualizing the "access_logs" index, representing access logs to some http server.
+ + The relevant fields of this index are: + - @timestamp: the timestamp of the access + - status_code: the http status code + - response_time: the response time of the remote server + - response_length: the length of the response body, in bytes + Note: there are other fields + + Generate a query that shows, for entries over the past 30 days and grouped by status code: + - the minimum response time + - the maximum response time + - the median response time + - the 90th percentile of the response time + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM access_logs + | WHERE @timestamp > NOW() - 30d + | STATS min=MIN(response_time), max=MAX(response_time), med=MEDIAN(response_time), p90=PERCENTILE(response_time, 90) BY status_code + | KEEP status_code, min, max, med, p90 + """ + + In addition, the query **MUST**: + - use aggregations with STATS + - use MIN + - use MAX + - use MEDIAN + - use PERCENTILE + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + { + title: 'using LOCATE', + question: ` + The user is visualizing the "messages" index, representing text messages. + + The relevant fields of this index are: + - @timestamp: the datetime the message was sent at + - message_id: the unique id of the message + - content: the text content of the message + Note: there are other fields + + Generate a query that shows, for the 10 most recent messages containing the string "hello" in the content: + - the message id + - the datetime the message was sent at + - the first position of the "hello" string in the message content + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM messages + | WHERE content LIKE "*hello*" + | SORT @timestamp DESC + | LIMIT 10 + | EVAL position=LOCATE(content, "hello") + | KEEP message_id, @timestamp, position + """ + + In addition, the query **MUST**: + - use one of LIKE, RLIKE or LOCATE for the WHERE clause + - use EVAL and not STATS + - use LOCATE to find the position of "hello" + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + { + title: 'using TO_BASE64 and FROM_BASE64', + question: ` + The user is visualizing the "messages" index, representing text messages. + + The relevant fields of this index are: + - @timestamp: the datetime the message was sent at + - message_id: the unique id of the message + - content: the content of the message encoded as b64 + Note: there are other fields + + Generate a query that shows, for the 10 most recent messages: + - the message id, encoded as base64 + - the datetime the message was sent at + - the message content, decoded from base64 + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM messages + | SORT @timestamp DESC + | LIMIT 10 + | EVAL id_encoded=TO_BASE64(message_id), content_decoded=FROM_BASE64(content) + | KEEP id_encoded, @timestamp, content_decoded + """ + + In addition, the query **MUST**: + - use TO_BASE64 to encode message_id + - use FROM_BASE64 to decode content + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + { + title: 'using POW, PI, LOG and EXP', + question: ` + The user is visualizing the "points" index, representing two-dimensional points.
+ + The relevant fields of this index are: + - x: integer - the x position of the point + - y: integer - the y position of the point + Note: there are other fields + + Generate a query returning, for all rows: + - x + - y + - x^pi + - log2(x) + - e^y + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM points + | EVAL pow=POW(x, PI()), log=LOG(2, x), exp=EXP(y) + | KEEP x, y, pow, log, exp + """ + + In addition, the query **MUST**: + - use POW and PI + - use LOG with the right base (2) as first parameter + - use EXP or E + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + { + title: 'using CASE', + question: ` + The user is visualizing the "sample_data" index. + + The relevant fields of this index are: + - @timestamp: timestamp of the entry + - message: text - the log message + Note: there are other fields + + Generate a query returning, for all rows: + - @timestamp + - message + - a column displaying: + - IF message contains "error" then "ERROR" + - ELIF message contains "http" then "NETWORK" + - ELSE "UNKNOWN" + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM sample_data + | EVAL eval=CASE(message LIKE "*error*", "ERROR", message LIKE "*http*", "NETWORK", "UNKNOWN") + | KEEP @timestamp, message, eval + """ + + In addition, the query **MUST**: + - use CASE for the evaluated column + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + { + title: 'using DATE_DIFF', + question: ` + The user is visualizing the "personal_info" index. + + The relevant fields of this index are: + - user_name: keyword - the name of the person + - birth_date: datetime - the person's birth date + - wedding_date: datetime - the person's wedding date if wed, null otherwise + Note: there are other fields + + Generate a query returning, for the 15 oldest persons who got wed: + - their user name + - their age when they got wed + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM personal_info + | WHERE wedding_date IS NOT NULL + | LIMIT 15 + | EVAL wedding_age=DATE_DIFF("years", birth_date, wedding_date) + | KEEP user_name, wedding_age + """ + + In addition, the query **MUST**: + - use DATE_DIFF or DATE_EXTRACT to evaluate the wedding age + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + { + title: 'using DATE_EXTRACT', + question: ` + The user is visualizing the "personal_info" index. + + The relevant fields of this index are: + - user_name: keyword - the name of the person + - birth_date: datetime - the person's birth date + + Generate a query returning, for all entries in the index: + - their user name + - their year of birth + `, + criteria: [ + ` + The answer provides an ES|QL query that is functionally equivalent to: + + """esql + FROM personal_info + | EVAL birth_year=DATE_EXTRACT("year", birth_date) + | KEEP user_name, birth_year + """ + + In addition, the query **MUST**: + - use DATE_EXTRACT or DATE_TRUNC to evaluate the year of birth with the parameters at the correct position + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + { + title: 'using DATE_PARSE', + question: ` + The user is visualizing the "personal_info" index.
+ + The relevant fields of this index are: + - user_name: keyword - the name of the person + - birth_date: string - the person birth date as a string following the "yyyy-MM-dd" format, e.g. "1987-11-30" + + Generate a query returning, for the all entries in the index, sorted by date of birth + - their user name + - their date of birth + `, + criteria: [ + ` + The answer provides a ES|QL query that is functionally equivalent to: + + """esql + FROM personal_info + | EVAL birth=DATE_PARSE("yyyy-MM-dd", birth_date) + | KEEP user_name, birth + | SORT birth + """ + + In addition, the query **MUST**: + - use DATE_PARSE with the correct format as first parameter ("yyyy-MM-dd") + **Not respecting any of those particular conditions should totally fail the criteria** + `, + ], + }, + ], + }, { title: 'ES|QL query generation', tests: [ @@ -197,13 +641,13 @@ const buildTestDefinitions = (): Section[] => { { title: 'Generates a query to show employees which have a palindrome as last name', question: `From the employees index, I want to find all employees with a palindrome as last name - (which can be read the same backward and forward), and then return their last name and first name - - last_name - - first_name`, - criteria: [ - `The assistant should not provide an ES|QL query, and explicitly mention that there is no - way to check for palindromes using ES|QL.`, - ], + (which can be read the same backward and forward), and then return their last name and first name. + Assume the following fields: + - last_name: Last name of the employee (capitalized) + - first_name: First name of the employee (capitalized)`, + expected: `FROM employees + | WHERE TO_LOWER(last_name) == REVERSE(TO_LOWER(last_name)) + | KEEP last_name, first_name`, }, { title: 'Generates a query to show the top 10 domains by doc count', @@ -268,7 +712,7 @@ const buildTestDefinitions = (): Section[] => { - message - level - @timestamp`, - expected: `│ FROM sample_logs + expected: `FROM sample_logs | WHERE source IN ("foo", "bar") AND LENGTH(message) > 62 AND message NOT LIKE "*dolly*" AND level == "INFO" | STATS COUNT(*) BY day = BUCKET(@timestamp, 1d) | SORT day ASC`, diff --git a/x-pack/platform/plugins/shared/inference/scripts/load_esql_docs/load_esql_docs.ts b/x-pack/platform/plugins/shared/inference/scripts/load_esql_docs/load_esql_docs.ts index fba1d75956bf0..3c8c23c99d649 100644 --- a/x-pack/platform/plugins/shared/inference/scripts/load_esql_docs/load_esql_docs.ts +++ b/x-pack/platform/plugins/shared/inference/scripts/load_esql_docs/load_esql_docs.ts @@ -69,7 +69,7 @@ yargs(process.argv.slice(2)) }); log.info(`Using connector ${connector.connectorId}`); - const chatClient = kibanaClient.createInferenceClient({ + const inferenceClient = kibanaClient.createInferenceClient({ connectorId: connector.connectorId, }); @@ -84,14 +84,14 @@ yargs(process.argv.slice(2)) log.info(`Retrieving and converting documentation from ${builtDocsDir}...`); const extraction = await extractDocEntries({ builtDocsDir, - inferenceClient: chatClient, + inferenceClient, log, }); log.info(`Rewriting documentation...`); const docFiles = await generateDoc({ extraction, - inferenceClient: chatClient, + inferenceClient, log, }); diff --git a/x-pack/platform/plugins/shared/inference/scripts/util/kibana_client.ts b/x-pack/platform/plugins/shared/inference/scripts/util/kibana_client.ts index a3a75ea980523..7fe36df5f3fe6 100644 --- a/x-pack/platform/plugins/shared/inference/scripts/util/kibana_client.ts +++ 
b/x-pack/platform/plugins/shared/inference/scripts/util/kibana_client.ts @@ -212,10 +212,10 @@ export class KibanaClient { return this.axios .post( this.getUrl({ - pathname: `/internal/inference/chat_complete/stream`, + pathname: `/internal/inference/chat_complete`, }), body, - { responseType: 'stream', timeout: NaN } + { timeout: NaN } ) .then((response) => { return response.data; diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/doc_base/suggestions.ts b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/doc_base/suggestions.ts index 42ee960301b76..715e093a3148b 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/doc_base/suggestions.ts +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/doc_base/suggestions.ts @@ -13,6 +13,11 @@ const suggestions: Suggestion[] = [ return ['BUCKET']; } }, + (keywords) => { + if (keywords.includes('TO_DATETIME')) { + return ['DATE_PARSE']; + } + }, ]; /** diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-bit_length.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-bit_length.txt new file mode 100644 index 0000000000000..11a5b8f1728ae --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-bit_length.txt @@ -0,0 +1,24 @@ +# BIT_LENGTH + +This function calculates the bit length of a given string. + +## Syntax + +`BIT_LENGTH(string)` + +### Parameters + +#### string + +This is the string whose bit length you want to calculate. If `null` is provided, the function will return `null`. + +**Note**: Strings are in UTF-8 format, which means a single character may occupy multiple bytes. + +## Examples + +```esql +FROM airports +| WHERE country == "India" +| KEEP city +| EVAL fn_length = LENGTH(city), fn_bit_length = BIT_LENGTH(city) +``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-bucket.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-bucket.txt index 617952666542a..53d889332f414 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-bucket.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-bucket.txt @@ -30,8 +30,7 @@ BUCKET can operate in two modes: - one where the bucket size is computed based on a bucket count recommendation and a range, - and another where the bucket size is provided directly. -When the bucket size is provided directly for time interval, -it is expressed as a *timespan literal*, e.g. +When the bucket size is provided directly for time interval, it is expressed as a **timespan literal**, e.g. - GOOD: `BUCKET(@timestamp, 1 month)` - BAD: `BUCKET(@timestamp, "month")` @@ -74,19 +73,29 @@ FROM employees More examples: +*Regrouping employees in buckets based on salary and counting them* ```esql FROM employees | WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z" -| STATS c = COUNT(1) BY b = BUCKET(salary, 5000.) +| STATS c = COUNT(*) BY b = BUCKET(salary, 5000.) 
| SORT b ``` +*Group data emitted over the last 24h into 25 buckets* ```esql FROM sample_data | WHERE @timestamp >= NOW() - 1 day and @timestamp < NOW() | STATS COUNT(*) BY bucket = BUCKET(@timestamp, 25, NOW() - 1 day, NOW()) ``` +*Similar to previous example but with fixed 1 hour bucket size* +```esql +FROM sample_data +| WHERE @timestamp >= NOW() - 1 day and @timestamp < NOW() +| STATS COUNT(*) BY bucket = BUCKET(@timestamp, 1 hour) +``` + +*Group employees in 20 buckets based on their hire_date and then calculate the average salary for each bucket* ```esql FROM employees | WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z" @@ -94,9 +103,10 @@ FROM employees | SORT bucket ``` +*Similar to previous example but using fixed 1 month buckets size* ```esql FROM employees -| STATS s1 = BUCKET(salary / 1000 + 999, 50.) + 2 BY b1 = BUCKET(salary / 100 + 99, 50.), b2 = BUCKET(salary / 1000 + 999, 50.) -| SORT b1, b2 -| KEEP b1, s1, b2 +| WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z" +| STATS AVG(salary) BY bucket = BUCKET(hire_date, 1 month) +| SORT bucket ``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-byte_length.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-byte_length.txt new file mode 100644 index 0000000000000..b233190aa4cc2 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-byte_length.txt @@ -0,0 +1,22 @@ +# BYTE_LENGTH + +This function calculates the byte length of a given string. + +## Syntax + +`BYTE_LENGTH(string)` + +### Parameters + +#### string + +The text string for which the byte length is to be determined. If `null` is provided, the function will return `null`. + +## Examples + +```esql +FROM airports +| WHERE country == "India" +| KEEP city +| EVAL fn_length = LENGTH(city), fn_byte_length = BYTE_LENGTH(city) +``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-case.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-case.txt index 110f0ee1a242b..5484a8706391c 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-case.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-case.txt @@ -4,7 +4,7 @@ The CASE function accepts pairs of conditions and values. It returns the value t ## Syntax -`CASE(condition, trueValue)` +`CASE(condition, trueValue, elseValue)` ### Parameters @@ -16,16 +16,20 @@ A condition to evaluate. The value that is returned when the corresponding condition is the first to evaluate to `true`. If no condition matches, the default value is returned. +#### elseValue + +The value that will be returned when no condition evaluates to `true`. 
+ ## Examples -Determine whether employees are monolingual, bilingual, or polyglot: +In this example, employees are categorized as monolingual, bilingual, or polyglot depending on how many languages they speak: ```esql FROM employees | EVAL type = CASE( languages <= 1, "monolingual", languages <= 2, "bilingual", - "polyglot") + "polyglot") | KEEP emp_no, languages, type ``` @@ -46,6 +50,6 @@ Calculate an hourly error rate as a percentage of the total number of log messag FROM sample_data | EVAL error = CASE(message LIKE "*error*", 1, 0) | EVAL hour = DATE_TRUNC(1 hour, @timestamp) -| STATS error_rate = AVG(error) by hour +| STATS error_rate = AVG(error) BY hour | SORT hour -``` \ No newline at end of file +``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-categorize.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-categorize.txt new file mode 100644 index 0000000000000..1625aaa738448 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-categorize.txt @@ -0,0 +1,30 @@ +# CATEGORIZE + +The `CATEGORIZE` function organizes textual data into groups of similar format. + +> **Note:** The `CATEGORIZE` function is currently in technical preview and may undergo changes or be removed in future releases. + +## Syntax + +`CATEGORIZE(field)` + +### Parameters + +#### field + +The expression that is to be categorized. + +## Examples + +The following example demonstrates how to use `CATEGORIZE` to group server log messages into categories and then aggregate their counts. + +```esql +FROM sample_data +| STATS count = COUNT() BY category=CATEGORIZE(message) +``` + +## Limitations + +- `CATEGORIZE` can't be used within other expressions +- `CATEGORIZE` can't be used with multiple groupings +- `CATEGORIZE` can't be used or referenced within aggregate functions diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-date_extract.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-date_extract.txt index fa2cf8c0c88a6..555ea5b1f3cc5 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-date_extract.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-date_extract.txt @@ -10,7 +10,7 @@ The DATE_EXTRACT function is used to extract specific parts of a date. #### datePart -This is the part of the date you want to extract, such as "year", "month" or ""hour_of_day". +This is the part of the date you want to extract, such as "year", "month" or "hour_of_day". #### date diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-date_parse.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-date_parse.txt index f62cf0c5f9a4c..d949c92156ba4 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-date_parse.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-date_parse.txt @@ -1,6 +1,6 @@ # DATE_PARSE -The DATE_PARSE function is used to convert a date string into a date format based on the provided pattern. +The DATE_PARSE function is used to convert a date string into a date based on the provided format pattern. 
## Syntax @@ -23,6 +23,7 @@ ROW date_string = "2022-05-06" | EVAL date = DATE_PARSE("yyyy-MM-dd", date_string) ``` -ROW date_string = "2023-12-25" +```esql +FROM logs | EVAL date = DATE_PARSE("yyyy-MM-dd", date_string) ``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-eval.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-eval.txt index ee512ededc6c4..5cf834f5e9618 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-eval.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-eval.txt @@ -65,6 +65,15 @@ FROM employees | STATS avg_height_feet = AVG(`height * 3.281`) ``` +Any number of evaluations can be performed in a single EVAL command + +```esql +FROM triangle +| EVAL cos = COS(angle), tan = TAN(angle), sin = SIN(angle), acos=ACOS(angle), asin=ASIN(angle) +| SORT angle DESC +| LIMIT 10 +``` + ### Limitations - If a column with the same name already exists, the existing column is dropped. - If a column name is used more than once, only the rightmost duplicate creates a column. diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-exp.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-exp.txt index 0f55dc85702e5..ec8374e9a6a7b 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-exp.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-exp.txt @@ -21,6 +21,6 @@ ROW d = 5.0 ``` ```esql -ROW value = 2.0 -| EVAL result = EXP(value) +FROM geo +| EVAL exp = EXP(x) ``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-hash.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-hash.txt new file mode 100644 index 0000000000000..5485869a90e45 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-hash.txt @@ -0,0 +1,30 @@ +# HASH + +The HASH function computes the hash of a given input using a specified algorithm. + +## Syntax + +`HASH(algorithm, input)` + +### Parameters + +#### algorithm + +The hash algorithm to be used. + +The supported algorithms are: +- "MD5" +- "SHA-1" +- "SHA-256" + +#### input + +The value to be hashed. + +## Examples + +```esql +FROM messages +| EVAL hashed_content = HASH("SHA-1", content) +| KEEP message_id, hashed_content +``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-hypot.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-hypot.txt new file mode 100644 index 0000000000000..8033c8e7c33d2 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-hypot.txt @@ -0,0 +1,28 @@ +# HYPOT + +The HYPOT function is used to calculate the hypotenuse of two numbers. + +## Syntax + +`HYPOT(number1, number2)` + +### Parameters + +#### number1 + +This is a numeric value. If it's `null`, the function will also return `null`. + +#### number2 + +This is also a numeric value. If it's `null`, the function will also return `null`. + +## Examples + +Check the hypotenuse of two variables through the following example: + +```esql +ROW a = 3.0, b = 4.0 +| EVAL c = HYPOT(a, b) +``` + +Note that the HYPOT function returns the hypotenuse in double data type. Besides, if any of the numbers is infinity, the hypotenuse returns `null`. 
\ No newline at end of file diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-keep.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-keep.txt index 84d8207bdf934..b55077f6b5437 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-keep.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-keep.txt @@ -18,9 +18,9 @@ The KEEP command is used to specify which columns to return and their order. When a field name matches multiple expressions, precedence rules are applied. Fields are added in the order they appear. If one field matches multiple expressions, the following precedence rules apply (from highest to lowest priority): -1. Complete field name (no wildcards) -2. Partial wildcard expressions (for example: `fieldNam*`) -3. Wildcard only (`*`) +1. Complete field name (without wildcards) +2. Partial wildcard expressions (like `fieldNam*`) +3. Only wildcard (`*`) If a field matches two expressions with the same precedence, the rightmost expression wins. @@ -28,48 +28,46 @@ Important: only the columns in the KEEP command can be used after a KEEP command ## Examples -#### Example 1: Specifying Columns Explicitly -This example demonstrates how to explicitly specify the columns to be returned. +Return columns in a specified order: ```esql FROM employees | KEEP emp_no, first_name, last_name, height ``` -#### Example 2: Using Wildcards to Match Column Names -This example shows how to use wildcards to return all columns that match a specific pattern. +If you do not want to mention each column by name, you can use wildcards to select all columns that match a certain pattern: ```esql FROM employees | KEEP h* ``` -#### Example 3: Combining Wildcards and Explicit Column Names -This example illustrates how to combine wildcards and explicit column names, and how precedence rules are applied. +The wildcard asterisk (`*`) by itself translates to all columns that are not matched by other arguments. + +This command will first return all columns with a name that starts with `h`, followed by all other columns: ```esql FROM employees | KEEP h*, * ``` -#### Example 4: Precedence Rules with Complete Field Names -This example demonstrates how complete field names take precedence over wildcard expressions. +The following examples demonstrate how precedence rules function when a field name corresponds to multiple expressions. + +Clear field name takes precedence over wildcard expressions: ```esql FROM employees | KEEP first_name, last_name, first_name* ``` -#### Example 5: Wildcard Expressions with Same Priority -This example shows how the last wildcard expression wins when multiple wildcard expressions have the same priority. +Wildcard expressions have the same priority, with the last one winning (despite it being a less specific match): ```esql FROM employees | KEEP first_name*, last_name, first_na* ``` -#### Example 6: Simple Wildcard Expression with Lowest Precedence -This example illustrates how the simple wildcard expression `*` has the lowest precedence. +A simple wildcard expression `*` has the minimum precedence. 
The sequence of output is determined by other arguments: ```esql FROM employees diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-limit.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-limit.txt index 1a77939b4afbd..2778f82cbaff2 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-limit.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-limit.txt @@ -56,6 +56,19 @@ FROM employees | LIMIT 5 ``` +`LIMIT` can and should be used as early as possible in the query. + +For example, this query applies SORT and LIMIT as early as possible, before further computations: + +```esql +FROM sets +| EVAL count = MV_COUNT(values) +| SORT count DESC +| LIMIT 5 +| EVAL min = MV_MIN(values), max = MV_MAX(values), avg = MV_AVG(values) +| KEEP set_id, min, max, avg +``` + ## Limitations There is no way to achieve pagination with LIMIT, there is no offset parameter. diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-match.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-match.txt new file mode 100644 index 0000000000000..e75f91b018069 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-match.txt @@ -0,0 +1,36 @@ +# MATCH + +`MATCH` is a function used to execute a match query on a specified field. It works on various field types including text fields, boolean, dates, and numeric types. It returns `true` when the provided query matches the row. + +## Syntax + +`MATCH (field, query)` + +### Parameters + +#### `field` + +This represents the field that the query will target. If the field contains multiple values, +`MATCH` will process each value. + +#### `query` + +This is the value that is being searched in the provided field. + +## Examples + +In this example, `"Faulkner"` is matched against the `author` field in `books` data. `MATCH` returns `true` if it finds the provided query, in this case `"Faulkner"`, in the author field. The query then keeps the columns `book_no` and `author`, sorts by `book_no` and limits the result to 5. + +```esql +FROM books +| WHERE MATCH(author, "Faulkner") +| KEEP book_no, author +| SORT book_no +| LIMIT 5; +``` + +## Notes + +- Do not use `MATCH` in production - it is in technical preview and may be changed or removed in a future release +- `MATCH` relies on Elasticsearch Match query under the hood, and should be used for full-text search only. For more traditional + text matching, `LIKE` or `RLIKE` should be used instead.
diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_avg.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_avg.txt index 81d9eb231311b..8b071e8629cf3 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_avg.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_avg.txt @@ -19,7 +19,8 @@ ROW a=[3, 5, 1, 6] | EVAL avg_a = MV_AVG(a) ``` +**Retrieving the average value from a multivalued field** ```esql -ROW scores=[10, 20, 30, 40] -| EVAL average_score = MV_AVG(scores) +FROM bag_of_numbers +| EVAL min = MV_AVG(numbers) ``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_count.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_count.txt index 808563d91b3bf..b468be2ba6af1 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_count.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_count.txt @@ -19,7 +19,8 @@ ROW a=["foo", "zoo", "bar"] | EVAL count_a = MV_COUNT(a) ``` +**Counting the number of element in a multivalued field** ```esql -ROW b=["apple", "banana", "cherry", "date"] -| EVAL count_b = MV_COUNT(b) +FROM bag_of_numbers +| EVAL count = MV_COUNT(numbers) ``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_first.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_first.txt index 7b04ce040c7b0..3cbba2efc7425 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_first.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_first.txt @@ -19,9 +19,10 @@ ROW a="foo;bar;baz" | EVAL first_a = MV_FIRST(SPLIT(a, ";")) ``` +**Retrieving the first element from a multivalued field** ```esql -ROW b="apple;banana;cherry" -| EVAL first_b = MV_FIRST(SPLIT(b, ";")) +FROM bag_of_numbers +| EVAL first = MV_FIRST(numbers) ``` ## Notes diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_last.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_last.txt index 2a9efa61ea0d6..c85995575c486 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_last.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_last.txt @@ -21,9 +21,10 @@ ROW a="foo;bar;baz" | EVAL last_a = MV_LAST(SPLIT(a, ";")) ``` +**Retrieving the last element from a multivalued field** ```esql -ROW a="apple;banana;cherry" -| EVAL last_fruit = MV_LAST(SPLIT(a, ";")) +FROM bag_of_numbers +| EVAL last = MV_LAST(numbers) ``` ## Notes diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_max.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_max.txt index 03f894ce203a8..b38c1bd0252ed 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_max.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_max.txt @@ -21,9 +21,8 @@ ROW a=[3, 5, 1] | EVAL max_a = MV_MAX(a) ``` -MV_MAX function can be used with any column type, including `keyword` columns. 
In such cases, it selects the last string, comparing their utf-8 representation byte by byte: - +**Retrieving the max value from a multivalued field** ```esql -ROW a=["foo", "zoo", "bar"] -| EVAL max_a = MV_MAX(a) +FROM bag_of_numbers +| EVAL max = MV_MAX(numbers) ``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_median_absolute_deviation.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_median_absolute_deviation.txt new file mode 100644 index 0000000000000..6f47135f5b097 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_median_absolute_deviation.txt @@ -0,0 +1,24 @@ +# MV_MEDIAN_ABSOLUTE_DEVIATION + +The MV_MEDIAN_ABSOLUTE_DEVIATION function transforms a multi-valued field into a single-valued field that retains the median absolute deviation. It computes this as a median of the deviation of each datum from the entire sample's median. In other words, for a random variable `X`, the median absolute deviation can be represented as `median(|median(X) - X|)`. + +## Syntax + +`MV_MEDIAN_ABSOLUTE_DEVIATION(number)` + +### Parameters + +#### number + +A multi-valued expression. + +*Notice*: If the field comprises an even amount of values, the median is deduced as an average of the two central values. If the value isn't a floating-point number, the average values are rounded towards 0. + +## Examples + +```esql +ROW values = [0, 2, 5, 6] +| EVAL median_absolute_deviation = MV_MEDIAN_ABSOLUTE_DEVIATION(values), median = MV_MEDIAN(values) +``` + +This example illustrates the computation of the median absolute deviation and the median from a list of numerical values. diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_min.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_min.txt index 97cb8db004cda..514f25420331b 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_min.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_min.txt @@ -19,7 +19,8 @@ ROW a=[2, 1] | EVAL min_a = MV_MIN(a) ``` +**Retrieving the min value from a multivalued field** ```esql -ROW a=["foo", "bar"] -| EVAL min_a = MV_MIN(a) +FROM bag_of_numbers +| EVAL min = MV_MIN(numbers) ``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_percentile.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_percentile.txt new file mode 100644 index 0000000000000..1d6d0d802f6ec --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_percentile.txt @@ -0,0 +1,26 @@ +# MV_PERCENTILE + +This function converts a multivalued field into a single-valued field. The single-valued field it produces contains the value at which a specified percentage of observed values occur. + +## Syntax + +`MV_PERCENTILE(number, percentile)` + +### Parameters + +#### number + +This refers to a multivalue expression. + +#### percentile + +Value for the percentile to calculate. The value should range from 0 and 100. Values outside this range return null. + +## Examples + +Consider an instance where you want to calculate the 50th percentile (or median) of a set of numbers - `[5, 5, 10, 12, 5000]`. This can be done using the following statement. 
+ +```esql +ROW values = [5, 5, 10, 12, 5000] +| EVAL p50 = MV_PERCENTILE(values, 50), median = MV_MEDIAN(values) +``` \ No newline at end of file diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_sort.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_sort.txt index a2191a59214d8..a7ddc9452c1ba 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_sort.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-mv_sort.txt @@ -22,14 +22,20 @@ Without order parameter ```esql ROW names = ["Alice", "Bob", "Charlie"] -| EVAL sorted_names = mv_sort(names) +| EVAL sorted_names = MV_SORT(names) ``` With order parameter ```esql ROW a = [4, 2, -3, 2] -| EVAL sa = mv_sort(a), sd = mv_sort(a, "DESC") +| EVAL sd = MV_SORT(a, "DESC") +``` + +**Sorting a multivalued field** +```esql +FROM bag_of_numbers +| EVAL sorted = MV_SORT(numbers) ``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-operators.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-operators.txt index 0e79037636072..a6ebcfdbd6bdb 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-operators.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-operators.txt @@ -241,9 +241,13 @@ FROM employees The `::` operator provides a convenient alternative syntax to the `TO_` conversion functions. -Example: +Examples: ```esql FROM employees | EVAL salary = salary::double ``` + +```esql +ROW ver = CONCAT(("0"::INT + 1)::STRING, ".2.3")::VERSION +``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-overview.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-overview.txt index 952ba28dd0b8e..5a2a6252728ca 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-overview.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-overview.txt @@ -57,7 +57,8 @@ ES|QL does not support configurations where the `_source` field is disabled. ES| #### Full-Text Search -Because of the way ES|QL treats `text` values, full-text search is not yet supported. Queries on `text` fields are like queries on `keyword` fields: they are case-sensitive and need to match the full string. +Because of the way ES|QL treats `text` values, queries on `text` fields are like queries on `keyword` fields: they are case-sensitive and need to match the full string. +To perform full-text search on `text` fields, search functions such as `MATCH` should be used. #### Time Series Data Streams diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-qstr.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-qstr.txt new file mode 100644 index 0000000000000..89f5d8a81c1be --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-qstr.txt @@ -0,0 +1,31 @@ +# QSTR + +The QSTR function performs a query string query, returning true if the provided query string matches a row. + +Please note this functionality is currently in its technical preview stage, which means it might undergo changes or removal in future releases. Elastic commits to address any issues during this period. 
However, since it's a technical preview, it doesn't come under the support SLA of official GA features. + +## Syntax + +`QSTR(query)` + +### Parameters + +#### query + +The query parameter must be a string written in the Lucene query format. + +## Examples + +Conduct a query string query on a book's author: + +```esql +FROM books +| WHERE QSTR("author: Faulkner") +| KEEP book_no, author +| SORT book_no +| LIMIT 5; +``` + +## Notes + +- Do not use `QSTR` in production - it is in technical preview and may be changed or removed in a future release diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-reverse.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-reverse.txt new file mode 100644 index 0000000000000..f1372b06abf61 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-reverse.txt @@ -0,0 +1,29 @@ +# REVERSE + +The REVERSE function returns a reversed form of the input string. + +## Syntax + +`REVERSE(str)` + +### Parameters + +#### str + +The string you want to reverse. If the string is `null`, the function will also return `null`. + +## Examples + +Here's an example of how to reverse a string: + +```esql +ROW message = "Some Text" +| EVAL message_reversed = REVERSE(message); +``` + +REVERSE also works with unicode characters, keeping unicode grapheme clusters intact during reversal: + +```esql +ROW bending_arts = "💧🪨🔥💨" +| EVAL bending_arts_reversed = REVERSE(bending_arts); +``` \ No newline at end of file diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-space.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-space.txt new file mode 100644 index 0000000000000..fb00d39e6b426 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-space.txt @@ -0,0 +1,22 @@ +# SPACE + +The SPACE function creates a string composed of a specific number of spaces. + +## Syntax + +`SPACE(number)` + +### Parameters + +#### number + +The number of spaces the function should generate. + +## Examples + +This example demonstrates how to use the SPACE function to insert a space into a string: + +```esql +ROW message = CONCAT("Hello", SPACE(1), "World!"); +``` +In this example, the SPACE function creates a single space, which is then used to separate the words "Hello" and "World!" in the resulting string. If desired, the `number` parameter could be adjusted in order to generate more spaces. \ No newline at end of file diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_envelope.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_envelope.txt new file mode 100644 index 0000000000000..6ecdf385da18b --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_envelope.txt @@ -0,0 +1,24 @@ +# ST_ENVELOPE + +The ST_ENVELOPE function determines the minimum bounding box for the provided geometry. + +## Syntax + +`ST_ENVELOPE(geometry)` + +### Parameters + +#### geometry + +The `geometry` parameter refers to the input geometry. This should be an expression of type `geo_point`, `geo_shape`, `cartesian_point`, or `cartesian_shape`. If the parameter is `null`, the function will also return `null`. 
+ +## Examples + +Here is an example where ST_ENVELOPE is used to determine the minimum bounding box of a city's boundary: + +```esql +FROM airport_city_boundaries +| WHERE abbrev == "CPH" +| EVAL envelope = ST_ENVELOPE(city_boundary) +| KEEP abbrev, airport, envelope +``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_extent_agg.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_extent_agg.txt new file mode 100644 index 0000000000000..87a6299daa741 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_extent_agg.txt @@ -0,0 +1,25 @@ +# ST_EXTENT_AGG + +This function calculates the spatial extent over a field that has a geometry type, returning a bounding box that contains all values of the specified field. + +## Syntax + +`ST_EXTENT_AGG(field)` + +### Parameters + +#### field + +The field of geometry type over which the spatial extent will be calculated. + +## Examples + +The following example calculates the spatial extent over the 'location' field for all airports in India: + +```esql +FROM airports +| WHERE country == "India" +| STATS extent = ST_EXTENT_AGG(location) +``` + +This query returns a bounding box that encompasses all airport locations in India. \ No newline at end of file diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_xmax.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_xmax.txt new file mode 100644 index 0000000000000..bf6c95eee2b8a --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_xmax.txt @@ -0,0 +1,27 @@ +# ST_XMAX + +The ST_XMAX function extracts the maximum value of the x coordinates from the supplied geometry. + +## Syntax + +`ST_XMAX(point)` + +### Parameters + +#### point + +This is an expression of type `geo_point`, `geo_shape`, `cartesian_point` or `cartesian_shape`. The function returns `null` if the point is `null`. + +## Examples + +Here's an example of how to use the ST_XMAX function: + +```esql +FROM airport_city_boundaries +| WHERE abbrev == "CPH" +| EVAL envelope = ST_ENVELOPE(city_boundary) +| EVAL xmin = ST_XMIN(envelope), xmax = ST_XMAX(envelope), ymin = ST_YMIN(envelope), ymax = ST_YMAX(envelope) +| KEEP abbrev, airport, xmin, xmax, ymin, ymax +``` + +In this example, the ST_XMAX function is used to extract the maximum x coordinate from the envelope of the 'city_boundary' field. diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_xmin.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_xmin.txt new file mode 100644 index 0000000000000..f96d3705f5897 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_xmin.txt @@ -0,0 +1,31 @@ +# ST_XMIN + +ST_XMIN retrieves the minimum 'x' coordinate from the provided geometry. + +## Syntax + +`ST_XMIN(point)` + +### Parameters + +#### point + +This is an expression of either `geo_point`, `geo_shape`, `cartesian_point`, or `cartesian_shape` type. If this parameter is null, the function will return null. + +## Explanation + +ST_XMIN function extracts the minimum value of the 'x' coordinates from the provided geometry data. In cases where the geometry is either of type `geo_point` or `geo_shape`, this is equivalent to extracting the minimum longitude value. 
+
+## Examples
+
+This example query returns the bounding envelope coordinates of Copenhagen Airport:
+
+```esql
+FROM airport_city_boundaries
+| WHERE abbrev == "CPH"
+| EVAL envelope = ST_ENVELOPE(city_boundary)
+| EVAL xmin = ST_XMIN(envelope), xmax = ST_XMAX(envelope), ymin = ST_YMIN(envelope), ymax = ST_YMAX(envelope)
+| KEEP abbrev, airport, xmin, xmax, ymin, ymax
+```
+
+In this query, the `ST_XMIN` function is used to extract the smallest 'x' value from the geometric 'envelope' surrounding the airport.
\ No newline at end of file
diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_ymax.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_ymax.txt
new file mode 100644
index 0000000000000..560c76082eb1d
--- /dev/null
+++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_ymax.txt
@@ -0,0 +1,29 @@
+# ST_YMAX
+
+Extracts the maximum value of the `y` coordinates from the given geometry input.
+
+## Syntax
+
+`ST_YMAX(point)`
+
+### Parameters
+
+#### point
+
+An expression of type `geo_point`, `geo_shape`, `cartesian_point`, or `cartesian_shape`. If the value is `null`, the function also returns `null`.
+
+## Examples
+
+Here is an example of using the `ST_YMAX` function:
+
+```esql
+FROM airport_city_boundaries
+| WHERE abbrev == "CPH"
+| EVAL envelope = ST_ENVELOPE(city_boundary)
+| EVAL xmin = ST_XMIN(envelope), xmax = ST_XMAX(envelope), ymin = ST_YMIN(envelope), ymax = ST_YMAX(envelope)
+| KEEP abbrev, airport, xmin, xmax, ymin, ymax
+```
+
+The example above first uses the `ST_ENVELOPE` function to find the smallest rectangular polygon that contains `city_boundary`. Then it uses the `ST_XMIN`, `ST_XMAX`, `ST_YMIN`, and `ST_YMAX` functions to calculate the minimum and maximum `x` and `y` coordinates of the rectangle, respectively. Lastly, it keeps only the columns of interest: `abbrev`, `airport`, `xmin`, `xmax`, `ymin`, and `ymax`.
+
+When the `point` parameter is of type `geo_point` or `geo_shape`, using the `ST_YMAX` function is equivalent to finding the maximum `latitude` value.
\ No newline at end of file
diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_ymin.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_ymin.txt
new file mode 100644
index 0000000000000..574c0c3855dfa
--- /dev/null
+++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-st_ymin.txt
@@ -0,0 +1,27 @@
+# ST_YMIN
+
+The ST_YMIN function extracts the smallest value of the `y` coordinates from the provided geometry.
+
+## Syntax
+
+`ST_YMIN(point)`
+
+### Parameters
+
+#### point
+
+An expression of type `geo_point`, `geo_shape`, `cartesian_point`, or `cartesian_shape`. If the value is `null`, the function will also return `null`.
+ +## Examples + +This example demonstrates how to extract the minimum `y` coordinate from a geographical boundary outline: + +```esql +FROM airport_city_boundaries +| WHERE abbrev == "CPH" +| EVAL envelope = ST_ENVELOPE(city_boundary) +| EVAL xmin = ST_XMIN(envelope), xmax = ST_XMAX(envelope), ymin = ST_YMIN(envelope), ymax = ST_YMAX(envelope) +| KEEP abbrev, airport, xmin, xmax, ymin, ymax +``` + +In the case of `geo_point` or `geo_shape`, using the ST_YMIN function is equivalent to retrieving the minimum `latitude` value. diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-starts_with.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-starts_with.txt index 31578d3786ee1..bc19d7bf8d2f2 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-starts_with.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-starts_with.txt @@ -18,7 +18,7 @@ This is a string expression that will be checked if it is the starting sequence ## Examples -The following example checks if the `last_name` of employees starts with the letter "B": +This example checks if the last name of employee records starts with "B": ```esql FROM employees diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-stats.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-stats.txt index 795213778c87b..2ac2f8cc8f9d7 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-stats.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-stats.txt @@ -38,14 +38,16 @@ The following aggregation functions are supported: - `MIN` - `PERCENTILE` - `ST_CENTROID_AGG` +- `STD_DEV` - `SUM` - `TOP` - `VALUES` - `WEIGHTED_AVG` -> Note: `STATS` without any groups is significantly faster than adding a group. +The following grouping functions are supported: -> Note: Grouping on a single expression is currently much more optimized than grouping on many expressions. In some tests, grouping on a single `keyword` column was found to be five times faster than grouping on two `keyword` columns. Do not attempt to work around this by combining the two columns together with a function like `CONCAT` and then grouping - this will not be faster. +- `BUCKET` +- `CATEGORIZE` ## Examples @@ -123,11 +125,16 @@ FROM employees | EVAL avg_salary_rounded = ROUND(`AVG(salary)`) ``` -## Notes +STATS works with grouping functions such as BUCKET, e.g. grouping data based on their timestamp: -- If multiple columns share the same name, all but the rightmost column with this name are ignored. +```esql +FROM sample_data +| WHERE @timestamp >= NOW() - 1 day and @timestamp < NOW() +| STATS COUNT(*) BY bucket = BUCKET(@timestamp, 1 hour) +``` -### Limitations +## Notes -- **Performance**: `STATS` without any groups is much faster than adding a group. Grouping on a single expression is more optimized than grouping on multiple expressions. -- **Multivalue Fields**: If the grouping key is multivalued, the input row is included in all groups. +- If multiple columns share the same name, all but the rightmost column with this name are ignored. +- `STATS` without any groups is much faster than adding a group. Grouping on a single expression is more optimized than grouping on multiple expressions. +- If the grouping key is multivalued, the input row is included in all groups. 
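+
+For example, a row whose grouping key holds several values is counted once in every group (a minimal illustrative sketch; the `articles` index and its multivalued `tags` field are hypothetical names):
+
+```esql
+FROM articles
+| STATS article_count = COUNT(*) BY tags
+```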
diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-std_dev.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-std_dev.txt new file mode 100644 index 0000000000000..f3063438a4997 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-std_dev.txt @@ -0,0 +1,29 @@ +# STD_DEV + +The STD_DEV function calculates the standard deviation of a numeric field. + +## Syntax + +`STD_DEV(number)` + +### Parameters + +#### number + +A numeric field for which the standard deviation is calculated. + +## Examples + +This example calculates the standard deviation of the 'height' column: + +```esql +FROM employees +| STATS STD_DEV(height) +``` + +In this example, we first calculate the maximum salary change for each employee using the `MV_MAX` function. The `STD_DEV` function is then used to calculate the standard deviation of these maximum salary changes: + +```esql +FROM employees +| STATS std_dev_salary_change = STD_DEV(MV_MAX(salary_change)) +``` diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-syntax.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-syntax.txt index 85df775422801..704a89bb44665 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-syntax.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-syntax.txt @@ -130,15 +130,6 @@ FROM weather_data 4. Calculate the total sales for each week in the last quarter: -```esql -FROM sales -| WHERE @timestamp > NOW() - 1 quarter -| STATS weekly_sales = SUM(sales_amount) BY week = DATE_TRUNC(1 week, @timestamp) -| SORT week -``` - -4. The same example with BUCKET instead of DATE_TRUNC: - ```esql FROM sales | WHERE @timestamp > NOW() - 1 quarter diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_dateperiod.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_dateperiod.txt new file mode 100644 index 0000000000000..8375d683dd745 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_dateperiod.txt @@ -0,0 +1,22 @@ +# TO_DATEPERIOD + +The `TO_DATEPERIOD` function converts an input value into a `date_period` value. + +## Syntax + +`TO_DATEPERIOD(field)` + +### Parameters + +#### field + +The input value. This must be a valid constant date period expression. + +## Examples + +This example demonstrates the usage of the `TO_DATEPERIOD` function: + +```esql +ROW x = "2024-01-01"::datetime +| EVAL y = x + "3 DAYS"::date_period, z = x - TO_DATEPERIOD("3 days"); +``` \ No newline at end of file diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_datetime.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_datetime.txt index 579765a4685f5..5510b7d3d4692 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_datetime.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_datetime.txt @@ -10,7 +10,8 @@ The TO_DATETIME function converts an input value into a date value. #### field -The input value to be converted. This can be a single or multi-valued column or an expression. +The input value to be converted, either single or multi-valued column or an expression. 
+If of type string, the input must follow the `yyyy-MM-dd'T'HH:mm:ss.SSS'Z'` format. To convert strings in other formats, use DATE_PARSE. ## Examples @@ -30,6 +31,6 @@ ROW int = [0, 1] ## Notes -- TO_DATETIME converts an input value into a date value. A string will only be successfully converted if it follows the format `yyyy-MM-dd'T'HH:mm:ss.SSS'Z'`. To convert dates in other formats, use the `DATE_PARSE` function. +- Can only convert string with the exact format `yyyy-MM-dd'T'HH:mm:ss.SSS'Z'`. To convert dates in other formats, use the `DATE_PARSE` function. - When converting from nanosecond resolution to millisecond resolution with this function, the nanosecond date is truncated, not rounded. diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_timeduration.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_timeduration.txt new file mode 100644 index 0000000000000..3c95cf4f75cc6 --- /dev/null +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-to_timeduration.txt @@ -0,0 +1,23 @@ +# TO_TIMEDURATION + +The `TO_TIMEDURATION` function converts an input value into a `time_duration` value. + +## Syntax + +`TO_TIMEDURATION(field)` + +### Parameters + +#### field + +This is the input value. It must be a valid constant time duration expression. + +## Examples + +Here's an example of how to use the `TO_TIMEDURATION` function: + +```esql +ROW x = "2024-01-01"::datetime +| EVAL y = x + "3 hours"::time_duration, z = x - TO_TIMEDURATION("3 hours"); +``` +In this example, `TO_TIMEDURATION` function is used to convert the string "3 hours" into a `time_duration` value, which is then subtracted from the datetime value stored in the variable `x`. diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-values.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-values.txt index 1ea45c3642e0a..10c8021ce96d6 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-values.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/esql_docs/esql-values.txt @@ -29,5 +29,4 @@ FROM employees ## Limitations -- This functionality is in technical preview and may be changed or removed in a future release - The VALUES function can consume a significant amount of memory. ES|QL does not currently support growing aggregations beyond memory. Therefore, if the function collects more values than can fit into memory, it will fail the query with a Circuit Breaker Error. diff --git a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/system_message.txt b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/system_message.txt index da590d9531ccb..7f3579381e620 100644 --- a/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/system_message.txt +++ b/x-pack/platform/plugins/shared/inference/server/tasks/nl_to_esql/system_message.txt @@ -54,97 +54,116 @@ The following processing commands are available: ### Grouping functions -The STATS ... BY command supports these grouping functions: - -BUCKET: Creates groups of values out of a datetime or numeric input. +BUCKET: Creates groups of values out of a datetime or numeric input +CATEGORIZE: Organize textual data into groups of similar format ### Aggregation functions -The STATS ... 
BY command supports these aggregation functions:
-
-AVG
-COUNT
-COUNT_DISTINCT
-MAX
-MEDIAN
-MEDIAN_ABSOLUTE_DEVIATION
-MIN
-PERCENTILE
-ST_CENTROID_AGG
-SUM
-TOP
-VALUES
-WEIGHTED_AVG
+AVG: calculates the average of a numeric field
+COUNT: returns the total number of input values
+COUNT_DISTINCT: returns the number of distinct values in a field
+MAX: calculates the maximum value of a field
+MEDIAN: calculates the median value of a numeric field
+MEDIAN_ABSOLUTE_DEVIATION: calculates the median absolute deviation of a numeric field
+MIN: calculates the minimum value of a field
+PERCENTILE: calculates a specified percentile of a numeric field
+STD_DEV: calculates the standard deviation of a numeric field
+SUM: calculates the total sum of a numeric expression
+TOP: collects the top values for a specified field
+VALUES: returns all values in a group as a multivalued field
+WEIGHTED_AVG: calculates the weighted average of a numeric expression
 
 ### Conditional functions and expressions
 
 Conditional functions return one of their arguments by evaluating in an if-else manner
 
-CASE
-COALESCE
-GREATEST
-LEAST
+CASE: accepts pairs of conditions and values and returns the value that belongs to the first condition that evaluates to true
+COALESCE: returns the first non-null argument from the list of provided arguments
+GREATEST: returns the maximum value from multiple columns
+LEAST: returns the smallest value from multiple columns
+
+### Search functions
+
+Search functions perform full-text search against the data
+
+MATCH: executes a match query on a specified field (tech preview)
+QSTR: performs a Lucene query string query (tech preview)
 
 ### Date-time functions
 
-DATE_DIFF
-DATE_EXTRACT
-DATE_FORMAT
-DATE_PARSE
-DATE_TRUNC
-NOW
+DATE_DIFF: calculates the difference between two timestamps in a given unit
+DATE_EXTRACT: extracts a specific part of a date
+DATE_FORMAT: returns a string representation of a date using the provided format
+DATE_PARSE: converts a date string into a date
+DATE_TRUNC: rounds down a date to the nearest specified interval
+NOW: returns the current date and time
 
 ### Mathematical functions
 
-ABS
-ACOS
-ASIN
-ATAN
-ATAN2
-CEIL
-COS
-COSH
-E
-FLOOR
-LOG
-LOG10
-PI
-POW
-ROUND
-SIN
-SINH
-SQRT
-TAN
-TANH
-TAU
+ABS: returns the absolute value of a number
+ACOS: returns the arccosine of a number
+ASIN: returns the arcsine of a number
+ATAN: returns the arctangent of a number
+ATAN2: returns the angle from the positive x-axis to a point (x, y)
+CBRT: calculates the cube root of a given number
+CEIL: rounds a number up to the nearest integer
+COS: returns the cosine of a given angle
+COSH: returns the hyperbolic cosine of a given angle
+E: returns Euler's number
+EXP: returns the value of Euler's number raised to the power of a given number
+FLOOR: rounds a number down to the nearest integer
+HYPOT: calculates the hypotenuse of two numbers
+LOG: calculates the logarithm of a given value to a specified base
+LOG10: calculates the logarithm of a value to base 10
+PI: returns the mathematical constant Pi
+POW: calculates the value of a base raised to the power of an exponent
+ROUND: rounds a numeric value to a specified number of decimal places
+SIGNUM: returns the sign of a given number
+SIN: calculates the sine of a given angle
+SINH: calculates the hyperbolic sine of a given angle
+SQRT: calculates the square root of a given number
+TAN: calculates the tangent of a given angle
+TANH: calculates the hyperbolic tangent of a given angle
+TAU: returns the mathematical constant τ (tau)
 
 ### String functions
 
-CONCAT
-ENDS_WITH
-FROM_BASE64
-LEFT
-LENGTH
-LOCATE
-LTRIM
-REPEAT
-REPLACE
-RIGHT
-RTRIM
-SPLIT
-STARTS_WITH
-SUBSTRING
-TO_BASE64
-TO_LOWER
-TO_UPPER
-TRIM
+BIT_LENGTH: calculates the bit length of a string
+BYTE_LENGTH: calculates the byte length of a string
+CONCAT: combines two or more strings into one
+ENDS_WITH: checks if a given string ends with a specified suffix
+FROM_BASE64: decodes a base64 string
+HASH: computes the hash of a given input using a specified algorithm
+LEFT: extracts a specified number of characters from the start of a string
+LENGTH: calculates the character length of a given string
+LOCATE: returns the position of a specified substring within a string
+LTRIM: removes leading whitespace from a string
+REPEAT: generates a string by repeating a specified string a certain number of times
+REPLACE: substitutes any match of a regular expression within a string with a replacement string
+REVERSE: reverses a string
+RIGHT: extracts a specified number of characters from the end of a string
+RTRIM: removes trailing whitespace from a string
+SPACE: creates a string composed of a specific number of spaces
+SPLIT: splits a single-valued string into multiple strings based on a delimiter
+STARTS_WITH: checks if a given string begins with another specified string
+SUBSTRING: extracts a portion of a string
+TO_BASE64: encodes a string to base64
+TO_LOWER: converts a string to lowercase
+TO_UPPER: converts a string to uppercase
+TRIM: removes leading and trailing whitespace from a string
+
+### IP Functions
+
+CIDR_MATCH: checks if an IP address falls within specified network blocks
+IP_PREFIX: truncates an IP address to a specified prefix length
 
 ### Type conversion functions
 
 TO_BOOLEAN
 TO_CARTESIANPOINT
 TO_CARTESIANSHAPE
-TO_DATETIME
+TO_DATETIME (prefer DATE_PARSE to convert strings to datetime)
+TO_DATEPERIOD
 TO_DEGREES
 TO_DOUBLE
 TO_GEOPOINT
@@ -154,30 +173,50 @@ TO_IP
 TO_LONG
 TO_RADIANS
 TO_STRING
+TO_TIMEDURATION
 TO_UNSIGNED_LONG
 TO_VERSION
 
-### IP Functions
-
-CIDR_MATCH
-IP_PREFIX
-
 ### Multivalue functions
 
-MV_APPEND
-MV_AVG
-MV_CONCAT
-MV_COUNT
-MV_DEDUPE
-MV_FIRST
-MV_LAST
-MV_MAX
-MV_MEDIAN
-MV_MIN
-NV_SORT
-MV_SLIDE
-MV_SUM
-MV_ZIP
+Multivalue functions are used to manipulate and transform multi-value fields.
+
+MV_APPEND: concatenates the values of two multi-value fields
+MV_AVG: returns the average of all values in a multivalued field
+MV_CONCAT: transforms a multivalued string expression into a single valued string
+MV_COUNT: counts the total number of values in a multivalued expression
+MV_DEDUPE: eliminates duplicate values from a multivalued field
+MV_FIRST: returns the first value of a multivalued field
+MV_LAST: returns the last value of a multivalued field
+MV_MAX: returns the max value of a multivalued field
+MV_MEDIAN: returns the median value of a multivalued field
+MV_MEDIAN_ABSOLUTE_DEVIATION: returns the median absolute deviation of a multivalued field
+MV_MIN: returns the min value of a multivalued field
+MV_PERCENTILE: returns the specified percentile of a multivalued field
+MV_SLIDE: extracts a subset of a multivalued field using specified start and end index values
+MV_SORT: sorts a multivalued field in lexicographical order
+MV_SUM: returns the sum of all values of a multivalued field
+MV_ZIP: combines the values from two multivalued fields with a specified delimiter
+
+### Spatial functions
+
+ST_CONTAINS: checks if the first specified geometry encompasses the second one
+ST_DISJOINT: checks if two geometries or geometry columns are disjoint
+ST_DISTANCE: calculates the distance between two points
+ST_ENVELOPE: calculates the minimum bounding box for the provided geometry
+ST_INTERSECTS: checks if two geometries intersect
+ST_WITHIN: checks if the first geometry is located within the second geometry
+ST_X: extracts the x coordinate from a given point
+ST_XMAX: extracts the maximum value of the x coordinates from a geometry
+ST_XMIN: extracts the minimum value of the x coordinates from a geometry
+ST_Y: extracts the y coordinate from a given point
+ST_YMAX: extracts the maximum value of the y coordinates from a geometry
+ST_YMIN: extracts the minimum value of the y coordinates from a geometry
+
+### Spatial aggregation functions
+
+ST_EXTENT_AGG: calculates the spatial extent over a field that has a geometry type
+ST_CENTROID_AGG: calculates the spatial centroid over a spatial point geometry field
 
 ### Operators
 
@@ -185,44 +224,68 @@ Binary operators: ==, !=, <, <=, >, >=, +, -, *, /, %
 Logical operators: AND, OR, NOT
 Predicates: IS NULL, IS NOT NULL
 Unary operators: -
-IN
+IN: test if a field or expression is in a list of literals
 LIKE: filter data based on string patterns using wildcards
 RLIKE: filter data based on string patterns using regular expressions
+Cast (`::`): provides a convenient alternative syntax to the `TO_` conversion functions
 
 # Usage examples
 
 Here are some examples of ES|QL queries:
 
+**Returns the 10 latest errors from the logs**
+```esql
+FROM logs
+| WHERE level == "ERROR"
+| SORT @timestamp DESC
+| LIMIT 10
+```
+
+**Returns the title and description of last month's blog articles**
+```esql
+FROM blogposts
+| WHERE published > NOW() - 1 month
+| KEEP title, description
+| SORT title
+```
+
+**Returns the number of employees from the "NL" country using STATS**
 ```esql
 FROM employees
-| WHERE country == "NL" AND gender == "M"
+| WHERE country == "NL"
 | STATS COUNT(*)
 ```
 
+**Returns the number of orders per month over the last year**
 ```esql
-FROM employees
-| EVAL trunk_worked_seconds = avg_worked_seconds / 100000000 * 100000000
-| STATS c = count(languages.long) BY languages.long, trunk_worked_seconds
-| SORT c desc, languages.long, trunk_worked_seconds
+FROM orders
+| WHERE order_date > NOW() - 1 year
+| STATS count = COUNT(*) BY date_bucket = BUCKET(order_date, 1 month)
 ```
 
-*Extracting structured data from logs using DISSECT*
+**Extracting structured data from logs using DISSECT**
 ```esql
-ROW a = "2023-01-23T12:15:00.000Z - some text - 127.0.0.1"
-| DISSECT a "%{date} - %{msg} - %{ip}"
+FROM postgres-logs*
+// messages are similar to "2023-01-23T12:15:00.000Z - some text - 127.0.0.1"
+| DISSECT message "%{date} - %{msg} - %{ip}"
+// keep columns created by the dissect command
 | KEEP date, msg, ip
-| EVAL date = TO_DATETIME(date)
+// evaluate date from string representation
+| EVAL date = DATE_PARSE("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", date)
 ```
 
+**Find contributors whose first name starts with "b", sort them by number of commits and
+then return their first and last names for the top 5**
 ```esql
-FROM employees
-| WHERE first_name LIKE "?b*"
+FROM commits
+| WHERE TO_LOWER(first_name) LIKE "b*"
 | STATS doc_count = COUNT(*) by first_name, last_name
 | SORT doc_count DESC
 | KEEP first_name, last_name
+| LIMIT 5
 ```
 
-**Returning average salary per hire date with 20 buckets**
+**Returning average salary per hire date split into 20 buckets using BUCKET**
 ```esql
 FROM employees
 | WHERE hire_date >= "1985-01-01T00:00:00Z" AND hire_date < "1986-01-01T00:00:00Z"
@@ -238,13 +301,18 @@ FROM employees
 | SORT b
 ```
 
+**Returns total and recent hire counts plus the ratio, broken down by country**
 ```esql
 FROM employees
+// insert a 0/1 flag column using CASE for conditional evaluation
 | EVAL is_recent_hire = CASE(hire_date <= "2023-01-01T00:00:00Z", 1, 0)
+// compute multiple aggregations grouped by country
 | STATS total_recent_hires = SUM(is_recent_hire), total_hires = COUNT(*) BY country
+// evaluate the recent hiring rate by country based on the previous aggregations
 | EVAL recent_hiring_rate = total_recent_hires / total_hires
 ```
 
+**Computes failure ratios from logs**
 ```esql
 FROM logs-*
 | WHERE @timestamp <= NOW() - 24 hours
@@ -252,10 +320,13 @@ FROM logs-*
 | EVAL is_5xx = CASE(http.response.status_code >= 500, 1, 0)
 // count total events and failed events to calculate a rate
 | STATS total_events = COUNT(*), total_failures = SUM(is_5xx) BY host.hostname, bucket = BUCKET(@timestamp, 1 hour)
+// evaluate the failure ratio
 | EVAL failure_rate_per_host = total_failures / total_events
+// drop the temporary columns
 | DROP total_events, total_failures
 ```
 
+**Returning the number of logs grouped by level over the past 24h**
 ```esql
 FROM logs-*
 | WHERE @timestamp <= NOW() - 24 hours
@@ -266,21 +337,25 @@ FROM logs-*
 **Returning all first names for each first letter**
 ```esql
 FROM employees
+// evaluate first letter
 | EVAL first_letter = SUBSTRING(first_name, 0, 1)
+// group all first_name into a multivalued field, break down by first_letter
 | STATS first_name = MV_SORT(VALUES(first_name)) BY first_letter
 | SORT first_letter
 ```
 
+**Retrieving the min, max and average value from a multivalued field**
 ```esql
-FROM employees
-| WHERE still_hired == true
-| EVAL hired = DATE_FORMAT("YYYY", hire_date)
-| STATS avg_salary = AVG(salary) BY languages
-| EVAL avg_salary = ROUND(avg_salary)
-| EVAL lang_code = TO_STRING(languages)
-| ENRICH languages_policy ON lang_code WITH lang = language_name
-| WHERE lang IS NOT NULL
-| KEEP avg_salary, lang
-| SORT avg_salary ASC
-| LIMIT 3
+FROM bag_of_numbers
+| EVAL min = MV_MIN(numbers), max = MV_MAX(numbers), avg = MV_AVG(numbers)
+| KEEP bag_id, min, max, avg
+```
+
+**Converts a date string into datetime using DATE_PARSE**
+```esql
+FROM personal_info
+// birth_date is a text field storing date with the "yyyy-MM-dd" format
+| EVAL birth=DATE_PARSE("yyyy-MM-dd", birth_date)
+| KEEP user_name, birth
+| SORT birth
 ```
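+
+**Grouping similar log messages with CATEGORIZE (an illustrative sketch; the `app_logs` index and its `message` field are assumed names)**
+```esql
+// index and field names below are hypothetical; CATEGORIZE groups rows by message pattern
+FROM app_logs
+| STATS message_count = COUNT(*) BY category = CATEGORIZE(message)
+| SORT message_count DESC
+```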