From dcf4f331c3cbc22dda3f0f57e9d0387abbfe7775 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Thu, 2 Nov 2023 13:31:56 +0100 Subject: [PATCH] Fix flaky failure_distributed_results (#7307) Sometimes in CI we run into this failure: ```diff SELECT resultId, nodeport, rowcount, targetShardId, targetShardIndex FROM partition_task_list_results('test', $$ SELECT * FROM source_table $$, 'target_table') NATURAL JOIN pg_dist_node; -WARNING: connection to the remote node localhost:xxxxx failed with the following error: connection not open +ERROR: connection to the remote node localhost:9060 failed with the following error: connection not open SELECT * FROM distributed_result_info ORDER BY resultId; - resultid | nodeport | rowcount | targetshardid | targetshardindex ---------------------------------------------------------------------- - test_from_100800_to_0 | 9060 | 22 | 100805 | 0 - test_from_100801_to_0 | 57637 | 2 | 100805 | 0 - test_from_100801_to_1 | 57637 | 15 | 100806 | 1 - test_from_100802_to_1 | 57637 | 10 | 100806 | 1 - test_from_100802_to_2 | 57637 | 5 | 100807 | 2 - test_from_100803_to_2 | 57637 | 18 | 100807 | 2 - test_from_100803_to_3 | 57637 | 4 | 100808 | 3 - test_from_100804_to_3 | 9060 | 24 | 100808 | 3 -(8 rows) - +ERROR: current transaction is aborted, commands ignored until end of transaction block -- fetch from worker 2 should fail SAVEPOINT s1; +ERROR: current transaction is aborted, commands ignored until end of transaction block SELECT fetch_intermediate_results('{test_from_100802_to_1,test_from_100802_to_2}'::text[], 'localhost', :worker_2_port) > 0 AS fetched; -ERROR: could not open file "base/pgsql_job_cache/xx_x_xxx/test_from_100802_to_1.data": No such file or directory -CONTEXT: while executing command on localhost:xxxxx +ERROR: current transaction is aborted, commands ignored until end of transaction block ROLLBACK TO SAVEPOINT s1; +ERROR: savepoint "s1" does not exist -- fetch from worker 1 should succeed SELECT 
fetch_intermediate_results('{test_from_100802_to_1,test_from_100802_to_2}'::text[], 'localhost', :worker_1_port) > 0 AS fetched; - fetched ---------------------------------------------------------------------- - t -(1 row) - +ERROR: current transaction is aborted, commands ignored until end of transaction block -- make sure the results read are same as the previous transaction block SELECT count(*), sum(x) FROM read_intermediate_results('{test_from_100802_to_1,test_from_100802_to_2}'::text[],'binary') AS res (x int); - count | sum ---------------------------------------------------------------------- - 15 | 863 -(1 row) - +ERROR: current transaction is aborted, commands ignored until end of transaction block ROLLBACk; ``` As outlined in issue #7306, which I created, the reason for this is related to only having a single connection open to the node. Finding and fixing the full cause is not trivial, so instead this PR works around the bug by forcing maximum parallelism. Preferably this workaround would not be necessary, but removing it requires spending time on fixing the underlying bug. For now, having a less flaky CI is good enough. (cherry picked from commit f171ec98fc58ac1ab2fdf808535c3259c4dfc3d1) --- src/test/regress/expected/failure_distributed_results.out | 2 ++ src/test/regress/sql/failure_distributed_results.sql | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/test/regress/expected/failure_distributed_results.out b/src/test/regress/expected/failure_distributed_results.out index fc97c9af609..a316763e32d 100644 --- a/src/test/regress/expected/failure_distributed_results.out +++ b/src/test/regress/expected/failure_distributed_results.out @@ -14,6 +14,8 @@ SELECT citus.mitmproxy('conn.allow()'); (1 row) SET citus.next_shard_id TO 100800; +-- Needed because of issue #7306 +SET citus.force_max_query_parallelization TO true; -- always try the 1st replica before the 2nd replica. 
SET citus.task_assignment_policy TO 'first-replica'; -- diff --git a/src/test/regress/sql/failure_distributed_results.sql b/src/test/regress/sql/failure_distributed_results.sql index 95e4d5513bf..93e4a9a3391 100644 --- a/src/test/regress/sql/failure_distributed_results.sql +++ b/src/test/regress/sql/failure_distributed_results.sql @@ -15,6 +15,8 @@ SET client_min_messages TO WARNING; SELECT citus.mitmproxy('conn.allow()'); SET citus.next_shard_id TO 100800; +-- Needed because of issue #7306 +SET citus.force_max_query_parallelization TO true; -- always try the 1st replica before the 2nd replica. SET citus.task_assignment_policy TO 'first-replica';