From d59f4eaa06e3f321c4387902ab03f61bdc12b6d7 Mon Sep 17 00:00:00 2001
From: "dependabot-preview[bot]"
<27856297+dependabot-preview[bot]@users.noreply.github.com>
Date: Wed, 28 Apr 2021 22:14:22 +0000
Subject: [PATCH 001/102] Upgrade to GitHub-native Dependabot
---
.github/dependabot.yml | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
create mode 100644 .github/dependabot.yml
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..b5158981
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,18 @@
+version: 2
+registries:
+ python-index-pypi-org:
+ type: python-index
+ url: https://pypi.org/
+ replaces-base: true
+ username: "${{secrets.PYTHON_INDEX_PYPI_ORG_USERNAME}}"
+ password: "${{secrets.PYTHON_INDEX_PYPI_ORG_PASSWORD}}"
+
+updates:
+- package-ecosystem: pip
+ directory: "/"
+ schedule:
+ interval: daily
+ time: "19:00"
+ open-pull-requests-limit: 10
+ registries:
+ - python-index-pypi-org
From 963ada115dc7038b0d2dcd29dba5765627b0477c Mon Sep 17 00:00:00 2001
From: william dutton
Date: Wed, 9 Nov 2022 08:52:01 +1000
Subject: [PATCH 002/102] fix workflow
---
.github/workflows/test.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e963e1f6..8a601204 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -13,7 +13,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
- python-version: '3.6'
+ python-version: '3.x'
- name: Install requirements
run: pip install flake8 pycodestyle
- name: Check syntax
From 9f96e1676c0be03774273917040546a1b97b2f3f Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 17 Apr 2023 11:22:32 +1000
Subject: [PATCH 003/102] [QOLDEV-347] apply 'str' fallback type correctly,
#182
- If all types have been rejected, ensure that the fallback flag is correctly set
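A minimal sketch of the corrected loop, with simplified names (the guess weights and the TYPES list from utils.py are omitted), showing how the fallback flag now tracks whether any candidate type survived:

    # Hypothetical, simplified form of the type_guess() inner loop.
    def guess_column_types(rows, candidate_types=(int, float)):
        n_columns = len(rows[0])
        guesses = [dict.fromkeys(candidate_types, 0) for _ in range(n_columns)]
        at_least_one_value = [False] * n_columns
        for row in rows:
            for ci, cell in enumerate(row):
                if not cell:
                    continue
                for type_ in list(guesses[ci]):
                    if not isinstance(cell, type_):
                        guesses[ci].pop(type_)
                # The fix: only count the column as typed while at least
                # one candidate remains; otherwise fall back to 'str'.
                at_least_one_value[ci] = bool(guesses[ci])
        return [g if v else {str: 0}
                for g, v in zip(guesses, at_least_one_value)]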
---
ckanext/xloader/utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py
index cbffaa2f..79facbea 100644
--- a/ckanext/xloader/utils.py
+++ b/ckanext/xloader/utils.py
@@ -175,10 +175,10 @@ def type_guess(rows, types=TYPES, strict=False):
for ci, cell in enumerate(row):
if not cell:
continue
- at_least_one_value[ci] = True
for type in list(guesses[ci].keys()):
if not isinstance(cell, type):
guesses[ci].pop(type)
+ at_least_one_value[ci] = True if guesses[ci] else False
# no need to set guessing weights before this
# because we only accept a type if it never fails
for i, guess in enumerate(guesses):
From cf04a5c5c38443f3d98e0e7b8a4ed0ceede90aa0 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 17 Apr 2023 12:50:09 +1000
Subject: [PATCH 004/102] [QOLDEV-347] fix validation errors on empty strings,
#182
- Replace empty strings with None in columns whose types cannot accept an empty string
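A hedged sketch of the per-chunk cleanup (headers_dicts and the record layout are simplified stand-ins for the real loader structures, which rely on ordered dicts):

    # PostgreSQL rejects '' for timestamp/numeric columns, so blank
    # cells in those columns are stored as NULL instead.
    NON_EMPTY_TYPES = ('timestamp', 'numeric')

    def blank_to_none(records, headers_dicts):
        for row in records:
            for column_index, column_name in enumerate(row):
                if (headers_dicts[column_index]['type'] in NON_EMPTY_TYPES
                        and row[column_name] == ''):
                    row[column_name] = None

    records = [{'Opening date': '', 'Service ID': '63041'}]
    headers_dicts = [{'type': 'timestamp'}, {'type': 'numeric'}]
    blank_to_none(records, headers_dicts)
    assert records[0]['Opening date'] is None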
---
ckanext/xloader/loader.py | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index afc3c980..75bddf51 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -318,9 +318,16 @@ def row_iterator():
logger.info('Copying to database...')
count = 0
+ # Some types cannot be stored as empty strings and must be converted to None,
+ # https://github.com/ckan/ckanext-xloader/issues/182
+ non_empty_types = ['timestamp', 'numeric']
for i, records in enumerate(chunky(result, 250)):
count += len(records)
logger.info('Saving chunk {number}'.format(number=i))
+ for row in records:
+ for column_index, column_name in enumerate(row):
+ if headers_dicts[column_index]['type'] in non_empty_types and row[column_name] == '':
+ row[column_name] = None
send_resource_to_datastore(resource_id, headers_dicts, records)
logger.info('...copying done')
From b8b99143be32bd0da3e7e125e5f02cff7c8212eb Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 17 Apr 2023 14:54:23 +1000
Subject: [PATCH 005/102] [QOLDEV-347] add tests for edge cases we're fixing
- Column with free text in some rows and numeric data in others
- Column with timestamps in some rows and empty strings in others
---
.../samples/mixed_numeric_string_sample.csv | 3 +++
.../tests/samples/sample_with_blanks.csv | 4 ++++
ckanext/xloader/tests/test_loader.py | 24 +++++++++++++++++++
3 files changed, 31 insertions(+)
create mode 100644 ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv
create mode 100644 ckanext/xloader/tests/samples/sample_with_blanks.csv
diff --git a/ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv b/ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv
new file mode 100644
index 00000000..9d076602
--- /dev/null
+++ b/ckanext/xloader/tests/samples/mixed_numeric_string_sample.csv
@@ -0,0 +1,3 @@
+Funding agency,Program title,Maximum (indicative) grant amount
+DTIS,Accessible Tourism Infrastructure Grants,Five hundred thousand dollars
+DTIS,Boosting Accessible Tourism Experiences Grants,5000
diff --git a/ckanext/xloader/tests/samples/sample_with_blanks.csv b/ckanext/xloader/tests/samples/sample_with_blanks.csv
new file mode 100644
index 00000000..b53b25db
--- /dev/null
+++ b/ckanext/xloader/tests/samples/sample_with_blanks.csv
@@ -0,0 +1,4 @@
+Funding agency,Program title,Opening date,Service ID
+DTIS,Visitor First Experiences Fund,23/03/2023,63039
+DTIS,First Nations Sport and Recreation Program Round 2,22/03/2023,63040
+,,,63041
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index f31b663b..68452d11 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -612,6 +612,30 @@ def test_german(self, Session):
u"tsvector",
] + [u"text"] * (len(records[0]) - 1)
+ def test_with_blanks(self, Session):
+ csv_filepath = get_sample_filepath("sample_with_blanks.csv")
+ resource_id = "test1"
+ factories.Resource(id=resource_id)
+ loader.load_csv(
+ csv_filepath,
+ resource_id=resource_id,
+ mimetype="text/csv",
+ logger=logger,
+ )
+ assert len(self._get_records(Session, "test1")) == 3
+
+ def test_with_mixed_types(self, Session):
+ csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv")
+ resource_id = "test1"
+ factories.Resource(id=resource_id)
+ loader.load_csv(
+ csv_filepath,
+ resource_id=resource_id,
+ mimetype="text/csv",
+ logger=logger,
+ )
+ assert len(self._get_records(Session, "test1")) == 2
+
def test_reload(self, Session):
csv_filepath = get_sample_filepath("simple.csv")
resource_id = "test1"
From b62aa6ccfc2f54008e4bbd240fb031bb130cd1ed Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 17 Apr 2023 16:05:54 +1000
Subject: [PATCH 006/102] [QOLDEV-347] tighten Flake8 rules
- Remove unused imports, or tag those that serve a purpose (testing what can be imported)
- Remove obsolete exclusions from Flake8 config
---
.flake8 | 4 ----
ckanext/xloader/jobs.py | 2 +-
ckanext/xloader/loader.py | 1 -
ckanext/xloader/parser.py | 2 --
ckanext/xloader/plugin.py | 1 -
ckanext/xloader/tests/ckan_setup.py | 2 +-
ckanext/xloader/tests/fixtures.py | 5 ++---
7 files changed, 4 insertions(+), 13 deletions(-)
diff --git a/.flake8 b/.flake8
index a4eea9e3..32068ca7 100644
--- a/.flake8
+++ b/.flake8
@@ -17,8 +17,4 @@ max-line-length=127
# List ignore rules one per line.
ignore =
- E501
- C901
W503
- F401
- F403
diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py
index 4c4068f9..0d242db1 100644
--- a/ckanext/xloader/jobs.py
+++ b/ckanext/xloader/jobs.py
@@ -16,7 +16,7 @@
import sqlalchemy as sa
from ckan import model
-from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config, check_ckan_version
+from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config
from . import loader
from . import db
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 75bddf51..55c9cab5 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -14,7 +14,6 @@
from unidecode import unidecode
import ckan.plugins as p
-import ckan.plugins.toolkit as tk
from .job_exceptions import FileCouldNotBeLoadedError, LoaderError
from .parser import XloaderCSVParser
diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py
index b2a6f889..b52c59a3 100644
--- a/ckanext/xloader/parser.py
+++ b/ckanext/xloader/parser.py
@@ -1,10 +1,8 @@
# -*- coding: utf-8 -*-
import csv
-from codecs import iterencode
from decimal import Decimal, InvalidOperation
from itertools import chain
-import six
from ckan.plugins.toolkit import asbool
from dateutil.parser import isoparser, parser
from dateutil.parser import ParserError
diff --git a/ckanext/xloader/plugin.py b/ckanext/xloader/plugin.py
index dbde8ed5..159b99de 100644
--- a/ckanext/xloader/plugin.py
+++ b/ckanext/xloader/plugin.py
@@ -6,7 +6,6 @@
from ckan.plugins import toolkit
from . import action, auth, helpers as xloader_helpers, utils
-from .loader import fulltext_function_exists, get_write_engine
try:
config_declarations = toolkit.blanket.config_declarations
diff --git a/ckanext/xloader/tests/ckan_setup.py b/ckanext/xloader/tests/ckan_setup.py
index ae8bfb3e..ff43d74c 100644
--- a/ckanext/xloader/tests/ckan_setup.py
+++ b/ckanext/xloader/tests/ckan_setup.py
@@ -1,5 +1,5 @@
try:
- from ckan.tests.pytest_ckan.ckan_setup import *
+ from ckan.tests.pytest_ckan.ckan_setup import * # noqa
except ImportError:
import pkg_resources
from paste.deploy import loadapp
diff --git a/ckanext/xloader/tests/fixtures.py b/ckanext/xloader/tests/fixtures.py
index f43916ab..9a7ad37f 100644
--- a/ckanext/xloader/tests/fixtures.py
+++ b/ckanext/xloader/tests/fixtures.py
@@ -1,6 +1,5 @@
# -*- coding: utf-8 -*-
-import sqlalchemy
-import sqlalchemy.orm as orm
+from sqlalchemy import orm
import os
from ckanext.datastore.tests import helpers as datastore_helpers
@@ -11,7 +10,7 @@
)
try:
- from ckan.tests.pytest_ckan.fixtures import *
+ from ckan.tests.pytest_ckan.fixtures import * # noqa
except ImportError:
import pytest
From 939ff6bfc9125d3c281b3401cff3d92456c05d44 Mon Sep 17 00:00:00 2001
From: antuarc
Date: Fri, 19 May 2023 11:39:10 +1000
Subject: [PATCH 007/102] [QOLSVC-1863] truncate on-page XLoader logs if there
are too many
- Show the first 100 and last 100 log entries, with a message in between stating how many were skipped
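The template check is equivalent to the following predicate (an illustrative sketch, not part of the patch), using 1-based indexes to match Jinja's loop.index and loop.revindex:

    # An entry stays visible when it falls in the first or last 100 rows.
    def is_visible(index, total, window=100):
        revindex = total - index + 1  # Jinja's loop.revindex
        return index <= window or revindex <= window

    visible = [i for i in range(1, 351) if is_visible(i, total=350)]
    assert len(visible) == 200  # the middle 150 entries are hidden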
---
ckanext/xloader/templates/xloader/resource_data.html | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/ckanext/xloader/templates/xloader/resource_data.html b/ckanext/xloader/templates/xloader/resource_data.html
index a94ad631..e9786776 100644
--- a/ckanext/xloader/templates/xloader/resource_data.html
+++ b/ckanext/xloader/templates/xloader/resource_data.html
@@ -62,6 +62,8 @@
{{ _('Upload Log') }}
{% for item in status.task_info.logs %}
+ {# Truncate very long loops, showing just the start and end #}
+ {% if loop.index <= 100 or loop.revindex <= 100 %}
{% set icon = 'ok' if item.level == 'INFO' else 'exclamation' %}
{% set class = ' failure' if icon == 'exclamation' else ' success' %}
{% set popover_content = 'test' %}
@@ -77,6 +79,12 @@ {{ _('Upload Log') }}
+ {% elif loop.index == 101 %}
+
+
+ Skipping {{ loop.revindex - 100}} records...
+
+ {% endif %}
{% endfor %}
From 05b2b888a2e6772e019a84f59ee33145dc93e99e Mon Sep 17 00:00:00 2001
From: antuarc
Date: Fri, 19 May 2023 12:19:05 +1000
Subject: [PATCH 008/102] [QOLSVC-1863] make XLoader log truncation
configurable
- Start with the first and last 50 rows, and provide a link to double it
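A sketch of the view-side handling added below: an absent or non-numeric ?rows= query parameter falls back to the template default of 50 rather than raising:

    # Illustrative helper; the real logic lives inline in views.py.
    def parse_rows(raw):
        try:
            return int(raw) if raw else None
        except ValueError:
            return None

    assert parse_rows('100') == 100   # ?rows=100 doubles the 50-row window
    assert parse_rows('abc') is None  # invalid value -> default window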
---
.../xloader/templates/xloader/resource_data.html | 14 +++++++++++---
ckanext/xloader/utils.py | 15 +++++++++------
ckanext/xloader/views.py | 10 ++++++++--
3 files changed, 28 insertions(+), 11 deletions(-)
diff --git a/ckanext/xloader/templates/xloader/resource_data.html b/ckanext/xloader/templates/xloader/resource_data.html
index e9786776..e24f79d8 100644
--- a/ckanext/xloader/templates/xloader/resource_data.html
+++ b/ckanext/xloader/templates/xloader/resource_data.html
@@ -61,9 +61,11 @@
{% if status.status and status.task_info and show_table %}
{{ _('Upload Log') }}
+ {% set rows = rows or 50 %}
{% for item in status.task_info.logs %}
{# Truncate very long loops, showing just the start and end #}
- {% if loop.index <= 100 or loop.revindex <= 100 %}
+ {% if loop.index <= rows or loop.revindex <= rows
+ or (loop.index == rows + 1 and loop.revindex == rows + 1) %}
{% set icon = 'ok' if item.level == 'INFO' else 'exclamation' %}
{% set class = ' failure' if icon == 'exclamation' else ' success' %}
{% set popover_content = 'test' %}
@@ -79,10 +81,16 @@ {{ _('Upload Log') }}
- {% elif loop.index == 101 %}
+ {% elif loop.index == rows + 1 %}
- Skipping {{ loop.revindex - 100}} records...
+
+ Skipping {{ loop.revindex - rows}} records...
+
+
+ Show more
+
+
{% endif %}
{% endfor %}
diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py
index 79facbea..bda96fd7 100644
--- a/ckanext/xloader/utils.py
+++ b/ckanext/xloader/utils.py
@@ -11,7 +11,7 @@
import ckan.plugins as p
-def resource_data(id, resource_id):
+def resource_data(id, resource_id, rows=None):
if p.toolkit.request.method == "POST":
try:
@@ -44,13 +44,16 @@ def resource_data(id, resource_id):
except p.toolkit.NotAuthorized:
return p.toolkit.abort(403, p.toolkit._("Not authorized to see this page"))
+ extra_vars={
+ "status": xloader_status,
+ "resource": resource,
+ "pkg_dict": pkg_dict,
+ }
+ if rows:
+ extra_vars["rows"] = rows
return p.toolkit.render(
"xloader/resource_data.html",
- extra_vars={
- "status": xloader_status,
- "resource": resource,
- "pkg_dict": pkg_dict,
- },
+ extra_vars=extra_vars,
)
diff --git a/ckanext/xloader/views.py b/ckanext/xloader/views.py
index 198de320..1ca212c8 100644
--- a/ckanext/xloader/views.py
+++ b/ckanext/xloader/views.py
@@ -1,4 +1,4 @@
-from flask import Blueprint
+from flask import Blueprint, request
import ckanext.xloader.utils as utils
@@ -12,4 +12,10 @@ def get_blueprints():
@xloader.route("/dataset//resource_data/", methods=("GET", "POST"))
def resource_data(id, resource_id):
- return utils.resource_data(id, resource_id)
+ rows = request.args.get('rows')
+ if rows:
+ try:
+ rows = int(rows)
+ except ValueError:
+ rows = None
+ return utils.resource_data(id, resource_id, rows)
From d937dea83c949d837bf8f18452b34646dcd48e2b Mon Sep 17 00:00:00 2001
From: antuarc
Date: Fri, 19 May 2023 12:58:33 +1000
Subject: [PATCH 009/102] [QOLSVC-1863] adjust XLoader log truncation
appearance
- Add a notice at the top of the page when logs have been hidden.
- Clarify that we are hiding logs, not skipping actual processing of data.
- Add 'Show All' link to show all logs if the user is confident they can handle it.
---
.../templates/xloader/resource_data.html | 20 ++++++++++++++++---
1 file changed, 17 insertions(+), 3 deletions(-)
diff --git a/ckanext/xloader/templates/xloader/resource_data.html b/ckanext/xloader/templates/xloader/resource_data.html
index e24f79d8..d9a22058 100644
--- a/ckanext/xloader/templates/xloader/resource_data.html
+++ b/ckanext/xloader/templates/xloader/resource_data.html
@@ -61,8 +61,22 @@
{% if status.status and status.task_info and show_table %}
{{ _('Upload Log') }}
+ {% set items = status.task_info.logs %}
{% set rows = rows or 50 %}
- {% for item in status.task_info.logs %}
+ {% set skipped_rows = (items | length) - (rows * 2) %}
+ {% if skipped_rows > 1 %}
+ -
+
+
+ {{ skipped_rows }} out of {{ items | length }} logs will be hidden.
+
+
+ Show more Show all
+
+
+
+ {% endif %}
+ {% for item in items %}
{# Truncate very long loops, showing just the start and end #}
{% if loop.index <= rows or loop.revindex <= rows
or (loop.index == rows + 1 and loop.revindex == rows + 1) %}
@@ -85,10 +99,10 @@ {{ _('Upload Log') }}
-
- Skipping {{ loop.revindex - rows}} records...
+ Skipping {{ skipped_rows }} logs...
- Show more
+ Show more Show all
From 288b0ab3d7dfc1e807b30cb61ab862a9024a8fc1 Mon Sep 17 00:00:00 2001
From: antuarc
Date: Fri, 19 May 2023 13:34:21 +1000
Subject: [PATCH 010/102] [QOLSVC-1863] ignore negative numbers of logs to
display
---
ckanext/xloader/utils.py | 2 +-
ckanext/xloader/views.py | 2 ++
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py
index bda96fd7..ec8e4bbd 100644
--- a/ckanext/xloader/utils.py
+++ b/ckanext/xloader/utils.py
@@ -44,7 +44,7 @@ def resource_data(id, resource_id, rows=None):
except p.toolkit.NotAuthorized:
return p.toolkit.abort(403, p.toolkit._("Not authorized to see this page"))
- extra_vars={
+ extra_vars = {
"status": xloader_status,
"resource": resource,
"pkg_dict": pkg_dict,
diff --git a/ckanext/xloader/views.py b/ckanext/xloader/views.py
index 1ca212c8..5a56322c 100644
--- a/ckanext/xloader/views.py
+++ b/ckanext/xloader/views.py
@@ -16,6 +16,8 @@ def resource_data(id, resource_id):
if rows:
try:
rows = int(rows)
+ if rows < 0:
+ rows = None
except ValueError:
rows = None
return utils.resource_data(id, resource_id, rows)
From b351e95c4a38e10971098755667ea59a5804f8f8 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Fri, 26 May 2023 12:12:49 +1000
Subject: [PATCH 011/102] [QOLDEV-424] add unit test for parsing CSV file with
commas inside quotes
---
.../tests/samples/sample_with_quoted_commas.csv | 4 ++++
ckanext/xloader/tests/test_loader.py | 12 ++++++++++++
2 files changed, 16 insertions(+)
create mode 100644 ckanext/xloader/tests/samples/sample_with_quoted_commas.csv
diff --git a/ckanext/xloader/tests/samples/sample_with_quoted_commas.csv b/ckanext/xloader/tests/samples/sample_with_quoted_commas.csv
new file mode 100644
index 00000000..7fe94e5b
--- /dev/null
+++ b/ckanext/xloader/tests/samples/sample_with_quoted_commas.csv
@@ -0,0 +1,4 @@
+Funding agency,Program title,Opening date,Service ID
+DTIS,"Department of Employment, Small Business and Training",23/03/2023,63039
+DTIS,"Foo, baz, meh",22/03/2023,63040
+,,,63041
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index 68452d11..1ab79524 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -624,6 +624,18 @@ def test_with_blanks(self, Session):
)
assert len(self._get_records(Session, "test1")) == 3
+ def test_with_quoted_commas(self, Session):
+ csv_filepath = get_sample_filepath("sample_with_quoted_commas.csv")
+ resource_id = "test1"
+ factories.Resource(id=resource_id)
+ loader.load_csv(
+ csv_filepath,
+ resource_id=resource_id,
+ mimetype="text/csv",
+ logger=logger,
+ )
+ assert len(self._get_records(Session, "test1")) == 3
+
def test_with_mixed_types(self, Session):
csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv")
resource_id = "test1"
From 83e1b86675ad8c6f22b2252e0568b8a36f66b5d1 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Fri, 26 May 2023 12:24:25 +1000
Subject: [PATCH 012/102] [QOLDEV-424] add unit test for parsing CSV file with
commas inside quotes using tabulator
---
ckanext/xloader/tests/test_loader.py | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index 1ab79524..1b4a2ec5 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -1171,3 +1171,15 @@ def test_no_entries(self):
mimetype="csv",
logger=logger,
)
+
+ def test_with_quoted_commas(self, Session):
+ csv_filepath = get_sample_filepath("sample_with_quoted_commas.csv")
+ resource_id = "test1"
+ factories.Resource(id=resource_id)
+ loader.load_table(
+ csv_filepath,
+ resource_id=resource_id,
+ mimetype="text/csv",
+ logger=logger,
+ )
+ assert len(self._get_records(Session, "test1")) == 3
From 9ae1b26e307c03e9b376bec36ef0be735b4c73cc Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 29 May 2023 09:59:15 +1000
Subject: [PATCH 013/102] [QOLDEV-424] add unit test for parsing CSV file with
a mixture of single and double quotes
---
.../samples/sample_with_mixed_quotes.csv | 136 ++++++++++++++++++
ckanext/xloader/tests/test_loader.py | 24 ++++
2 files changed, 160 insertions(+)
create mode 100644 ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv
diff --git a/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv b/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv
new file mode 100644
index 00000000..8408a155
--- /dev/null
+++ b/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv
@@ -0,0 +1,136 @@
+Category,Category name,Priority,Initiative name,Investment objectives,Primary digital priority,Initiative stage,Actual start date,Approved end date,Date data current at,Percentage complete,Overall status,Project commencement allocation,Approved expenditure,Actual cost to date,Scope change event,Cost re-evaluation event,Delivery delay event,Project journey and reasons for variance,Learn more (URL)
+DESBT,"Department of Employment, Small Business and Training",High,Business Launchpad Project - Stage 2,"This BLP initiative has a customer-journey based platform for businesses located throughout Queensland. The aim is to assist businesses to better understand their start up and compliance requirements, with a view to streamlining the complex regulatory environment that may delay or impede businesses from starting, growing, and employing. As at 1 July 2022, Business Launchpad Stage 2 has approval to extend beyond the SBRR scope of deliverables to focus on a revised user journey, inclusion of additional industries, and a broader coverage of more than 95% of the Queensland population, to be completed by 30 June 2023.",Collaboration,Delivery,01/07/2022,30/06/2023,31/03/2023,41,G,5633000,5739000,2352000,N,N,N,"As at 31 March 2023
+- Overall 'green' (on track) status
+- Revised user journey following results of BLP UX/UI testing
+- Transition to support progressing with documentation and walk-through of the solution.
+- Ongoing high levels of BLP usage reflecting the success of search engine marketing. BLP focused campaign to further increase awareness and usage is being finalised.
+
+As at 28 February 2023
+- Overall 'green' (on track) status
+- Results of BLP UX/UI testing is guiding development of the revised user journey.
+- BLP transition to BAU support continuing with workshops, showcases and handover documentation.
+- BLP usage is increasing
+
+As at 31 January 2023
+- Continued amber status [closely monitored] with risks under management
+- Search Engine Marketing -'Always On' yielding good results with continued increase in users and the proportion benefitting from BLP
+- Good progress on development of revised BLP user journey.
+
+As at 31 December 2022
+Status AMBER [Closely monitored]
+- Search Engine Marketing commenced 19 December 2022 and already showing increased users and proportion of customers benefitting from BLP
+- External assurance review completed and reported 'green' rating for confidence of delivery.
+
+As at 30 November 2022
+- Continued amber status pending risk management
+- Marketing to commence to increase awareness of platform
+- Good progress on development of revised user journey
+
+As at 31 October 2022
+Status AMBER [Closely monitored]
+- BLP Stage 2 continue reporting amber status reflective of ongoing high-level risks associated with demand-driven labour-market conditions and planned transition to support.
+- Communications and engagement are in progress.
+- The revised user journey continues development and testing. This is planned to be ready for release in the first quarter of 2023. As at 30 September 2022
+Status AMBER [Closely monitored]
+Project journey events:
+- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress.
+- BLP industries expanded to include all industries.
+- Engagement with agencies continues, to heighten BLP awareness and complete validation following recent expansion to encompass all industries.
+
+As at 31 August 2022
+Status GREEN [On track]
+The project is reporting green overall. Ongoing resourcing risk will continue to be monitored and managed for the life of the project, due to a tight labour market.
+Project journey events:
+- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress.
+- Further analysis of June/July 2022 marketing campaign has offered recommendations for consideration, to improve target audience awareness and BLP uptake.
+- BLP industries expanded to include Retail Trade, Accommodation and Non-residential Construction industries finalised.
+- Engagement with agencies continues, to heighten BLP awareness and complete validation following recent expansion with three additional industries.
+
+As at 31 July 2022
+Status AMBER [Closely monitored]
+The project is continuing to report amber overall mainly due to ongoing resourcing challenges.
+Project journey events:
+- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness, is progressing.
+- Analysis of a major marketing campaign conducted in June/July 2022 showed a significant step-up in number of BLP users.
+- The target of 95% of Queensland population coverage was met in June 2022 with 100% of Queensland population now covered on BLP.
+- Agency engagement for extension industries has commenced.
+
+As at 1 July 2022
+BLP commenced work on expanding industries to include Retail Trade, Accommodation and Non-residential Construction industries.
+
+As at June 2022
+Stage 2 of the project is commencing and will build up the solution delivered in BLP Stage 1. Customer journey will be revised in line with outcome of customer testing. The increased coverage target of at least 95% of the Queensland population was met in June 2022, with all local governments included on BLP. Benefits realisation through marketing and promotion of BLP.",https://www.business.qld.gov.au/starting-business/planning/launchpad
+DESBT,"Department of Employment, Small Business and Training",High,VET Modernisation and Transformation Program - Tranche 1,"The Vocational Education and Training (VET) Modernisation and Transformation (VMT) Program seeks to reduce the risks associated with department legacy systems by delivering contemporary, consolidated, integrated, user-friendly applications to support delivery of VET outcomes. To optimise the technical capabilities of the new solutions, engagement with business teams in the review and development of business processes is a priority. ",Trust,Delivery,01/07/2021,31/08/2023,28/02/2023,52,G,8692200,9614968,4961147,Y,Y,Y,"As at 28 February 2023
+- Tranche 1 VMT projects continue on schedule and on budget for Tranche 1 completion by 31 August 2023.
+- Customer Engagement and Contract Establishment projects continue to progress focusing on delivery activities for new CRM and Portal enhancements.
+- VMT Tranche 2 Business Case tracking for completion April 2023.
+
+As at 31 January 2023
+- VMT Projects continue to track to schedule and on budget for Tranche 1 completion 31 August 2023.
+- Customer Engagement and Contract Establishment Projects progressing well with delivery activities for new CRM and Portal enhancements.
+
+As at 31 December 2022
+Status GREEN
+- VMT projects continuing to track to board endorsed updated schedule and on budget for Tranche 1 completion on 31 August 2023.
+- Customer Engagement and Contract Establishment projects completed partner onboarding and delivery activities underway.
+- Planning in progress for Tranche 2, focusing on remaining legacy systems for planned commencement at completion of Tranch 1.
+
+As at 30 November 2022
+Status GREEN
+- Tranche 1 delivery date extended to 31 August 2023 due to CRM vendor procurement delays and subsequent additional time requirements for build completion and testing of new CRM.
+- All projects maintaining momentum and progressing to revised schedule within budget.
+
+As at 31 October 2022
+Status GREEN
+-New 'Partner Portal' Digital Channel continues to perform well with 3516 registered, active, external users from 634 different organisations. Update release being planned for January 2023.
+-SkillsCRM (CEP Project) delivery partner on-boarded and formal delivery stage commenced.
+-Contract Establishment and Variation (CEV PRoject) continuing delivery partner select with a view to commencing prior to end of December 2022.
+
+As at 30 September 2022 Status GREEN.
+The VMT 'Partner Portal' solution was successfully launched on the 17 August 2022. The decommissioning of the outdated legacy application, 'DETConnect', has completed. Work is now increasing on the next VET systems to be replaced, SkillsCRM (via the Customer Engagement Project) and Policy on Line (via the Contract Establishment and Variation Project).
+Project Journey Events:
+- Partner Portal. After the successful launch of Partner Portal and decommissioning of DETConnect, the transition to BAU is underway with the Project team continuing to support business until BAU transition is completed.
+- Data, Infrastructure and Reporting.
+New 'Data Lake' infrastructure built. Data ingestion processes being trialled. QTS report requirement gathering underway which will showcase new capability once completed. Compliance tool SMCM successfully launched September 30.
+-Customer Engagement Project (CEP). Completed assurance reviews successfully. Delivery partner selection completed. Partner and formal delivery stage due to start 18 October 2022. Ramp up of activities continuing with business demonstrations of CRM proof of concept.
+-Contract Establishment and Variation (CEV).
+Requirements gathering completed. Delivery partner selection process commenced. 'As is' process documentation underway.
+
+As at 31 August 2022
+Status GREEN. The project remains on track. Successful launch of new secure 'Partner Portal' Digital Channel for VET related organisations occurred 17 August 2022.
+
+Current Projects underway:
+- Partner Portal. Go-live occurred on track 17 August 2022. All registered VET organisations now able to use the portal to access key applications and send information to DESBT via secure channel. Enhanced support being provided for 6 weeks. Legacy system decommissioning underway.
+- Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) continuing and requirement gathering of first report planned to use new capabilites commenced.
+- Customer Services Hub (CRM). Implementation partner selection complete. Solution delivery activities due to start by end September 2022.
+- Contract Engagement and Variation. Requirements gathering complete and partner selection process to commence by end September 2022.
+
+As at 31 July 2022
+Status GREEN
+
+Project journey events:
+Implementation of next changes to VMT applications remain on track for August 2022 with full launch of new secure Partner Portal Digital Channel for VET related organisations.
+VMT Program scope adjusted to include additional at risk system decommission activties during this financial year. Approved expenditure updated to align with revised scope.
+
+Current Projects underway
+- Partner Portal. Opened for registrations 4 July 2022. Majority of VET related organisation now registered. Full access (go-live) on track to commence 17 August 2022. Legacy system to be disabled and decommissioned September 2022.
+- Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) underway with population and work on first report to commence in September.
+- Customer Services Hub (CRM). Requirements confirmed and partner selection underway. Work on legacy CRM replacement due to start September/October 2022.
+- Contract Engagement and Variation. Requirements gathering and new process design activities in progress.
+
+15 May 2022 Update
+Status GREEN
+
+Implementation of next changes to VET applications on track for August 2022 with introduction of new secure 'Patner Portal' Digital Channel for VET related organisations.
+
+Projects Completed
+-Database consolidation - key databases transitioned to supported versions and platforms. Completed November 2021.
+-System to System Integration platform. Completed 9 May 2022.
+
+Current projects underway
+-Partner Portal secure digital channel, in final testing. Pilot successfully complete and on track for release in August 2022.
+Projects in startup
+-Data, Infrastructure and Reporting, planning underway.
+-Customer Services Hub (CRM), planning underway.
+-Contract Engagement and Variation, planning underway.
+-Planning continues for Tranche 2.",https://portal.desbt.qld.gov.au/
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index 1b4a2ec5..451c42ae 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -636,6 +636,18 @@ def test_with_quoted_commas(self, Session):
)
assert len(self._get_records(Session, "test1")) == 3
+ def test_with_mixed_quotes(self, Session):
+ csv_filepath = get_sample_filepath("sample_with_mixed_quotes.csv")
+ resource_id = "test1"
+ factories.Resource(id=resource_id)
+ loader.load_csv(
+ csv_filepath,
+ resource_id=resource_id,
+ mimetype="text/csv",
+ logger=logger,
+ )
+ assert len(self._get_records(Session, "test1")) == 2
+
def test_with_mixed_types(self, Session):
csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv")
resource_id = "test1"
@@ -1183,3 +1195,15 @@ def test_with_quoted_commas(self, Session):
logger=logger,
)
assert len(self._get_records(Session, "test1")) == 3
+
+ def test_with_mixed_quotes(self, Session):
+ csv_filepath = get_sample_filepath("sample_with_mixed_quotes.csv")
+ resource_id = "test1"
+ factories.Resource(id=resource_id)
+ loader.load_table(
+ csv_filepath,
+ resource_id=resource_id,
+ mimetype="text/csv",
+ logger=logger,
+ )
+ assert len(self._get_records(Session, "test1")) == 2
From 08298013eb4f4b9fbc514e449dd829a8048ef4a0 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 29 May 2023 10:19:16 +1000
Subject: [PATCH 014/102] [QOLDEV-424] reuse sample size constant for both
loading methods
---
ckanext/xloader/loader.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 55c9cab5..15783021 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -16,7 +16,7 @@
import ckan.plugins as p
from .job_exceptions import FileCouldNotBeLoadedError, LoaderError
-from .parser import XloaderCSVParser
+from .parser import CSV_SAMPLE_LINES, XloaderCSVParser
from .utils import headers_guess, type_guess
from ckan.plugins.toolkit import config
@@ -36,12 +36,12 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
# Determine the header row
try:
file_format = os.path.splitext(csv_filepath)[1].strip('.')
- with Stream(csv_filepath, format=file_format) as stream:
+ with Stream(csv_filepath, format=file_format, sample_size=CSV_SAMPLE_LINES) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
- with Stream(csv_filepath, format=file_format) as stream:
+ with Stream(csv_filepath, format=file_format, sample_size=CSV_SAMPLE_LINES) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
From b444c6c147cea8a0f0e4ad721abe52677c415b49 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 29 May 2023 10:19:42 +1000
Subject: [PATCH 015/102] [QOLDEV-424] increase CSV sample size to better match
Messytables behaviour
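Tabulator only exposes its sample rows to headers_guess() and type_guess(), so the window size directly bounds how much data informs type detection; a sketch, assuming a local data.csv:

    # A type that only breaks after row 100 is still caught with the
    # larger window; 'data.csv' is a hypothetical input file.
    from tabulator import Stream

    CSV_SAMPLE_LINES = 1000  # was 100

    with Stream('data.csv', format='csv',
                sample_size=CSV_SAMPLE_LINES) as stream:
        print(len(stream.sample))  # at most CSV_SAMPLE_LINES rows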
---
ckanext/xloader/loader.py | 2 +-
ckanext/xloader/parser.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 15783021..7ab76ca5 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -72,7 +72,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
- with Stream(csv_filepath, format=file_format, skip_rows=skip_rows) as stream:
+ with Stream(csv_filepath, format=file_format, skip_rows=skip_rows, sample_size=CSV_SAMPLE_LINES) as stream:
stream.save(target=f_write.name, format='csv', encoding='utf-8',
delimiter=delimiter)
csv_filepath = f_write.name
diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py
index b52c59a3..82539f4d 100644
--- a/ckanext/xloader/parser.py
+++ b/ckanext/xloader/parser.py
@@ -12,7 +12,7 @@
from ckan.plugins.toolkit import config
-CSV_SAMPLE_LINES = 100
+CSV_SAMPLE_LINES = 1000
class XloaderCSVParser(Parser):
From 37a2a5428ab6fd886e8bd93bf78743ed9a6e8daa Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 29 May 2023 11:21:32 +1000
Subject: [PATCH 016/102] [QOLDEV-424] set default CSV sample size in config
- This is more efficient than setting it on each call,
and applies even to code that just reads the config without accepting an override.
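A sketch of the module-level override applied in loader.py; it is set once at import time, so every subsequent Stream picks up the larger default without each call site passing sample_size:

    # Mirrors the loader.py change: override tabulator's module-level
    # default once instead of threading the value through every call.
    from tabulator import config as tabulator_config

    CSV_SAMPLE_LINES = 1000
    tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES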
---
ckanext/xloader/loader.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 7ab76ca5..2060a9ef 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -10,7 +10,7 @@
import psycopg2
from six.moves import zip
-from tabulator import Stream, TabulatorException
+from tabulator import config as tabulator_config, Stream, TabulatorException
from unidecode import unidecode
import ckan.plugins as p
@@ -28,6 +28,7 @@
_drop_indexes = datastore_db._drop_indexes
MAX_COLUMN_LENGTH = 63
+tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES
def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
@@ -36,12 +37,12 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
# Determine the header row
try:
file_format = os.path.splitext(csv_filepath)[1].strip('.')
- with Stream(csv_filepath, format=file_format, sample_size=CSV_SAMPLE_LINES) as stream:
+ with Stream(csv_filepath, format=file_format) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
- with Stream(csv_filepath, format=file_format, sample_size=CSV_SAMPLE_LINES) as stream:
+ with Stream(csv_filepath, format=file_format) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
@@ -72,7 +73,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
- with Stream(csv_filepath, format=file_format, skip_rows=skip_rows, sample_size=CSV_SAMPLE_LINES) as stream:
+ with Stream(csv_filepath, format=file_format, skip_rows=skip_rows) as stream:
stream.save(target=f_write.name, format='csv', encoding='utf-8',
delimiter=delimiter)
csv_filepath = f_write.name
From 51fffade18e76eb6ec41a29ae885f5c00aac16a2 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 29 May 2023 11:56:38 +1000
Subject: [PATCH 017/102] [QOLDEV-424] alter sample mixed-quotes file to use
generic data
---
.../samples/sample_with_mixed_quotes.csv | 122 +++++++++---------
1 file changed, 61 insertions(+), 61 deletions(-)
diff --git a/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv b/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv
index 8408a155..a9527cf7 100644
--- a/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv
+++ b/ckanext/xloader/tests/samples/sample_with_mixed_quotes.csv
@@ -1,136 +1,136 @@
Category,Category name,Priority,Initiative name,Investment objectives,Primary digital priority,Initiative stage,Actual start date,Approved end date,Date data current at,Percentage complete,Overall status,Project commencement allocation,Approved expenditure,Actual cost to date,Scope change event,Cost re-evaluation event,Delivery delay event,Project journey and reasons for variance,Learn more (URL)
-DESBT,"Department of Employment, Small Business and Training",High,Business Launchpad Project - Stage 2,"This BLP initiative has a customer-journey based platform for businesses located throughout Queensland. The aim is to assist businesses to better understand their start up and compliance requirements, with a view to streamlining the complex regulatory environment that may delay or impede businesses from starting, growing, and employing. As at 1 July 2022, Business Launchpad Stage 2 has approval to extend beyond the SBRR scope of deliverables to focus on a revised user journey, inclusion of additional industries, and a broader coverage of more than 95% of the Queensland population, to be completed by 30 June 2023.",Collaboration,Delivery,01/07/2022,30/06/2023,31/03/2023,41,G,5633000,5739000,2352000,N,N,N,"As at 31 March 2023
+DDSSHHESW,"Department of Defence, Social Security, Health, Housing, Education, and Silly Walks",High,Silly Walks project - Stage 2,"Lorum ipsum.",Collaboration,Delivery,01/07/1970,30/06/1971,31/03/1971,41,G,5633000,5739000,2352000,N,N,N,"As at 31 March 1971
- Overall 'green' (on track) status
-- Revised user journey following results of BLP UX/UI testing
+- Revised user journey following results of Silly Walk UX/UI testing
- Transition to support progressing with documentation and walk-through of the solution.
-- Ongoing high levels of BLP usage reflecting the success of search engine marketing. BLP focused campaign to further increase awareness and usage is being finalised.
+- Ongoing high levels of silly walk usage reflecting the success of search engine marketing. Silly walk focused campaign to further increase awareness and usage is being finalised.
-As at 28 February 2023
+As at 28 February 1971
- Overall 'green' (on track) status
-- Results of BLP UX/UI testing is guiding development of the revised user journey.
-- BLP transition to BAU support continuing with workshops, showcases and handover documentation.
-- BLP usage is increasing
+- Results of Silly Walk UX/UI testing is guiding development of the revised user journey.
+- Silly Walk transition to BAU support continuing with workshops, showcases and handover documentation.
+- Silly Walk usage is increasing
-As at 31 January 2023
+As at 31 January 1971
- Continued amber status [closely monitored] with risks under management
-- Search Engine Marketing -'Always On' yielding good results with continued increase in users and the proportion benefitting from BLP
-- Good progress on development of revised BLP user journey.
+- Search Engine Marketing -'Always On' yielding good results with continued increase in users and the proportion benefitting from Silly Walk
+- Good progress on development of revised Silly Walk user journey.
-As at 31 December 2022
+As at 31 December 1970
Status AMBER [Closely monitored]
-- Search Engine Marketing commenced 19 December 2022 and already showing increased users and proportion of customers benefitting from BLP
+- Search Engine Marketing commenced 19 December 1970 and already showing increased users and proportion of customers benefitting from Silly Walk
- External assurance review completed and reported 'green' rating for confidence of delivery.
-As at 30 November 2022
+As at 30 November 1970
- Continued amber status pending risk management
- Marketing to commence to increase awareness of platform
- Good progress on development of revised user journey
-As at 31 October 2022
+As at 31 October 1970
Status AMBER [Closely monitored]
-- BLP Stage 2 continue reporting amber status reflective of ongoing high-level risks associated with demand-driven labour-market conditions and planned transition to support.
+- Silly Walk Stage 2 continue reporting amber status reflective of ongoing high-level risks associated with demand-driven labour-market conditions and planned transition to support.
- Communications and engagement are in progress.
-- The revised user journey continues development and testing. This is planned to be ready for release in the first quarter of 2023. As at 30 September 2022
+- The revised user journey continues development and testing. This is planned to be ready for release in the first quarter of 1971. As at 30 September 1970
Status AMBER [Closely monitored]
Project journey events:
- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress.
-- BLP industries expanded to include all industries.
-- Engagement with agencies continues, to heighten BLP awareness and complete validation following recent expansion to encompass all industries.
+- Silly Walk industries expanded to include all industries.
+- Engagement with agencies continues, to heighten Silly Walk awareness and complete validation following recent expansion to encompass all industries.
-As at 31 August 2022
+As at 31 August 1970
Status GREEN [On track]
The project is reporting green overall. Ongoing resourcing risk will continue to be monitored and managed for the life of the project, due to a tight labour market.
Project journey events:
- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness continues to progress.
-- Further analysis of June/July 2022 marketing campaign has offered recommendations for consideration, to improve target audience awareness and BLP uptake.
-- BLP industries expanded to include Retail Trade, Accommodation and Non-residential Construction industries finalised.
-- Engagement with agencies continues, to heighten BLP awareness and complete validation following recent expansion with three additional industries.
+- Further analysis of June/July 1970 marketing campaign has offered recommendations for consideration, to improve target audience awareness and Silly Walk uptake.
+- Silly Walk industries expanded to include Retail Trade, Accommodation and Non-residential Construction industries finalised.
+- Engagement with agencies continues, to heighten Silly Walk awareness and complete validation following recent expansion with three additional industries.
-As at 31 July 2022
+As at 31 July 1970
Status AMBER [Closely monitored]
The project is continuing to report amber overall mainly due to ongoing resourcing challenges.
Project journey events:
- A revised customer journey in line with outcomes of customer testing and retesting to validate solution usefulness, is progressing.
-- Analysis of a major marketing campaign conducted in June/July 2022 showed a significant step-up in number of BLP users.
-- The target of 95% of Queensland population coverage was met in June 2022 with 100% of Queensland population now covered on BLP.
+- Analysis of a major marketing campaign conducted in June/July 1970 showed a significant step-up in number of Silly Walk users.
+- The target of 95% of Circus population coverage was met in June 1970 with 100% of Circus population now covered on Silly Walk.
- Agency engagement for extension industries has commenced.
-As at 1 July 2022
-BLP commenced work on expanding industries to include Retail Trade, Accommodation and Non-residential Construction industries.
+As at 1 July 1970
+Silly Walk commenced work on expanding industries to include Retail Trade, Accommodation and Non-residential Construction industries.
-As at June 2022
-Stage 2 of the project is commencing and will build up the solution delivered in BLP Stage 1. Customer journey will be revised in line with outcome of customer testing. The increased coverage target of at least 95% of the Queensland population was met in June 2022, with all local governments included on BLP. Benefits realisation through marketing and promotion of BLP.",https://www.business.qld.gov.au/starting-business/planning/launchpad
-DESBT,"Department of Employment, Small Business and Training",High,VET Modernisation and Transformation Program - Tranche 1,"The Vocational Education and Training (VET) Modernisation and Transformation (VMT) Program seeks to reduce the risks associated with department legacy systems by delivering contemporary, consolidated, integrated, user-friendly applications to support delivery of VET outcomes. To optimise the technical capabilities of the new solutions, engagement with business teams in the review and development of business processes is a priority. ",Trust,Delivery,01/07/2021,31/08/2023,28/02/2023,52,G,8692200,9614968,4961147,Y,Y,Y,"As at 28 February 2023
-- Tranche 1 VMT projects continue on schedule and on budget for Tranche 1 completion by 31 August 2023.
+As at June 1970
+Stage 2 of the project is commencing and will build up the solution delivered in Silly Walk Stage 1. Customer journey will be revised in line with outcome of customer testing. The increased coverage target of at least 95% of the Circus population was met in June 1970, with all local governments included on Silly Walk. Benefits realisation through marketing and promotion of Silly Walk.",https://example.com
+DDSSHHESW,"Department of Defence, Social Security, Health, Housing, Education, and Silly Walks",High,Flying Circus Modernisation and Transformation Program - Tranche 1,"The Flying Circus Modernisation and Transformation (FCMT) Program seeks to reduce the risks associated with department legacy systems by delivering contemporary, consolidated, integrated, user-friendly applications to support delivery of Flying Circus outcomes. To optimise the technical capabilities of the new solutions, engagement with business teams in the review and development of business processes is a priority. ",Trust,Delivery,01/07/1969,31/08/1971,28/02/1971,52,G,8692200,9614968,4961147,Y,Y,Y,"As at 28 February 1971
+- Tranche 1 FCMT projects continue on schedule and on budget for Tranche 1 completion by 31 August 1971.
- Customer Engagement and Contract Establishment projects continue to progress focusing on delivery activities for new CRM and Portal enhancements.
-- VMT Tranche 2 Business Case tracking for completion April 2023.
+- FCMT Tranche 2 Business Case tracking for completion April 1971.
-As at 31 January 2023
-- VMT Projects continue to track to schedule and on budget for Tranche 1 completion 31 August 2023.
+As at 31 January 1971
+- FCMT Projects continue to track to schedule and on budget for Tranche 1 completion 31 August 1971.
- Customer Engagement and Contract Establishment Projects progressing well with delivery activities for new CRM and Portal enhancements.
-As at 31 December 2022
+As at 31 December 1970
Status GREEN
-- VMT projects continuing to track to board endorsed updated schedule and on budget for Tranche 1 completion on 31 August 2023.
+- FCMT projects continuing to track to board endorsed updated schedule and on budget for Tranche 1 completion on 31 August 1971.
- Customer Engagement and Contract Establishment projects completed partner onboarding and delivery activities underway.
- Planning in progress for Tranche 2, focusing on remaining legacy systems for planned commencement at completion of Tranch 1.
-As at 30 November 2022
+As at 30 November 1970
Status GREEN
-- Tranche 1 delivery date extended to 31 August 2023 due to CRM vendor procurement delays and subsequent additional time requirements for build completion and testing of new CRM.
+- Tranche 1 delivery date extended to 31 August 1971 due to CRM vendor procurement delays and subsequent additional time requirements for build completion and testing of new CRM.
- All projects maintaining momentum and progressing to revised schedule within budget.
-As at 31 October 2022
+As at 31 October 1970
Status GREEN
--New 'Partner Portal' Digital Channel continues to perform well with 3516 registered, active, external users from 634 different organisations. Update release being planned for January 2023.
+-New 'Partner Portal' Digital Channel continues to perform well with 3516 registered, active, external users from 634 different organisations. Update release being planned for January 1971.
-SkillsCRM (CEP Project) delivery partner on-boarded and formal delivery stage commenced.
--Contract Establishment and Variation (CEV PRoject) continuing delivery partner select with a view to commencing prior to end of December 2022.
+-Contract Establishment and Variation (CEV PRoject) continuing delivery partner select with a view to commencing prior to end of December 1970.
-As at 30 September 2022 Status GREEN.
-The VMT 'Partner Portal' solution was successfully launched on the 17 August 2022. The decommissioning of the outdated legacy application, 'DETConnect', has completed. Work is now increasing on the next VET systems to be replaced, SkillsCRM (via the Customer Engagement Project) and Policy on Line (via the Contract Establishment and Variation Project).
+As at 30 September 1970 Status GREEN.
+The FCMT 'Partner Portal' solution was successfully launched on the 17 August 1970. The decommissioning of the outdated legacy application, 'WalkConnect', has completed. Work is now increasing on the next Flying Circus systems to be replaced, SkillsCRM (via the Customer Engagement Project) and Policy on Line (via the Contract Establishment and Variation Project).
Project Journey Events:
-- Partner Portal. After the successful launch of Partner Portal and decommissioning of DETConnect, the transition to BAU is underway with the Project team continuing to support business until BAU transition is completed.
+- Partner Portal. After the successful launch of Partner Portal and decommissioning of WalkConnect, the transition to BAU is underway with the Project team continuing to support business until BAU transition is completed.
- Data, Infrastructure and Reporting.
New 'Data Lake' infrastructure built. Data ingestion processes being trialled. QTS report requirement gathering underway which will showcase new capability once completed. Compliance tool SMCM successfully launched September 30.
--Customer Engagement Project (CEP). Completed assurance reviews successfully. Delivery partner selection completed. Partner and formal delivery stage due to start 18 October 2022. Ramp up of activities continuing with business demonstrations of CRM proof of concept.
+-Customer Engagement Project (CEP). Completed assurance reviews successfully. Delivery partner selection completed. Partner and formal delivery stage due to start 18 October 1970. Ramp up of activities continuing with business demonstrations of CRM proof of concept.
-Contract Establishment and Variation (CEV).
Requirements gathering completed. Delivery partner selection process commenced. 'As is' process documentation underway.
-As at 31 August 2022
-Status GREEN. The project remains on track. Successful launch of new secure 'Partner Portal' Digital Channel for VET related organisations occurred 17 August 2022.
+As at 31 August 1970
+Status GREEN. The project remains on track. Successful launch of new secure 'Partner Portal' Digital Channel for Flying Circus related organisations occurred 17 August 1970.
Current Projects underway:
-- Partner Portal. Go-live occurred on track 17 August 2022. All registered VET organisations now able to use the portal to access key applications and send information to DESBT via secure channel. Enhanced support being provided for 6 weeks. Legacy system decommissioning underway.
+- Partner Portal. Go-live occurred on track 17 August 1970. All registered Flying Circus organisations now able to use the portal to access key applications and send information to DDSSHHESW via secure channel. Enhanced support being provided for 6 weeks. Legacy system decommissioning underway.
- Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) continuing and requirement gathering of first report planned to use new capabilites commenced.
-- Customer Services Hub (CRM). Implementation partner selection complete. Solution delivery activities due to start by end September 2022.
-- Contract Engagement and Variation. Requirements gathering complete and partner selection process to commence by end September 2022.
+- Customer Services Hub (CRM). Implementation partner selection complete. Solution delivery activities due to start by end September 1970.
+- Contract Engagement and Variation. Requirements gathering complete and partner selection process to commence by end September 1970.
-As at 31 July 2022
+As at 31 July 1970
Status GREEN
Project journey events:
-Implementation of next changes to VMT applications remains on track for August 2022 with full launch of new secure Partner Portal Digital Channel for VET related organisations.
-VMT Program scope adjusted to include additional at-risk system decommission activities during this financial year. Approved expenditure updated to align with revised scope.
+Implementation of next changes to FCMT applications remains on track for August 1970 with full launch of new secure Partner Portal Digital Channel for Flying Circus related organisations.
+FCMT Program scope adjusted to include additional at-risk system decommission activities during this financial year. Approved expenditure updated to align with revised scope.
Current Projects underway
-- Partner Portal. Opened for registrations 4 July 2022. Majority of VET related organisations now registered. Full access (go-live) on track to commence 17 August 2022. Legacy system to be disabled and decommissioned September 2022.
+- Partner Portal. Opened for registrations 4 July 1970. Majority of Flying Circus related organisations now registered. Full access (go-live) on track to commence 17 August 1970. Legacy system to be disabled and decommissioned September 1970.
- Data, Infrastructure and Reporting. Build of initial Data Lake (centralised, quality, information source) underway with population and work on first report to commence in September.
-- Customer Services Hub (CRM). Requirements confirmed and partner selection underway. Work on legacy CRM replacement due to start September/October 2022.
+- Customer Services Hub (CRM). Requirements confirmed and partner selection underway. Work on legacy CRM replacement due to start September/October 1970.
- Contract Engagement and Variation. Requirements gathering and new process design activities in progress.
-15 May 2022 Update
+15 May 1970 Update
Status GREEN
-Implementation of next changes to VET applications on track for August 2022 with introduction of new secure 'Partner Portal' Digital Channel for VET related organisations.
+Implementation of next changes to Flying Circus applications on track for August 1970 with introduction of new secure 'Silly Portal' Digital Channel for Flying Circus related organisations.
Projects Completed
--Database consolidation - key databases transitioned to supported versions and platforms. Completed November 2021.
--System to System Integration platform. Completed 9 May 2022.
+-Database consolidation - key databases transitioned to supported versions and platforms. Completed November 1969.
+-System to System Integration platform. Completed 9 May 1970.
Current projects underway
--Partner Portal secure digital channel, in final testing. Pilot successfully completed and on track for release in August 2022.
+-Partner Portal secure digital channel, in final testing. Pilot successfully completed and on track for release in August 1970.
Projects in startup
-Data, Infrastructure and Reporting, planning underway.
-Customer Services Hub (CRM), planning underway.
-Contract Engagement and Variation, planning underway.
--Planning continues for Tranche 2.",https://portal.desbt.qld.gov.au/
+-Planning continues for Tranche 2.",https://example.com
From 4656f063f833cd0b771223b3860ad8e37791d076 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Fri, 28 Jul 2023 14:48:09 +1000
Subject: [PATCH 018/102] [QOLDEV-424] fix tests to avoid hardcoding resource
IDs
- Our test IDs don't have the right format, and we should avoid hardcoding IDs anyway
---
ckanext/xloader/helpers.py | 4 +-
ckanext/xloader/tests/test_loader.py | 192 +++++++++++++--------------
2 files changed, 98 insertions(+), 98 deletions(-)
diff --git a/ckanext/xloader/helpers.py b/ckanext/xloader/helpers.py
index 3c071028..829b7b74 100644
--- a/ckanext/xloader/helpers.py
+++ b/ckanext/xloader/helpers.py
@@ -28,11 +28,11 @@ def xloader_status_description(status):
return _('Not Uploaded Yet')
-def is_resource_supported_by_xloader(res_dict, check_access = True):
+def is_resource_supported_by_xloader(res_dict, check_access=True):
is_supported_format = XLoaderFormats.is_it_an_xloader_format(res_dict.get('format'))
is_datastore_active = res_dict.get('datastore_active', False)
if check_access:
- user_has_access = toolkit.h.check_access('package_update', {'id':res_dict.get('package_id')})
+ user_has_access = toolkit.h.check_access('package_update', {'id': res_dict.get('package_id')})
else:
user_has_access = True
try:
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index 451c42ae..8cc69a06 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -85,8 +85,8 @@ def _get_column_types(self, Session, table_name):
class TestLoadCsv(TestLoadBase):
def test_simple(self, Session):
csv_filepath = get_sample_filepath("simple.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
@@ -95,7 +95,7 @@ def test_simple(self, Session):
)
assert self._get_records(
- Session, "test1", limit=1, exclude_full_text_column=False
+ Session, resource_id, limit=1, exclude_full_text_column=False
) == [
(
1,
@@ -105,7 +105,7 @@ def test_simple(self, Session):
u"Galway",
)
]
- assert self._get_records(Session, "test1") == [
+ assert self._get_records(Session, resource_id) == [
(1, u"2011-01-01", u"1", u"Galway"),
(2, u"2011-01-02", u"-1", u"Galway"),
(3, u"2011-01-03", u"0", u"Galway"),
@@ -113,14 +113,14 @@ def test_simple(self, Session):
(5, None, None, u"Berkeley"),
(6, u"2011-01-03", u"5", None),
]
- assert self._get_column_names(Session, "test1") == [
+ assert self._get_column_names(Session, resource_id) == [
u"_id",
u"_full_text",
u"date",
u"temperature",
u"place",
]
- assert self._get_column_types(Session, "test1") == [
+ assert self._get_column_types(Session, resource_id) == [
u"int4",
u"tsvector",
u"text",
@@ -130,8 +130,8 @@ def test_simple(self, Session):
def test_simple_with_indexing(self, Session):
csv_filepath = get_sample_filepath("simple.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
fields = loader.load_csv(
csv_filepath,
resource_id=resource_id,
@@ -144,7 +144,7 @@ def test_simple_with_indexing(self, Session):
assert (
self._get_records(
- Session, "test1", limit=1, exclude_full_text_column=False
+ Session, resource_id, limit=1, exclude_full_text_column=False
)[0][1]
== "'-01':2,3 '1':4 '2011':1 'galway':5"
)
@@ -155,8 +155,8 @@ def test_boston_311_complete(self):
# to get the test file:
# curl -o ckanext/xloader/tests/samples/boston_311.csv https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2968e2c0-d479-49ba-a884-4ef523ada3c0/download/311.csv # noqa
csv_filepath = get_sample_filepath("boston_311.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
import time
t0 = time.time()
@@ -179,8 +179,8 @@ def test_boston_311_sample5(self):
# to create the test file:
# head -n 100001 ckanext/xloader/tests/samples/boston_311.csv > ckanext/xloader/tests/samples/boston_311_sample5.csv
csv_filepath = get_sample_filepath("boston_311_sample5.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
import time
t0 = time.time()
@@ -199,8 +199,8 @@ def test_boston_311_sample5(self):
def test_boston_311(self, Session):
csv_filepath = get_sample_filepath("boston_311_sample.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
@@ -208,7 +208,7 @@ def test_boston_311(self, Session):
logger=logger,
)
- records = self._get_records(Session, "test1")
+ records = self._get_records(Session, resource_id)
print(records)
assert records == [
(
@@ -308,8 +308,8 @@ def test_boston_311(self, Session):
u"Citizens Connect App",
),
] # noqa
- print(self._get_column_names(Session, "test1"))
- assert self._get_column_names(Session, "test1") == [
+ print(self._get_column_names(Session, resource_id))
+ assert self._get_column_names(Session, resource_id) == [
u"_id",
u"_full_text",
u"CASE_ENQUIRY_ID",
@@ -342,16 +342,16 @@ def test_boston_311(self, Session):
u"Longitude",
u"Source",
] # noqa
- print(self._get_column_types(Session, "test1"))
- assert self._get_column_types(Session, "test1") == [
+ print(self._get_column_types(Session, resource_id))
+ assert self._get_column_types(Session, resource_id) == [
u"int4",
u"tsvector",
] + [u"text"] * (len(records[0]) - 1)
def test_brazilian(self, Session):
csv_filepath = get_sample_filepath("brazilian_sample.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
@@ -359,7 +359,7 @@ def test_brazilian(self, Session):
logger=logger,
)
- records = self._get_records(Session, "test1")
+ records = self._get_records(Session, resource_id)
print(records)
assert records[0] == (
1,
@@ -459,8 +459,8 @@ def test_brazilian(self, Session):
None,
None,
) # noqa
- print(self._get_column_names(Session, "test1"))
- assert self._get_column_names(Session, "test1") == [
+ print(self._get_column_names(Session, resource_id))
+ assert self._get_column_names(Session, resource_id) == [
u"_id",
u"_full_text",
u"NU_ANO_CENSO",
@@ -559,16 +559,16 @@ def test_brazilian(self, Session):
u"PROVA_MEAN_MAT_I_MUN",
u"PROVA_MEAN_MAT_T_MUN",
] # noqa
- print(self._get_column_types(Session, "test1"))
- assert self._get_column_types(Session, "test1") == [
+ print(self._get_column_types(Session, resource_id))
+ assert self._get_column_types(Session, resource_id) == [
u"int4",
u"tsvector",
] + [u"text"] * (len(records[0]) - 1)
def test_german(self, Session):
csv_filepath = get_sample_filepath("german_sample.csv")
- resource_id = "test_german"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
@@ -576,7 +576,7 @@ def test_german(self, Session):
logger=logger,
)
- records = self._get_records(Session, "test_german")
+ records = self._get_records(Session, resource_id)
print(records)
assert records[0] == (
1,
@@ -591,8 +591,8 @@ def test_german(self, Session):
u"24221",
u"672",
)
- print(self._get_column_names(Session, "test_german"))
- assert self._get_column_names(Session, "test_german") == [
+ print(self._get_column_names(Session, resource_id))
+ assert self._get_column_names(Session, resource_id) == [
u"_id",
u"_full_text",
u"Stadtname",
@@ -606,64 +606,64 @@ def test_german(self, Session):
u"Schuler_Berufsausbildung_2010/2011",
u"Schuler_andere allgemeinbildende Schulen_2010/2011",
]
- print(self._get_column_types(Session, "test_german"))
- assert self._get_column_types(Session, "test_german") == [
+ print(self._get_column_types(Session, resource_id))
+ assert self._get_column_types(Session, resource_id) == [
u"int4",
u"tsvector",
] + [u"text"] * (len(records[0]) - 1)
def test_with_blanks(self, Session):
csv_filepath = get_sample_filepath("sample_with_blanks.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
mimetype="text/csv",
logger=logger,
)
- assert len(self._get_records(Session, "test1")) == 3
+ assert len(self._get_records(Session, resource_id)) == 3
def test_with_quoted_commas(self, Session):
csv_filepath = get_sample_filepath("sample_with_quoted_commas.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
mimetype="text/csv",
logger=logger,
)
- assert len(self._get_records(Session, "test1")) == 3
+ assert len(self._get_records(Session, resource_id)) == 3
def test_with_mixed_quotes(self, Session):
csv_filepath = get_sample_filepath("sample_with_mixed_quotes.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
mimetype="text/csv",
logger=logger,
)
- assert len(self._get_records(Session, "test1")) == 2
+ assert len(self._get_records(Session, resource_id)) == 2
def test_with_mixed_types(self, Session):
csv_filepath = get_sample_filepath("mixed_numeric_string_sample.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
mimetype="text/csv",
logger=logger,
)
- assert len(self._get_records(Session, "test1")) == 2
+ assert len(self._get_records(Session, resource_id)) == 2
def test_reload(self, Session):
csv_filepath = get_sample_filepath("simple.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
@@ -679,15 +679,15 @@ def test_reload(self, Session):
logger=logger,
)
- assert len(self._get_records(Session, "test1")) == 6
- assert self._get_column_names(Session, "test1") == [
+ assert len(self._get_records(Session, resource_id)) == 6
+ assert self._get_column_names(Session, resource_id) == [
u"_id",
u"_full_text",
u"date",
u"temperature",
u"place",
]
- assert self._get_column_types(Session, "test1") == [
+ assert self._get_column_types(Session, resource_id) == [
u"int4",
u"tsvector",
u"text",
@@ -701,8 +701,8 @@ def test_reload(self, Session):
)
def test_reload_with_overridden_types(self, Session):
csv_filepath = get_sample_filepath("simple.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
@@ -732,15 +732,15 @@ def test_reload_with_overridden_types(self, Session):
fields=fields, resource_id=resource_id, logger=logger
)
- assert len(self._get_records(Session, "test1")) == 6
- assert self._get_column_names(Session, "test1") == [
+ assert len(self._get_records(Session, resource_id)) == 6
+ assert self._get_column_names(Session, resource_id) == [
u"_id",
u"_full_text",
u"date",
u"temperature",
u"place",
]
- assert self._get_column_types(Session, "test1") == [
+ assert self._get_column_types(Session, resource_id) == [
u"int4",
u"tsvector",
u"timestamp",
@@ -750,7 +750,7 @@ def test_reload_with_overridden_types(self, Session):
# check that rows with nulls are indexed correctly
records = self._get_records(
- Session, "test1", exclude_full_text_column=False
+ Session, resource_id, exclude_full_text_column=False
)
print(records)
assert records[4][1] == "'berkeley':1"
@@ -775,8 +775,8 @@ def test_encode_headers(self):
def test_column_names(self, Session):
csv_filepath = get_sample_filepath("column_names.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
csv_filepath,
resource_id=resource_id,
@@ -784,12 +784,12 @@ def test_column_names(self, Session):
logger=logger,
)
- assert self._get_column_names(Session, "test1")[2:] == [
+ assert self._get_column_names(Session, resource_id)[2:] == [
u"d@t$e",
u"t^e&m*pe!r(a)t?u:r%%e",
r"p\l/a[c{e%",
]
- assert self._get_records(Session, "test1")[0] == (
+ assert self._get_records(Session, resource_id)[0] == (
1,
u"2011-01-01",
u"1",
@@ -800,8 +800,8 @@ def test_column_names(self, Session):
class TestLoadUnhandledTypes(TestLoadBase):
def test_kml(self):
filepath = get_sample_filepath("polling_locations.kml")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
with pytest.raises(LoaderError) as exception:
loader.load_csv(
filepath,
@@ -817,8 +817,8 @@ def test_kml(self):
def test_geojson(self):
filepath = get_sample_filepath("polling_locations.geojson")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
with pytest.raises(LoaderError) as exception:
loader.load_csv(
filepath,
@@ -839,8 +839,8 @@ def test_geojson(self):
)
def test_shapefile_zip_python2(self):
filepath = get_sample_filepath("polling_locations.shapefile.zip")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
with pytest.raises(LoaderError):
loader.load_csv(
filepath,
@@ -859,8 +859,8 @@ def test_shapefile_zip_python3(self, Session):
# finds, 'Polling_Locations.cpg'. This file only contains the
# following data: `UTF-8`.
filepath = get_sample_filepath("polling_locations.shapefile.zip")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_csv(
filepath,
resource_id=resource_id,
@@ -868,8 +868,8 @@ def test_shapefile_zip_python3(self, Session):
logger=logger,
)
- assert self._get_records(Session, "test1") == []
- assert self._get_column_names(Session, "test1") == [
+ assert self._get_records(Session, resource_id) == []
+ assert self._get_column_names(Session, resource_id) == [
'_id',
'_full_text',
'UTF-8'
@@ -879,8 +879,8 @@ def test_shapefile_zip_python3(self, Session):
class TestLoadTabulator(TestLoadBase):
def test_simple(self, Session):
csv_filepath = get_sample_filepath("simple.xls")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_table(
csv_filepath,
resource_id=resource_id,
@@ -891,7 +891,7 @@ def test_simple(self, Session):
assert (
"'galway':"
in self._get_records(
- Session, "test1", limit=1, exclude_full_text_column=False
+ Session, resource_id, limit=1, exclude_full_text_column=False
)[0][1]
)
# Indexed record looks like this (depending on CKAN version?):
@@ -899,7 +899,7 @@ def test_simple(self, Session):
# "'-01':4,5 '00':6,7,8 '1':1 '2011':3 'galway':2"
# "'-01':2,3 '00':5,6 '1':7 '2011':1 'galway':8 't00':4"
- assert self._get_records(Session, "test1") == [
+ assert self._get_records(Session, resource_id) == [
(1, datetime.datetime(2011, 1, 1, 0, 0), Decimal("1"), u"Galway",),
(
2,
@@ -927,14 +927,14 @@ def test_simple(self, Session):
u"Berkeley",
),
]
- assert self._get_column_names(Session, "test1") == [
+ assert self._get_column_names(Session, resource_id) == [
u"_id",
u"_full_text",
u"date",
u"temperature",
u"place",
]
- assert self._get_column_types(Session, "test1") == [
+ assert self._get_column_types(Session, resource_id) == [
u"int4",
u"tsvector",
u"timestamp",
@@ -948,8 +948,8 @@ def test_boston_311_complete(self):
# to get the test file:
# curl -o ckanext/xloader/tests/samples/boston_311.csv https://data.boston.gov/dataset/8048697b-ad64-4bfc-b090-ee00169f2323/resource/2968e2c0-d479-49ba-a884-4ef523ada3c0/download/311.csv # noqa
csv_filepath = get_sample_filepath("boston_311.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
import time
t0 = time.time()
@@ -972,8 +972,8 @@ def test_boston_311_sample5(self):
# to create the test file:
# head -n 100001 ckanext/xloader/tests/samples/boston_311.csv > ckanext/xloader/tests/samples/boston_311_sample5.csv
csv_filepath = get_sample_filepath("boston_311_sample5.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
import time
t0 = time.time()
@@ -992,8 +992,8 @@ def test_boston_311_sample5(self):
def test_boston_311(self, Session):
csv_filepath = get_sample_filepath("boston_311_sample.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_table(
csv_filepath,
resource_id=resource_id,
@@ -1001,7 +1001,7 @@ def test_boston_311(self, Session):
logger=logger,
)
- records = self._get_records(Session, "test1")
+ records = self._get_records(Session, resource_id)
print(records)
assert records == [
(
@@ -1101,8 +1101,8 @@ def test_boston_311(self, Session):
u"Citizens Connect App",
),
] # noqa
- print(self._get_column_names(Session, "test1"))
- assert self._get_column_names(Session, "test1") == [
+ print(self._get_column_names(Session, resource_id))
+ assert self._get_column_names(Session, resource_id) == [
u"_id",
u"_full_text",
u"CASE_ENQUIRY_ID",
@@ -1135,8 +1135,8 @@ def test_boston_311(self, Session):
u"Longitude",
u"Source",
] # noqa
- print(self._get_column_types(Session, "test1"))
- assert self._get_column_types(Session, "test1") == [
+ print(self._get_column_types(Session, resource_id))
+ assert self._get_column_types(Session, resource_id) == [
u"int4",
u"tsvector",
u"numeric",
@@ -1174,8 +1174,8 @@ def test_no_entries(self):
csv_filepath = get_sample_filepath("no_entries.csv")
# no datastore table is created - we need to except, or else
# datastore_active will be set on a non-existent datastore table
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
with pytest.raises(LoaderError):
loader.load_table(
csv_filepath,
@@ -1186,24 +1186,24 @@ def test_no_entries(self):
def test_with_quoted_commas(self, Session):
csv_filepath = get_sample_filepath("sample_with_quoted_commas.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_table(
csv_filepath,
resource_id=resource_id,
mimetype="text/csv",
logger=logger,
)
- assert len(self._get_records(Session, "test1")) == 3
+ assert len(self._get_records(Session, resource_id)) == 3
def test_with_mixed_quotes(self, Session):
csv_filepath = get_sample_filepath("sample_with_mixed_quotes.csv")
- resource_id = "test1"
- factories.Resource(id=resource_id)
+ resource = factories.Resource()
+ resource_id = resource['id']
loader.load_table(
csv_filepath,
resource_id=resource_id,
mimetype="text/csv",
logger=logger,
)
- assert len(self._get_records(Session, "test1")) == 2
+ assert len(self._get_records(Session, resource_id)) == 2
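For readers skimming the diff, the change throughout this patch boils down to one shape (a minimal sketch; ckan.tests.factories.Resource generates a resource with a valid UUID-format id, so nothing needs to be hardcoded):

    from ckan.tests import factories

    def test_simple(self, Session):
        csv_filepath = get_sample_filepath("simple.csv")
        # Let the factory mint a valid, unique resource id instead of
        # hardcoding a short string like "test1".
        resource = factories.Resource()
        resource_id = resource['id']
        loader.load_csv(
            csv_filepath,
            resource_id=resource_id,
            mimetype="text/csv",
            logger=logger,
        )
        # Every subsequent lookup uses the generated id, never a literal.
        assert len(self._get_records(Session, resource_id)) == 6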
From b23b22c9426f0a7436cdad8bcb5b745bea4230ca Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Fri, 28 Jul 2023 15:44:31 +1000
Subject: [PATCH 019/102] [QOLDEV-424] ensure consistent column name ordering
in tests
- This intermittently breaks the boston_311 test when columns load in an unexpected order
---
ckanext/xloader/tests/test_loader.py | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index 8cc69a06..d55ec949 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -64,8 +64,12 @@ def _get_column_names(self, Session, table_name):
# SELECT column_name FROM information_schema.columns WHERE table_name='test1';
c = Session.connection()
sql = (
- "SELECT column_name FROM information_schema.columns "
- "WHERE table_name='{}';".format(table_name)
+ """
+ SELECT column_name
+ FROM information_schema.columns
+ WHERE table_name='{}'
+ ORDER BY ordinal_position;
+ """.format(table_name)
)
results = c.execute(sql)
records = results.fetchall()
@@ -74,8 +78,12 @@ def _get_column_names(self, Session, table_name):
def _get_column_types(self, Session, table_name):
c = Session.connection()
sql = (
- "SELECT udt_name FROM information_schema.columns "
- "WHERE table_name='{}';".format(table_name)
+ """
+ SELECT udt_name
+ FROM information_schema.columns
+ WHERE table_name='{}'
+ ORDER BY ordinal_position;
+ """.format(table_name)
)
results = c.execute(sql)
records = results.fetchall()
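The guarantee the tests now rely on comes from ordering on information_schema's ordinal_position; without an ORDER BY, PostgreSQL makes no promise about row order. The same query stated standalone (a sketch, assuming a plain psycopg2 connection, and using bind parameters rather than the string formatting the helper above uses):

    def get_column_names(conn, table_name):
        # ordinal_position is the column's position in the table definition,
        # so the result order is deterministic across runs.
        with conn.cursor() as cur:
            cur.execute(
                "SELECT column_name FROM information_schema.columns"
                " WHERE table_name = %s ORDER BY ordinal_position",
                (table_name,),
            )
            return [row[0] for row in cur.fetchall()]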
From bef37d2a561ecc3c27780e102a216df74c1582cc Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 1 Aug 2023 14:16:56 +1000
Subject: [PATCH 020/102] [QOLDEV-490] use fast loading for resources that
already have a data dictionary
- Tabulator is good at type guessing but is slow. Once it has configured the column types, there's no need to use it every time
---
ckanext/xloader/jobs.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py
index 0d242db1..f2263dec 100644
--- a/ckanext/xloader/jobs.py
+++ b/ckanext/xloader/jobs.py
@@ -212,7 +212,7 @@ def tabulator_load():
logger.info("'use_type_guessing' mode is: %s",
use_type_guessing)
try:
- if use_type_guessing:
+ if use_type_guessing and not loader.datastore_resource_exists(resource['id']):
tabulator_load()
else:
try:
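In essence, the change gates the slow path on the absence of an existing datastore table (a condensed sketch; direct_load is a hypothetical stand-in for the COPY-based branch in the real code):

    if use_type_guessing and not loader.datastore_resource_exists(resource['id']):
        # Slow path: let Tabulator sniff the column types from scratch.
        tabulator_load()
    else:
        # Fast path: the table (and hence its data dictionary) already
        # exists, so COPY the data straight in with the known types.
        direct_load()  # hypothetical stand-in for the COPY branch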
From 92009687c3a52aaf49bd00e39f4a90d40b05d871 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 1 Aug 2023 14:38:55 +1000
Subject: [PATCH 021/102] [QOLDEV-490] extend job timeout if type guessing is
needed
- Also refactor: move the function that checks whether a datastore entry exists from the job module to utils,
so different modules can use it.
---
ckanext/xloader/action.py | 4 ++++
ckanext/xloader/jobs.py | 14 +++++---------
ckanext/xloader/loader.py | 13 +------------
ckanext/xloader/utils.py | 19 ++++++++++++++++++-
4 files changed, 28 insertions(+), 22 deletions(-)
diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py
index 3fa26803..f52d8d77 100644
--- a/ckanext/xloader/action.py
+++ b/ckanext/xloader/action.py
@@ -153,6 +153,10 @@ def xloader_submit(context, data_dict):
}
}
timeout = config.get('ckanext.xloader.job_timeout', '3600')
+ if not utils.datastore_resource_exists(res_id):
+ # Expand timeout for resources that have to be type-guessed
+ timeout = timeout * 3
+
try:
job = enqueue_job(
jobs.xloader_data_into_datastore, [data], rq_kwargs=dict(timeout=timeout)
diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py
index f2263dec..94784bd7 100644
--- a/ckanext/xloader/jobs.py
+++ b/ckanext/xloader/jobs.py
@@ -18,10 +18,9 @@
from ckan import model
from ckan.plugins.toolkit import get_action, asbool, ObjectNotFound, config
-from . import loader
-from . import db
+from . import db, loader
from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError
-from .utils import set_resource_metadata
+from .utils import set_resource_metadata, should_guess_types
try:
from ckan.lib.api_token import get_user_from_token
@@ -206,13 +205,10 @@ def tabulator_load():
logger.info('Loading CSV')
# If ckanext.xloader.use_type_guessing is not configured, fall back to
# deprecated ckanext.xloader.just_load_with_messytables
- use_type_guessing = asbool(config.get(
- 'ckanext.xloader.use_type_guessing', config.get(
- 'ckanext.xloader.just_load_with_messytables', False)))
- logger.info("'use_type_guessing' mode is: %s",
- use_type_guessing)
+ use_type_guessing = should_guess_types(resource['id'])
+ logger.info("'use_type_guessing' mode is: %s", use_type_guessing)
try:
- if use_type_guessing and not loader.datastore_resource_exists(resource['id']):
+ if use_type_guessing:
tabulator_load()
else:
try:
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 2060a9ef..11eb637c 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -17,7 +17,7 @@
from .job_exceptions import FileCouldNotBeLoadedError, LoaderError
from .parser import CSV_SAMPLE_LINES, XloaderCSVParser
-from .utils import headers_guess, type_guess
+from .utils import datastore_resource_exists, headers_guess, type_guess
from ckan.plugins.toolkit import config
@@ -402,17 +402,6 @@ def send_resource_to_datastore(resource_id, headers, records):
.format(str(e)))
-def datastore_resource_exists(resource_id):
- from ckan import model
- context = {'model': model, 'ignore_auth': True}
- try:
- response = p.toolkit.get_action('datastore_search')(context, dict(
- id=resource_id, limit=0))
- except p.toolkit.ObjectNotFound:
- return False
- return response or {'fields': []}
-
-
def delete_datastore_resource(resource_id):
from ckan import model
context = {'model': model, 'user': '', 'ignore_auth': True}
diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py
index 0d2a182b..62b25320 100644
--- a/ckanext/xloader/utils.py
+++ b/ckanext/xloader/utils.py
@@ -9,7 +9,7 @@
from decimal import Decimal
import ckan.plugins as p
-from ckan.plugins.toolkit import config
+from ckan.plugins.toolkit import asbool, config
# resource.formats accepted by ckanext-xloader. Must be lowercase here.
DEFAULT_FORMATS = [
@@ -245,3 +245,20 @@ def type_guess(rows, types=TYPES, strict=False):
guesses_tuples = [(t, guess[t]) for t in types if t in guess]
_columns.append(max(guesses_tuples, key=lambda t_n: t_n[1])[0])
return _columns
+
+
+def datastore_resource_exists(resource_id):
+ context = {'model': model, 'ignore_auth': True}
+ try:
+ response = p.toolkit.get_action('datastore_search')(context, dict(
+ id=resource_id, limit=0))
+ except p.toolkit.ObjectNotFound:
+ return False
+ return response or {'fields': []}
+
+
+def should_guess_types(resource_id):
+ return asbool(
+ config.get('ckanext.xloader.use_type_guessing', config.get(
+ 'ckanext.xloader.just_load_with_messytables', False))) \
+ and datastore_resource_exists(resource_id)
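A corrected usage sketch of the timeout extension this patch introduces: the config value arrives as a string, so it needs int() before any arithmetic. (The version above omits the conversion, which patch 024 below addresses.)

    timeout = int(config.get('ckanext.xloader.job_timeout', '3600'))
    if not utils.datastore_resource_exists(res_id):
        # No existing table means every column type must be guessed,
        # which can take several times longer, so allow a larger budget.
        timeout = timeout * 3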
From 507508ef3a0082cc7e2e33a8b39dca9be33877a7 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 1 Aug 2023 15:22:58 +1000
Subject: [PATCH 022/102] [QOLDEV-490] always use COPY for large files
- Ignore the 'use_type_guessing' flag for large files since they will take too long.
---
ckanext/xloader/config_declaration.yaml | 13 ++++++++++---
ckanext/xloader/jobs.py | 11 +++++++++--
ckanext/xloader/utils.py | 9 +--------
3 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/ckanext/xloader/config_declaration.yaml b/ckanext/xloader/config_declaration.yaml
index b31f12e2..feb1cc9c 100644
--- a/ckanext/xloader/config_declaration.yaml
+++ b/ckanext/xloader/config_declaration.yaml
@@ -29,9 +29,7 @@ groups:
default: 1_000_000_000
example: 100000
description: |
- The connection string for the jobs database used by XLoader. The
- default of an sqlite file is fine for development. For production use a
- Postgresql database.
+ The maximum file size that XLoader will attempt to load.
type: int
required: false
- key: ckanext.xloader.use_type_guessing
@@ -48,6 +46,15 @@ groups:
type: bool
required: false
legacy_key: ckanext.xloader.just_load_with_messytables
+ - key: ckanext.xloader.max_type_guessing_length
+ default: 0
+ example: 100000
+ description: |
+ The maximum file size that will be passed to Tabulator if the
+ use_type_guessing flag is enabled. Larger files will use COPY even if
+ the flag is set. Defaults to 1/10 of the maximum content length.
+ type: int
+ required: false
- key: ckanext.xloader.parse_dates_dayfirst
default: False
example: False
diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py
index 94784bd7..7819c96e 100644
--- a/ckanext/xloader/jobs.py
+++ b/ckanext/xloader/jobs.py
@@ -7,6 +7,7 @@
import tempfile
import json
import datetime
+import os
import traceback
import sys
@@ -20,7 +21,7 @@
from . import db, loader
from .job_exceptions import JobError, HTTPError, DataTooBigError, FileCouldNotBeLoadedError
-from .utils import set_resource_metadata, should_guess_types
+from .utils import datastore_resource_exists, set_resource_metadata
try:
from ckan.lib.api_token import get_user_from_token
@@ -32,6 +33,8 @@
requests.packages.urllib3.disable_warnings()
MAX_CONTENT_LENGTH = int(config.get('ckanext.xloader.max_content_length') or 1e9)
+# Don't try Tabulator load on large files
+MAX_TYPE_GUESSING_LENGTH = int(config.get('ckanext.xloader.max_type_guessing_length') or MAX_CONTENT_LENGTH / 10)
MAX_EXCERPT_LINES = int(config.get('ckanext.xloader.max_excerpt_lines') or 0)
CHUNK_SIZE = 16 * 1024 # 16kb
DOWNLOAD_TIMEOUT = 30
@@ -205,7 +208,11 @@ def tabulator_load():
logger.info('Loading CSV')
# If ckanext.xloader.use_type_guessing is not configured, fall back to
# deprecated ckanext.xloader.just_load_with_messytables
- use_type_guessing = should_guess_types(resource['id'])
+ use_type_guessing = asbool(
+ config.get('ckanext.xloader.use_type_guessing', config.get(
+ 'ckanext.xloader.just_load_with_messytables', False))) \
+ and datastore_resource_exists(resource['id']) \
+ and os.path.getsize(tmp_file.name) <= MAX_TYPE_GUESSING_LENGTH
logger.info("'use_type_guessing' mode is: %s", use_type_guessing)
try:
if use_type_guessing:
diff --git a/ckanext/xloader/utils.py b/ckanext/xloader/utils.py
index 62b25320..994e6754 100644
--- a/ckanext/xloader/utils.py
+++ b/ckanext/xloader/utils.py
@@ -9,7 +9,7 @@
from decimal import Decimal
import ckan.plugins as p
-from ckan.plugins.toolkit import asbool, config
+from ckan.plugins.toolkit import config
# resource.formats accepted by ckanext-xloader. Must be lowercase here.
DEFAULT_FORMATS = [
@@ -255,10 +255,3 @@ def datastore_resource_exists(resource_id):
except p.toolkit.ObjectNotFound:
return False
return response or {'fields': []}
-
-
-def should_guess_types(resource_id):
- return asbool(
- config.get('ckanext.xloader.use_type_guessing', config.get(
- 'ckanext.xloader.just_load_with_messytables', False))) \
- and datastore_resource_exists(resource_id)
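Condensed, the decision after this patch combines three conditions (a sketch of the job-module code; note it also includes the missing 'not' that patch 025 below adds):

    use_type_guessing = (
        asbool(config.get(
            'ckanext.xloader.use_type_guessing', config.get(
                'ckanext.xloader.just_load_with_messytables', False)))
        and not datastore_resource_exists(resource['id'])
        and os.path.getsize(tmp_file.name) <= MAX_TYPE_GUESSING_LENGTH
    )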
From d425e31326855a579d3905ee1b44721beeccf79e Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 1 Aug 2023 15:56:29 +1000
Subject: [PATCH 023/102] adjust datastore tab link to work on current CKAN
2.10
---
ckanext/xloader/templates/package/resource_read.html | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ckanext/xloader/templates/package/resource_read.html b/ckanext/xloader/templates/package/resource_read.html
index 6d5f5ff2..56bf0266 100644
--- a/ckanext/xloader/templates/package/resource_read.html
+++ b/ckanext/xloader/templates/package/resource_read.html
@@ -1,6 +1,6 @@
{% ckan_extends %}
-{% block action_manage_inner %}
+{% block action_manage %}
{{ super() }}
{% if h.is_resource_supported_by_xloader(res) %}
- {% link_for _('DataStore'), named_route='xloader.resource_data', id=pkg.name, resource_id=res.id, class_='btn btn-light', icon='cloud-upload' %}
From 6df99ead7b017f7f19478fae9202b142f1a75be3 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 7 Aug 2023 09:28:25 +1000
Subject: [PATCH 024/102] [QOLDEV-490] fix timeout extension
- Can't multiply a string by 3; we get repeated text instead of arithmetic multiplication
---
ckanext/xloader/action.py | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/ckanext/xloader/action.py b/ckanext/xloader/action.py
index f52d8d77..e45394a9 100644
--- a/ckanext/xloader/action.py
+++ b/ckanext/xloader/action.py
@@ -152,10 +152,11 @@ def xloader_submit(context, data_dict):
'original_url': resource_dict.get('url'),
}
}
- timeout = config.get('ckanext.xloader.job_timeout', '3600')
- if not utils.datastore_resource_exists(res_id):
- # Expand timeout for resources that have to be type-guessed
- timeout = timeout * 3
+ # Expand timeout for resources that have to be type-guessed
+ timeout = config.get(
+ 'ckanext.xloader.job_timeout',
+ '3600' if utils.datastore_resource_exists(res_id) else '10800')
+ log.debug("Timeout for XLoading resource %s is %s", res_id, timeout)
try:
job = enqueue_job(
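The underlying Python behaviour is easy to demonstrate:

    >>> '3600' * 3           # str * int repeats the text
    '360036003600'
    >>> int('3600') * 3      # arithmetic needs an int first
    10800

Rather than converting and multiplying, the fix simply selects between the string defaults '3600' and '10800' up front.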
From 388bc7b70c7bd733380fc73fe4b74d384bb7b114 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 7 Aug 2023 09:50:30 +1000
Subject: [PATCH 025/102] [QOLDEV-490] fix load method logic
- Missed a 'not' when checking if datastore exists
---
ckanext/xloader/jobs.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py
index 7819c96e..9fae67c2 100644
--- a/ckanext/xloader/jobs.py
+++ b/ckanext/xloader/jobs.py
@@ -211,7 +211,7 @@ def tabulator_load():
use_type_guessing = asbool(
config.get('ckanext.xloader.use_type_guessing', config.get(
'ckanext.xloader.just_load_with_messytables', False))) \
- and datastore_resource_exists(resource['id']) \
+ and not datastore_resource_exists(resource['id']) \
and os.path.getsize(tmp_file.name) <= MAX_TYPE_GUESSING_LENGTH
logger.info("'use_type_guessing' mode is: %s", use_type_guessing)
try:
From da8c602e93922dc682994ff0bec39c479f5fc412 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 22 Aug 2023 13:11:19 +1000
Subject: [PATCH 026/102] [QOLDEV-554] fix overly aggressive timestamp parsing
- Use our guessed types to restrict the behaviour of the numeric/timestamp converter,
so columns that sniff as text won't be incorrectly partially converted to timestamps
---
ckanext/xloader/loader.py | 5 +--
ckanext/xloader/parser.py | 75 ++++++++++++++++++++++++++-------------
2 files changed, 54 insertions(+), 26 deletions(-)
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 11eb637c..92e990ed 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -16,7 +16,7 @@
import ckan.plugins as p
from .job_exceptions import FileCouldNotBeLoadedError, LoaderError
-from .parser import CSV_SAMPLE_LINES, XloaderCSVParser
+from .parser import CSV_SAMPLE_LINES, XloaderCSVParser, TypeConverter
from .utils import datastore_resource_exists, headers_guess, type_guess
from ckan.plugins.toolkit import config
@@ -279,9 +279,10 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
for t, h in zip(types, headers)]
headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
+ type_converter = TypeConverter(types=types)
with Stream(table_filepath, format=file_format, skip_rows=skip_rows,
- custom_parsers={'csv': XloaderCSVParser}) as stream:
+ post_parse=[type_converter.convert_types]) as stream:
def row_iterator():
for row in stream:
data_row = {}
diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py
index 82539f4d..b7a1c34e 100644
--- a/ckanext/xloader/parser.py
+++ b/ckanext/xloader/parser.py
@@ -1,11 +1,11 @@
# -*- coding: utf-8 -*-
import csv
+import datetime
from decimal import Decimal, InvalidOperation
from itertools import chain
from ckan.plugins.toolkit import asbool
-from dateutil.parser import isoparser, parser
-from dateutil.parser import ParserError
+from dateutil.parser import isoparser, parser, ParserError
from tabulator import helpers
from tabulator.parser import Parser
@@ -97,28 +97,7 @@ def type_value(value):
if value in ('', None):
return ''
- try:
- return Decimal(value)
- except InvalidOperation:
- pass
-
- try:
- i = isoparser()
- return i.isoparse(value)
- except ValueError:
- pass
-
- try:
- p = parser()
- yearfirst = asbool(config.get(
- 'ckanext.xloader.parse_dates_yearfirst', False))
- dayfirst = asbool(config.get(
- 'ckanext.xloader.parse_dates_dayfirst', False))
- return p.parse(value, yearfirst=yearfirst, dayfirst=dayfirst)
- except ParserError:
- pass
-
- return value
+ return to_number(value) or to_timestamp(value) or value
sample, dialect = self.__prepare_dialect(self.__chars)
items = csv.reader(chain(sample, self.__chars), dialect=dialect)
@@ -159,3 +138,51 @@ class dialect(csv.excel):
self.__dialect = dialect
return sample, dialect
+
+
+class TypeConverter:
+ """ Post-process table cells to convert strings into numbers and timestamps
+ as desired.
+ """
+
+ def __init__(self, types):
+ self.types = types
+
+ def convert_types(self, extended_rows):
+ """ Try converting cells to numbers or timestamps if applicable.
+ If a list of types was supplied, use that.
+ If not, then try converting each column to numeric first,
+ then to a timestamp. If both fail, just keep it as a string.
+ """
+ for row_number, headers, row in extended_rows:
+ for cell_index, cell_value in enumerate(row):
+ if cell_value is None:
+ row[cell_index] = ''
+ if cell_value:
+ cell_type = self.types[cell_index]
+ if cell_type == Decimal:
+ row[cell_index] = to_number(cell_value) or cell_value
+ elif cell_type == datetime.datetime:
+ row[cell_index] = to_timestamp(row[cell_index]) or cell_value
+ yield (row_number, headers, row)
+
+
+def to_number(value):
+ try:
+ return Decimal(value)
+ except InvalidOperation:
+ return None
+
+
+def to_timestamp(value):
+ try:
+ i = isoparser()
+ return i.isoparse(value)
+ except ValueError:
+ try:
+ p = parser()
+ yearfirst = asbool(config.get('ckanext.xloader.parse_dates_yearfirst', False))
+ dayfirst = asbool(config.get('ckanext.xloader.parse_dates_dayfirst', False))
+ return p.parse(value, yearfirst=yearfirst, dayfirst=dayfirst)
+ except ParserError:
+ return None
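Roughly, the extracted helpers behave like this (illustrative values; timestamp results depend on the dateutil version and the parse_dates_* settings):

    >>> to_number('42.5')
    Decimal('42.5')
    >>> to_number('Galway') is None       # InvalidOperation -> None
    True
    >>> to_timestamp('2018-07-19')        # ISO 8601 handled by isoparser
    datetime.datetime(2018, 7, 19, 0, 0)
    >>> to_timestamp('not a date') is None
    True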
From 13d076fe0e7bb108eebf445404262a2ba61b5e69 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 22 Aug 2023 13:26:34 +1000
Subject: [PATCH 027/102] [QOLDEV-554] skip conversion if value already has the
desired type
---
ckanext/xloader/parser.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py
index b7a1c34e..e063c762 100644
--- a/ckanext/xloader/parser.py
+++ b/ckanext/xloader/parser.py
@@ -168,6 +168,8 @@ def convert_types(self, extended_rows):
def to_number(value):
+ if isinstance(value, Decimal):
+ return value
try:
return Decimal(value)
except InvalidOperation:
@@ -175,6 +177,8 @@ def to_number(value):
def to_timestamp(value):
+ if isinstance(value, datetime.datetime):
+ return value
try:
i = isoparser()
return i.isoparse(value)
From 03967f7b69f182de6e99d29b858283fab085e7ed Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 22 Aug 2023 14:19:44 +1000
Subject: [PATCH 028/102] [QOLDEV-554] add unit test for time ranges being
preserved as strings
---
.../tests/samples/non_timestamp_sample.csv | 3 +++
ckanext/xloader/tests/test_loader.py | 19 +++++++++++++++++++
2 files changed, 22 insertions(+)
create mode 100644 ckanext/xloader/tests/samples/non_timestamp_sample.csv
diff --git a/ckanext/xloader/tests/samples/non_timestamp_sample.csv b/ckanext/xloader/tests/samples/non_timestamp_sample.csv
new file mode 100644
index 00000000..daf438e5
--- /dev/null
+++ b/ckanext/xloader/tests/samples/non_timestamp_sample.csv
@@ -0,0 +1,3 @@
+Title,Postal postcode,Latitude,Longitude,Mon am,Mon pm,Last updated
+Adavale,4474,-25.9092582,144.5975769,8:00,16:00,19/07/2018
+Aramac,4726,-22.971298,145.241481,9:00-13:00,14:00-16:45,17/07/2018
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index d55ec949..2752e11a 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -1215,3 +1215,22 @@ def test_with_mixed_quotes(self, Session):
logger=logger,
)
assert len(self._get_records(Session, resource_id)) == 2
+
+ def test_preserving_time_ranges(self, Session):
+ """ Time ranges should not be treated as timestamps
+ """
+ csv_filepath = get_sample_filepath("non_timestamp_sample.csv")
+ resource = factories.Resource()
+ resource_id = resource['id']
+ loader.load_table(
+ csv_filepath,
+ resource_id=resource_id,
+ mimetype="text/csv",
+ logger=logger,
+ )
+ assert self._get_records(Session, resource_id) == [
+ (1, "Adavale", 4474, Decimal("-25.9092582"), Decimal("144.5975769"),
+ "8:00", "16:00", datetime.datetime(2018, 7, 19)),
+ (2, "Aramac", 4726, Decimal("-22.971298"), Decimal("145.241481"),
+ "9:00-13:00", "14:00-16:45", datetime.datetime(2018, 7, 17))
+ ]
From f5fad5b65062e7ae94bfd87251b047b9cf8cf7ec Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 22 Aug 2023 14:44:32 +1000
Subject: [PATCH 029/102] [QOLDEV-554] add extra data to the unit test
---
ckanext/xloader/tests/samples/non_timestamp_sample.csv | 1 +
ckanext/xloader/tests/test_loader.py | 2 ++
2 files changed, 3 insertions(+)
diff --git a/ckanext/xloader/tests/samples/non_timestamp_sample.csv b/ckanext/xloader/tests/samples/non_timestamp_sample.csv
index daf438e5..d1b39e90 100644
--- a/ckanext/xloader/tests/samples/non_timestamp_sample.csv
+++ b/ckanext/xloader/tests/samples/non_timestamp_sample.csv
@@ -1,3 +1,4 @@
Title,Postal postcode,Latitude,Longitude,Mon am,Mon pm,Last updated
Adavale,4474,-25.9092582,144.5975769,8:00,16:00,19/07/2018
Aramac,4726,-22.971298,145.241481,9:00-13:00,14:00-16:45,17/07/2018
+Barcaldine,4725,-23.55327901,145.289156,9:00-12:30,13:30-16:30,20/07/2018
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index 2752e11a..c01d830c 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -1233,4 +1233,6 @@ def test_preserving_time_ranges(self, Session):
"8:00", "16:00", datetime.datetime(2018, 7, 19)),
(2, "Aramac", 4726, Decimal("-22.971298"), Decimal("145.241481"),
"9:00-13:00", "14:00-16:45", datetime.datetime(2018, 7, 17))
+ (3, "Barcaldine", 4725, Decimal("-23.55327901"), Decimal("145.289156"),
+ "9:00-12:30", "13:30-16:30", datetime.datetime(2018, 7, 20))
]
From e6b05c6450b4a5c2dd8e18fee59e15d38e4f95b9 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 22 Aug 2023 15:55:20 +1000
Subject: [PATCH 030/102] [QOLDEV-554] restrict recognised date types
- Apply a regex to limit the values that will potentially be parsed as dates.
We aren't interested in anything that doesn't seem to have day, month, or year components.
---
ckanext/xloader/parser.py | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py
index e063c762..d17b2f80 100644
--- a/ckanext/xloader/parser.py
+++ b/ckanext/xloader/parser.py
@@ -3,6 +3,8 @@
import datetime
from decimal import Decimal, InvalidOperation
from itertools import chain
+import re
+import six
from ckan.plugins.toolkit import asbool
from dateutil.parser import isoparser, parser, ParserError
@@ -13,6 +15,7 @@
from ckan.plugins.toolkit import config
CSV_SAMPLE_LINES = 1000
+DATE_REGEX = re.compile(r'''^\d{1,4}[-/.\s]\S+[-/.\s]\S+''')
class XloaderCSVParser(Parser):
@@ -168,8 +171,8 @@ def convert_types(self, extended_rows):
def to_number(value):
- if isinstance(value, Decimal):
- return value
+ if not isinstance(value, six.string_types):
+ return None
try:
return Decimal(value)
except InvalidOperation:
@@ -177,8 +180,8 @@ def to_number(value):
def to_timestamp(value):
- if isinstance(value, datetime.datetime):
- return value
+ if not isinstance(value, six.string_types) or not DATE_REGEX.search(value):
+ return None
try:
i = isoparser()
return i.isoparse(value)
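The effect of the regex gate, sketched: only values that start with one to four digits followed by plausible date separators ever reach dateutil.

    >>> bool(DATE_REGEX.search('19/07/2018'))    # day/month/year shape
    True
    >>> bool(DATE_REGEX.search('2018-07-19'))    # ISO shape
    True
    >>> bool(DATE_REGEX.search('9:00-13:00'))    # time range: ':' is not an accepted separator
    False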
From 00ee904e46009dd1be6beff41f65a45e2a443ce2 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 22 Aug 2023 15:59:37 +1000
Subject: [PATCH 031/102] [QOLDEV-554] fix missing comma
---
ckanext/xloader/tests/test_loader.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index c01d830c..f17e6c10 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -1232,7 +1232,7 @@ def test_preserving_time_ranges(self, Session):
(1, "Adavale", 4474, Decimal("-25.9092582"), Decimal("144.5975769"),
"8:00", "16:00", datetime.datetime(2018, 7, 19)),
(2, "Aramac", 4726, Decimal("-22.971298"), Decimal("145.241481"),
- "9:00-13:00", "14:00-16:45", datetime.datetime(2018, 7, 17))
+ "9:00-13:00", "14:00-16:45", datetime.datetime(2018, 7, 17)),
(3, "Barcaldine", 4725, Decimal("-23.55327901"), Decimal("145.289156"),
"9:00-12:30", "13:30-16:30", datetime.datetime(2018, 7, 20))
]
From e54b212e760b11f0d42f14dadcc0e8b0bc56b1fd Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 22 Aug 2023 16:17:46 +1000
Subject: [PATCH 032/102] [QOLDEV-554] replace custom parser with
post-processing
- Customising the parser requires a lot of boilerplate, and we already have a post-processor to do the job.
---
ckanext/xloader/jobs.py | 4 +-
ckanext/xloader/loader.py | 6 +-
ckanext/xloader/parser.py | 150 +++------------------------
ckanext/xloader/tests/test_parser.py | 10 +-
4 files changed, 23 insertions(+), 147 deletions(-)
diff --git a/ckanext/xloader/jobs.py b/ckanext/xloader/jobs.py
index 9fae67c2..9c6e0a67 100644
--- a/ckanext/xloader/jobs.py
+++ b/ckanext/xloader/jobs.py
@@ -28,6 +28,8 @@
except ImportError:
get_user_from_token = None
+log = logging.getLogger(__name__)
+
SSL_VERIFY = asbool(config.get('ckanext.xloader.ssl_verify', True))
if not SSL_VERIFY:
requests.packages.urllib3.disable_warnings()
@@ -82,7 +84,6 @@ def xloader_data_into_datastore(input):
db.mark_job_as_errored(job_id, str(e))
job_dict['status'] = 'error'
job_dict['error'] = str(e)
- log = logging.getLogger(__name__)
log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
errored = True
except Exception as e:
@@ -90,7 +91,6 @@ def xloader_data_into_datastore(input):
job_id, traceback.format_tb(sys.exc_info()[2])[-1] + repr(e))
job_dict['status'] = 'error'
job_dict['error'] = str(e)
- log = logging.getLogger(__name__)
log.error('xloader error: {0}, {1}'.format(e, traceback.format_exc()))
errored = True
finally:
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 92e990ed..4da314a8 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -16,7 +16,7 @@
import ckan.plugins as p
from .job_exceptions import FileCouldNotBeLoadedError, LoaderError
-from .parser import CSV_SAMPLE_LINES, XloaderCSVParser, TypeConverter
+from .parser import CSV_SAMPLE_LINES, TypeConverter
from .utils import datastore_resource_exists, headers_guess, type_guess
from ckan.plugins.toolkit import config
@@ -238,13 +238,13 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
try:
file_format = os.path.splitext(table_filepath)[1].strip('.')
with Stream(table_filepath, format=file_format,
- custom_parsers={'csv': XloaderCSVParser}) as stream:
+ post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
with Stream(table_filepath, format=file_format,
- custom_parsers={'csv': XloaderCSVParser}) as stream:
+ post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
diff --git a/ckanext/xloader/parser.py b/ckanext/xloader/parser.py
index d17b2f80..812ccd1f 100644
--- a/ckanext/xloader/parser.py
+++ b/ckanext/xloader/parser.py
@@ -1,154 +1,24 @@
# -*- coding: utf-8 -*-
-import csv
import datetime
from decimal import Decimal, InvalidOperation
-from itertools import chain
import re
import six
from ckan.plugins.toolkit import asbool
from dateutil.parser import isoparser, parser, ParserError
-from tabulator import helpers
-from tabulator.parser import Parser
-
from ckan.plugins.toolkit import config
CSV_SAMPLE_LINES = 1000
DATE_REGEX = re.compile(r'''^\d{1,4}[-/.\s]\S+[-/.\s]\S+''')
-class XloaderCSVParser(Parser):
- """Extends tabulator CSVParser to detect datetime and numeric values.
- """
-
- # Public
-
- options = [
- 'delimiter',
- 'doublequote',
- 'escapechar',
- 'quotechar',
- 'quoting',
- 'skipinitialspace',
- 'lineterminator'
- ]
-
- def __init__(self, loader, force_parse=False, **options):
- super(XloaderCSVParser, self).__init__(loader, force_parse, **options)
- # Set attributes
- self.__loader = loader
- self.__options = options
- self.__force_parse = force_parse
- self.__extended_rows = None
- self.__encoding = None
- self.__dialect = None
- self.__chars = None
-
- @property
- def closed(self):
- return self.__chars is None or self.__chars.closed
-
- def open(self, source, encoding=None):
- # Close the character stream, if necessary, before reloading it.
- self.close()
- self.__chars = self.__loader.load(source, encoding=encoding)
- self.__encoding = getattr(self.__chars, 'encoding', encoding)
- if self.__encoding:
- self.__encoding.lower()
- self.reset()
-
- def close(self):
- if not self.closed:
- self.__chars.close()
-
- def reset(self):
- helpers.reset_stream(self.__chars)
- self.__extended_rows = self.__iter_extended_rows()
-
- @property
- def encoding(self):
- return self.__encoding
-
- @property
- def dialect(self):
- if self.__dialect:
- dialect = {
- 'delimiter': self.__dialect.delimiter,
- 'doubleQuote': self.__dialect.doublequote,
- 'lineTerminator': self.__dialect.lineterminator,
- 'quoteChar': self.__dialect.quotechar,
- 'skipInitialSpace': self.__dialect.skipinitialspace,
- }
- if self.__dialect.escapechar is not None:
- dialect['escapeChar'] = self.__dialect.escapechar
- return dialect
-
- @property
- def extended_rows(self):
- return self.__extended_rows
-
- # Private
-
- def __iter_extended_rows(self):
-
- def type_value(value):
- """Returns numeric values as Decimal(). Uses dateutil to parse
- date values. Otherwise, returns values as it receives them
- (strings).
- """
- if value in ('', None):
- return ''
-
- return to_number(value) or to_timestamp(value) or value
-
- sample, dialect = self.__prepare_dialect(self.__chars)
- items = csv.reader(chain(sample, self.__chars), dialect=dialect)
- for row_number, item in enumerate(items, start=1):
- values = []
- for value in item:
- value = type_value(value)
- values.append(value)
- yield row_number, None, list(values)
-
- def __prepare_dialect(self, stream):
-
- # Get sample
- sample = []
- while True:
- try:
- sample.append(next(stream))
- except StopIteration:
- break
- if len(sample) >= CSV_SAMPLE_LINES:
- break
-
- # Get dialect
- try:
- separator = ''
- delimiter = self.__options.get('delimiter', ',\t;|')
- dialect = csv.Sniffer().sniff(separator.join(sample), delimiter)
- if not dialect.escapechar:
- dialect.doublequote = True
- except csv.Error:
- class dialect(csv.excel):
- pass
- for key, value in self.__options.items():
- setattr(dialect, key, value)
- # https://github.com/frictionlessdata/FrictionlessDarwinCore/issues/1
- if getattr(dialect, 'quotechar', None) == '':
- setattr(dialect, 'quoting', csv.QUOTE_NONE)
-
- self.__dialect = dialect
- return sample, dialect
-
-
class TypeConverter:
""" Post-process table cells to convert strings into numbers and timestamps
as desired.
"""
- def __init__(self, types):
+ def __init__(self, types=None):
self.types = types
def convert_types(self, extended_rows):
@@ -161,12 +31,18 @@ def convert_types(self, extended_rows):
for cell_index, cell_value in enumerate(row):
if cell_value is None:
row[cell_index] = ''
- if cell_value:
- cell_type = self.types[cell_index]
- if cell_type == Decimal:
- row[cell_index] = to_number(cell_value) or cell_value
- elif cell_type == datetime.datetime:
- row[cell_index] = to_timestamp(row[cell_index]) or cell_value
+ if not cell_value:
+ continue
+ cell_type = self.types[cell_index] if self.types else None
+ if cell_type in [Decimal, None]:
+ converted_value = to_number(cell_value)
+ if converted_value:
+ row[cell_index] = converted_value
+ continue
+ if cell_type in [datetime.datetime, None]:
+ converted_value = to_timestamp(cell_value)
+ if converted_value:
+ row[cell_index] = converted_value
yield (row_number, headers, row)
diff --git a/ckanext/xloader/tests/test_parser.py b/ckanext/xloader/tests/test_parser.py
index 67929d9f..ac4047dd 100644
--- a/ckanext/xloader/tests/test_parser.py
+++ b/ckanext/xloader/tests/test_parser.py
@@ -6,7 +6,7 @@
from datetime import datetime
from tabulator import Stream
-from ckanext.xloader.parser import XloaderCSVParser
+from ckanext.xloader.parser import TypeConverter
csv_filepath = os.path.abspath(
os.path.join(os.path.dirname(__file__), "samples", "date_formats.csv")
@@ -16,7 +16,7 @@
class TestParser(object):
def test_simple(self):
with Stream(csv_filepath, format='csv',
- custom_parsers={'csv': XloaderCSVParser}) as stream:
+ post_parse=[TypeConverter().convert_types]) as stream:
assert stream.sample == [
[
'date',
@@ -49,7 +49,7 @@ def test_simple(self):
def test_dayfirst(self):
print('test_dayfirst')
with Stream(csv_filepath, format='csv',
- custom_parsers={'csv': XloaderCSVParser}) as stream:
+ post_parse=[TypeConverter().convert_types]) as stream:
assert stream.sample == [
[
'date',
@@ -82,7 +82,7 @@ def test_dayfirst(self):
def test_yearfirst(self):
print('test_yearfirst')
with Stream(csv_filepath, format='csv',
- custom_parsers={'csv': XloaderCSVParser}) as stream:
+ post_parse=[TypeConverter().convert_types]) as stream:
assert stream.sample == [
[
'date',
@@ -115,7 +115,7 @@ def test_yearfirst(self):
@pytest.mark.ckan_config("ckanext.xloader.parse_dates_yearfirst", True)
def test_yearfirst_dayfirst(self):
with Stream(csv_filepath, format='csv',
- custom_parsers={'csv': XloaderCSVParser}) as stream:
+ post_parse=[TypeConverter().convert_types]) as stream:
assert stream.sample == [
[
'date',
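The simplification works because tabulator's post_parse hooks are plain generators over (row_number, headers, row) tuples, with none of the Parser subclass boilerplate. A minimal custom processor in the same style, for comparison (a sketch; 'data.csv' and strip_cells are placeholder names):

    from tabulator import Stream

    def strip_cells(extended_rows):
        # Each processor receives and re-yields (row_number, headers, row).
        for row_number, headers, row in extended_rows:
            row = [cell.strip() if isinstance(cell, str) else cell
                   for cell in row]
            yield (row_number, headers, row)

    with Stream('data.csv', format='csv', post_parse=[strip_cells]) as stream:
        rows = stream.read()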
From d40e24578b2e55b7273de36c9e45b665bdad74b4 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Wed, 11 Oct 2023 10:17:46 +1000
Subject: [PATCH 033/102] [QOLSVC-3224] handle any falsy url_type the same way
as empty string
---
ckanext/xloader/helpers.py | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/ckanext/xloader/helpers.py b/ckanext/xloader/helpers.py
index 829b7b74..8b9dee8f 100644
--- a/ckanext/xloader/helpers.py
+++ b/ckanext/xloader/helpers.py
@@ -35,8 +35,12 @@ def is_resource_supported_by_xloader(res_dict, check_access=True):
user_has_access = toolkit.h.check_access('package_update', {'id': res_dict.get('package_id')})
else:
user_has_access = True
- try:
- is_supported_url_type = res_dict.get('url_type') not in toolkit.h.datastore_rw_resource_url_types()
- except AttributeError:
- is_supported_url_type = (res_dict.get('url_type') == 'upload' or res_dict.get('url_type') == '')
+ url_type = res_dict.get('url_type')
+ if url_type:
+ try:
+ is_supported_url_type = url_type not in toolkit.h.datastore_rw_resource_url_types()
+ except AttributeError:
+ is_supported_url_type = (url_type == 'upload')
+ else:
+ is_supported_url_type = True
return (is_supported_format or is_datastore_active) and user_has_access and is_supported_url_type
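The guard above treats any falsy url_type (None as well as the empty string) as a plain link or upload, and only consults the datastore helper when it exists. A condensed sketch of the same decision, with rw_url_types standing in for toolkit.h.datastore_rw_resource_url_types():

    def supported_url_type(url_type, rw_url_types=None):
        # Falsy url_type (None, '') means a plain link or upload: supported.
        if not url_type:
            return True
        if rw_url_types is not None:
            # Newer CKAN exposes the read/write url_types via a helper.
            return url_type not in rw_url_types
        # Older CKAN without the helper: only direct uploads are supported.
        return url_type == 'upload'

    assert supported_url_type(None)
    assert supported_url_type('')
    assert not supported_url_type('datastore', rw_url_types=['datastore'])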
From 589800078b29e35c2315de2e5ca51177f7530e15 Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 23 Oct 2023 15:16:47 +1000
Subject: [PATCH 034/102] [QOLSVC-2984] add test for ISO-8859-1 CSV encoding
---
.../xloader/tests/samples/non_utf8_sample.csv | 267 ++++++++++++++++++
ckanext/xloader/tests/test_loader.py | 12 +
2 files changed, 279 insertions(+)
create mode 100644 ckanext/xloader/tests/samples/non_utf8_sample.csv
diff --git a/ckanext/xloader/tests/samples/non_utf8_sample.csv b/ckanext/xloader/tests/samples/non_utf8_sample.csv
new file mode 100644
index 00000000..334c1005
--- /dev/null
+++ b/ckanext/xloader/tests/samples/non_utf8_sample.csv
@@ -0,0 +1,267 @@
+"ClientId_ActNo","Owner","Amount","SenderName","DateRec","PCode"
+"206681442213","MS MARIE LOUISE SEXTON ","477.05","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","3206"
+"206681442214","MR DAVID SHEARER","3.79","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","2213"
+"206681442215","MRS M SHONK + MR E T SHONK ","10.3","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","2093"
+"206681442216","MS AGATHA SKOURTIS","108.42","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","3025"
+"206681442217","MR JAMES SMITH","108.42","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","4811"
+"206681442218","MRS JILLIAN MELINDA SMITH","602.27","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","2752"
+"206681442219","MISS JESSICA SARAH STEAD","174.01","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","2040"
+"206681442220","MISS CHAU DONG MINH TANG","542.1","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","3065"
+"206681442221","MR TROY TAYLOR","240.69","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","4000"
+"206681442222","MR ANDREW PHILIP THOMPSON","2.17","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","2204"
+"206681442223","MR IVAN CONRAD TIMBS","702.02","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","2612"
+"206681442224","MR J WAJNTRAUB + MRS S WAJNTRAUB ","542.1","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","3205"
+"206681442225","MR HOWARD GRENVILLE WEBBER","400.61","VIRGIN AUSTRALIA HOLDINGS LIMITED","2012-02-28 00:00:00","4556"
+"206681442226","JANI ILARI KALLA","10","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","6157"
+"206681442227","GARY JOHN & DESLEY L CAHILL","10","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4816"
+"206681442228","CARMEL ANASTASIA MEAGLIA","10","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2205"
+"206681442229","ASHLEY & ANNIE BRUGGEMANN","10","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4671"
+"206681442230","TERRY & MARY RITCHIE","10","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4069"
+"206681442231","BODY CORPORATE VILLAGE WAY CTS 19459","10","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4214"
+"206681442232","MATHEW JOHN SHORTLAND","10","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2573"
+"206681442233","TANYA MARIE TOWNSON","10.01","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4814"
+"206681442234","VENEE ELVA RUSSELL","10.02","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4035"
+"206681442235","ELIZABETH FERNANCE","10.03","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4223"
+"206681442236","CHARLES JOHN & OLWYN MARTIN","10.04","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4121"
+"206681442237","ALFRED BRETT SEILER","10.05","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4129"
+"206681442238","LOUISE WOODHAM & NATHAN FREY","10.07","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4400"
+"206681442239","MITRA KHAKBAZ","10.09","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4005"
+"206681442240","ALLAN EDWARD KILCULLEN","10.1","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4817"
+"206681442241","BEVAN JOHN LISTON","10.11","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4350"
+"206681442242","KRIS MICHAEL KANKAHAINEN","10.11","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4107"
+"206681442243","MICHAEL LYNN","10.16","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4860"
+"206681442244","ALAN RAYMOND & GERAL BURKITT","10.19","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4228"
+"206681442245","JENNIFER & NEVILLE MARXSEN","10.19","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4680"
+"206681442246","DARREN MAIN GRANT & LISA MARIE GROSSKOPF","10.2","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4504"
+"206681442247","PEARSON AUTOMOTIVE","10.23","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4064"
+"206681442248","MR SHANE HOPE & MISS YVONNE HILTON","10.24","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4173"
+"206681442249","CARMEL LESLEY NEILSON & WAYNE MERVYN NEILSON &","10.24","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4650"
+"206681442250","STEPHEN KENNETH ROBERTSON","10.24","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4740"
+"206681442251","SHIH CHE LIN","10.26","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4214"
+"206681442252","DAVID BRETT BROWNE","10.29","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4558"
+"206681442253","NEVILLE COLIN WOODHOUSE","10.32","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4814"
+"206681442254","DARRYN GREGORY & PET ROBIN","10.34","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4178"
+"206681442255","DUDLEY JESSER","10.38","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4814"
+"206681442256","MURRAY JOHN & SANDRA DIXON","10.38","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4870"
+"206681442257","SHATHISO JOHNSON BAREKI","10.38","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4215"
+"206681442258","ARTHUR EDWARD & MAUR MACDONALD","10.39","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4390"
+"206681442259","GARY GOLDBERG","10.4","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2480"
+"206681442260","PHUONG VAN NGO","10.41","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4508"
+"206681442261","JACQUELYN WILSON","10.42","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","3046"
+"206681442262","GARTH TURTON","10.42","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4051"
+"206681442263","DAVID JAMES & ANNE M O'ROURKE","10.43","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4701"
+"206681442264","ROBERT RUSSELL & VER MCKENZIE","10.45","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4503"
+"206681442265","ESTATE OF DULCIE L SYKES","10.48","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4215"
+"206681442266","LEESA GAYE OSMOND","10.51","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4671"
+"206681442267","DAVID JOHN & ROSEMAR GILES","10.54","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4303"
+"206681442268","SALLY & AQEEL AHMED","10.56","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4350"
+"206681442269","JUDITH MARJORY BURGESS","10.59","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","3101"
+"206681442270","TROY ANTONY EWART","10.61","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4301"
+"206681442271","RODULFO MANOY & GEORGE HAJEK","10.62","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4152"
+"206681442272","GLEN DUNSTAN","10.66","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","3196"
+"206681442273","ANNE RALSTON WRIGHT","10.73","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4825"
+"206681442274","ALAN & NICOLE MAREE JACKSON","10.74","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4720"
+"206681442275","DANIEL MALCOLM BROWN","10.81","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4501"
+"206681442276","JENNIFER DEMERAL","10.82","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4214"
+"206681442277","DARREN & LISA GARRETT","10.83","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4165"
+"206681442278","LORRAINE & PETER JACKSON","10.84","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4740"
+"206681442279","CHERYL MADELINE CAMPBELL","10.86","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4824"
+"206681442280","OLAF PETER PRILL","10.89","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4305"
+"206681442281","AJAY GIDH","10.9","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4051"
+"206681442282","DEBRA JOANNE PRINDABLE","10.9","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4178"
+"206681442283","MATTHEW WILLIAM CLARKE","10.96","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2914"
+"206681442284","MARK STANLEY MCKENZIE","11","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4207"
+"206681442285","TREVOR & JANICE GARWOOD","11","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4610"
+"206681442286","LISA ANNE BRATINA","11","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4228"
+"206681442287","MICHAEL GEORGE KIRKWOOD","11","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4561"
+"206681442288","STEPHAN & JULIE BAWDEN","11.04","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4114"
+"206681442289","PETER JOHN BOURKE","11.04","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4118"
+"206681442290","TYRONE PAGE & ULRIKE","11.07","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4301"
+"206681442291","SIMON ROBERT GRAY","11.08","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4006"
+"206681442292","ALLAN NICHOLAS SCHWARZROCK","11.12","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4650"
+"206681442293","IVAN J BLAKE & JAINE RIGTER","11.12","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4220"
+"206681442294","DAVID MATTHEW REGINA CHRISTIE","11.12","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4151"
+"206681442295","GEOFFREY WAYNE & EVAN GRIGG","11.14","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4720"
+"206681442296","KYLIE JANELLE HARDCASTLE","11.14","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4013"
+"206681442297","PAMELA ANN WELLER","11.15","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4655"
+"206681442298","JASON PATRICK & ELIZ MURPHY","11.16","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4511"
+"206681442299","MLADEN & VESNA SAJKO","11.19","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4157"
+"206681442300","DEAN STEPHEN BROCKENSHIRE","11.19","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2571"
+"206681442301","LISA CHRISTOBEL BOWKER","11.22","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4066"
+"206681442302","MATTHEW RAY EBBAGE","11.24","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4101"
+"206681442303","BRIAN & GEORGINA WHITLEY","11.25","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4521"
+"206681442304","HAYLEY WESTON","11.25","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4159"
+"206681442305","JAMES PATRICK HOCKING","11.28","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4127"
+"206681442306","ROBERT ANDREW & SARA BROWNHALL","11.29","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4069"
+"206681442307","EDWARD JAMES DODGSON","11.3","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4069"
+"206681442308","MELISSA JOY DODD","11.32","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4069"
+"206681442309","JOSHUA CALVIN BEGENT","11.38","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4306"
+"206681442311","DORATHY AMANDA WALTERS","11.4","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4744"
+"206681442312","RICHARD ROBERTS & KYM RALEIGH","11.4","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4053"
+"206681442313","SAMARA INSOLL","11.48","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4212"
+"206681442314","NEIL GREGORY FLESSER","11.49","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4800"
+"206681442315","EUNICE GLADYS WILBRAHAM","11.51","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4570"
+"206681442316","KARA NICOLE MCINNES","11.57","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4503"
+"206681442317","DAVID BLYTH","11.58","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4078"
+"206681442318","KEVIN & MARION KEIR","11.58","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4216"
+"206681442319","FRANCES & CHARLES KEEBLE","11.59","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4500"
+"206681442320","LYNETTE ANNE & PETER NISSEN","11.6","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4069"
+"206681442321","DANIEL PETER JOHNSON","11.61","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4051"
+"206681442322","ALLAN & EUNICE DELLAWAY","11.62","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4122"
+"206681442323","CHRISTOPHER JOHN BEEM","11.63","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4101"
+"206681442324","DAVID JAMES & KELLIE POULTON","11.64","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4350"
+"206681442325","MAVIS CAROLIN SCOTT","11.64","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4018"
+"206681442326","REEGAN & ADAM MARTIN","11.68","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2153"
+"206681442327","DENYSE B BONNEY","11.7","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4811"
+"206681442328","JAMES ANDERSON","11.71","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4220"
+"206681442329","SUSANNAH PINTER","11.72","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4744"
+"206681442330","BRENTON MARK & KAREN GARNETT","11.78","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4306"
+"206681442331","PL CAMELOT VENTURES AS TRUSTEE FOR K F T TRUST NO","11.82","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4215"
+"206681442332","RON HENRY SCHMIDT","11.84","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","830"
+"206681442333","ROSS COCKBURN & AUDREY KILL","11.86","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4010"
+"206681442334","BENJAMIN CLARK","11.88","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4306"
+"206681442335","IRIS LEAH TERESA BAKER","11.9","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2170"
+"206681442336","MARK JOHN DEEBLE","11.94","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4740"
+"206681442337","CHRISTINE & BARRY RIGBY","11.94","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2485"
+"206681442338","NATASHA ANN WOODWARD","11.97","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4869"
+"206681442339","BENJAMIN JOHN CANSDALE","11.98","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4064"
+"206681442340","PETER HERALD","11.98","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4184"
+"206681442341","SIMON CUSHWAY","11.99","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4154"
+"206681442342","ANTHONY & MICHELLE JOHNSTON","12","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4551"
+"206681442343","PAUL HAUCK","12.03","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4000"
+"206681442344","RONALD ALBERT & PEAR NORTHILL","12.03","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4413"
+"206681442345","ROBYN ELLEN SOMERS","12.03","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4178"
+"206681442346","ROSE ANN HODGMAN","12.06","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4068"
+"206681442347","JOHN & MARDI BOLTON","12.09","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4165"
+"206681442348","KRYSTYNA RENNIE","12.09","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4053"
+"206681442349","JOANNE BARSBY","12.12","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4350"
+"206681442350","BRENDAN JAMES FELSCHOW","12.14","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4508"
+"206681442351","MARTIN WILLIAM HARRISON","12.16","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4870"
+"206681442352","PATRICK HEINEMANN","12.16","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4870"
+"206681442353","ELEKRA & SPENCER RORIE","12.17","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4211"
+"206681442354","ROBERT CLIVE & NOELE CROCKER","12.19","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4211"
+"206681442355","DANIEL JOSEPH & DAVI CARMICHAEL","12.21","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4160"
+"206681442356","WENBO JIANG & XIU FAN CHEN","12.24","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4303"
+"206681442357","NOEL JEFFREY BRADY","12.27","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4550"
+"206681442358","DARREN RICHARD GOSSNER & MATTHEW JOHN ANDERSON","12.29","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4500"
+"206681442359","STEPHEN MICHAEL & MA JOLLY","12.3","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4350"
+"206681442360","SHONA & ARCHIE WALLACE","12.34","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4504"
+"206681442361","ZOFIA HYS","12.34","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4300"
+"206681442362","PIROSKA KING","12.38","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4154"
+"206681442363","ARVIN CHAND & AMITA MOHINI","12.38","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4503"
+"206681442364","WIETSKE GERARDINA & GAUNT","12.38","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4309"
+"206681442365","MARK REGINALD MATTHEWS","12.39","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4811"
+"206681442366","SHARP ARLEEN & CLINTON","12.4","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","6020"
+"206681442367","EMOKE & LASZLO & MAR ZSOLDOS","12.41","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4005"
+"206681442368","MARK & KARON KELLER","12.42","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4702"
+"206681442369","JODIE KATRINA & TONY MCLACHLAN","12.43","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4350"
+"206681442370","ALAN WARWICK & LINDA LEWIS","12.45","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4670"
+"206681442371","ADRIAN WAYNE LORRAWAY","12.5","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4702"
+"206681442372","NICHOLE KRISTY MIKLOS","12.53","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4152"
+"206681442373","NATASHA LEANNE HAYES","12.54","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4017"
+"206681442374","KAREN LEE & DARREN J SHEEHAN","12.55","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4516"
+"206681442375","RACHAEL MAY COLLINS-COOK","12.58","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4211"
+"206681442376","TAMARA JUNE WEIGHT & SUSANNE ELIZABETH DEVINE","12.59","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4814"
+"206681442377","RODNEY GATES","12.59","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","7015"
+"206681442378","REBECCA & LEE-ANNE SMITH","12.61","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","830"
+"206681442379","ADAM WILLIAM JOHNSON","12.62","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4069"
+"206681442380","ZAC ASHLEY & ALEXAND MORGAN","12.63","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4165"
+"206681442381","HILARY SEALY","12.64","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4211"
+"206681442382","NAOMI JOHNSTONE & SCOTT LENAN","12.68","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4207"
+"206681442383","WAYNE FLICKER","12.7","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2756"
+"206681442384","BRENDA ANDERSON","12.71","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4811"
+"206681442385","MATTHEW JAMES ALLEN","12.71","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4002"
+"206681442386","MARIA-THERESIA ALTENHOFEN-CROSS & JOHN ERI CROSS","12.72","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4570"
+"206681442387","MELODIE ZYLSTRA","12.72","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4570"
+"206681442388","AMANDA & GRAHAM SWALLOW","12.75","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4720"
+"206681442389","GRAEME ROBERT & ROBI DOHERTY","12.75","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4214"
+"206681442390","GILLIAN LEIGH O'SULLIVAN","12.79","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4817"
+"206681442391","JULIA MELLICK","12.84","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4178"
+"206681442392","TOLISIALE & HAMAKO MAHINA","12.87","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4300"
+"206681442393","SIMON JOHN STEVENS","12.89","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4305"
+"206681442394","MICHAEL ANTHONY & DE SNELSON","12.89","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4817"
+"206681442395","QUERIDA JO LOFTES","12.89","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4812"
+"206681442396","LORRAINE VICTORIA DIAS","12.89","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4000"
+"206681442397","JOHN MICHAEL TRAVIS LINLEY","12.92","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4051"
+"206681442398","CAROLINE HENDRY & RICHARD HOPKINS","12.93","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4067"
+"206681442399","JOSH EAGLE","12.95","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4030"
+"206681442400","MARK SHAWN FROST & BELINDA JEAN MARSHALL","12.95","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4221"
+"206681442401","BRENT & GABRIELLE ANTHONY","12.96","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4213"
+"206681442402","RICHARD SADLER","12.98","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4065"
+"206681442403","GROVE FRUIT JUICE PTY LTD","13","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4122"
+"206681442404","LEAH SPARKS","13","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4561"
+"206681442405","JAMES MAURICE & PATR GORDON","13","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4870"
+"206681442406","MARK JOSEPH SEARS","13","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4565"
+"206681442407","SOPHIE VICTORIA STEWART & TREVOR MATTHEW ROWE","13","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4055"
+"206681442408","BOBBY JAMES & SIMONE TAYLOR","13.02","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","6254"
+"206681442409","PATRICK MICHAEL & ME REEVES","13.08","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4101"
+"206681442410","MAURICE GROGNUZ","13.09","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4670"
+"206681442411","ALAN PIGOTT & ALAN CONDER","13.11","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2025"
+"206681442412","SAMANTHA & CAMERON SCHELBACH","13.16","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4309"
+"206681442413","SHERIDAN ANNE ST CLAIR","13.16","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4164"
+"206681442414","ANDREW CHRISTIE","13.17","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4521"
+"206681442415","MARK ANDREW & MELISS VINTON","13.17","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4508"
+"206681442416","IRWIN DOUGLAS & MARI SORENSEN","13.2","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4305"
+"206681442417","CARLY SUSAN BENNETTS","13.23","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4034"
+"206681442418","RYAN THORNTON","13.24","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2560"
+"206681442419","RICHARD BAILEY","13.26","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","3850"
+"206681442420","DAVID IAN & EMILY RU PRYOR","13.27","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4120"
+"206681442421","WILLIAM SINCLAIR","13.3","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4514"
+"206681442422","CATHERINE LUCILLE VALENTINE & ROBERT WAREING","13.3","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4165"
+"206681442423","RAYMOND JAMES JONES","13.3","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4170"
+"206681442424","ANDREW STEWART T/A AWE COMMUNICATIONS","13.3","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4207"
+"206681442425","TONY RONALD OSBOURNE","13.35","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4311"
+"206681442426","MARK JOHN & LENY FIG O'HARA","13.35","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4825"
+"206681442427","CECILIA ASHLEY & DAV BUTLER","13.35","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4816"
+"206681442428","WILLIAM LEATHAM","13.36","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4170"
+"206681442429","MAXWELL RAYMOND MATHERS & DENISE MAREE MELLARE","13.44","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4129"
+"206681442430","RENE & JACQUELINE WASSERFUHR","13.44","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4556"
+"206681442431","MICHAEL LEIGH KENNEDY","13.48","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4610"
+"206681442432","MEDECO MEDICAL CENTRE BEENLEIGH","13.5","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4207"
+"206681442433","GARY PAUL & GAYE SHELLEY","13.5","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4510"
+"206681442434","STEVE & BRENDA GEIGER","13.53","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4740"
+"206681442435","GREGORY BERNARD JAMES","13.53","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4051"
+"206681442436","ROBBIE DEEBLE","13.56","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4740"
+"206681442437","OWEN TRAYNOR","13.56","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","6076"
+"206681442438","TONI MICHELLE & SHAN MORGAN","13.59","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4717"
+"206681442439","NICOLAS VAN HORTON","13.59","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4220"
+"206681442440","IAN BOWDEN","13.6","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4886"
+"206681442441","QUEENSLAND COUNTRY CREDIT UNION - JIMBOOMBA","13.61","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4814"
+"206681442442","ALANA FELLINGHAM","13.62","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4551"
+"206681442443","ALLAN JOHN & CARMEL BETHEL","13.62","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4871"
+"206681442444","PETER WILLIAM & ODET NORMAN","13.63","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4350"
+"206681442445","EMILY & MATTHEW PARSLOW","13.68","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4173"
+"206681442446","JAMES OI YUEN GOCK","13.69","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2049"
+"206681442447","JODIE ELIZABETH MORRISON","13.7","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4280"
+"206681442448","BELINDA JANE HARNETT-PETERS & RANDALL NEI PETERS","13.74","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4017"
+"206681442449","JULIEN & CHRISTIAN JUVIGNY","13.78","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4215"
+"206681442450","SUSAN JOY MURRAY & THOMAS HOGAN","13.79","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4217"
+"206681442451","PATRICK COLIN & HEAT HARRIS","13.8","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4506"
+"206681442452","LINDY BOTHA","13.84","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4154"
+"206681442453","PATRICIA LORETTA & D KNIGHT","13.85","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4650"
+"206681442454","COWBURN CONSULTING PTY LTD","13.87","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4000"
+"206681442455","SPENCER JAMES HAMILTON","13.9","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4300"
+"206681442456","ANNA LOUISE ROSS","13.95","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4170"
+"206681442457","JOHN HUGH & BOB SUTHERLAND","13.98","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4509"
+"206681442458","ROBERTA MARY MACNEE","13.99","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4567"
+"206681442459","MATTHEW CHRISTENSEN","14.03","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4152"
+"206681442460","TROY & KIRSTY JEFFRIES","14.04","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4370"
+"206681442461","WILLIAM GEORGE BALSDON","14.05","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4878"
+"206681442462","JAIME LISA CAMPBELL & DANIEL BEVERIDGE","14.07","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4216"
+"206681442463","NANCY JOHANNESSON","14.11","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4505"
+"206681442464","JOSHUA FRANK SEIDL","14.11","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4122"
+"206681442465","DAVID LESTER","14.16","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4817"
+"206681442466","MATHIAS DONALD","14.16","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4103"
+"206681442467","GLEN EVAN & HAYLEE L MARTIN","14.19","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4350"
+"206681442468","JOHN GORDON EVANS","14.19","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4814"
+"206681442469","DIANA NOYCE & LAURENCE VIZER T/A","14.2","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4502"
+"206681442470","GREIG MANLEY","14.22","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","3040"
+"206681442471","BRENDON ANSELL","14.23","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4171"
+"206681442472","CATHERINE A ROBERTSON & PAUL BROMILEY","14.27","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4064"
+"206681442473","ADAM LEE & SAMANTHA RANKIN","14.28","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4132"
+"206681442474","BERNICE BOYS","14.34","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4011"
+"206681442475","HAYLEY MICHELLE BURROW","14.34","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2153"
+"206681442476","SIONE FAUMUINA","14.42","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4815"
+"206681442477","GERARD JARMAN","14.44","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","3337"
+"206681442478","DOUGLAS CECIL GOOLEY","14.48","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","2481"
+"206681442479","ANTHONY AUGUSTO HENRIQUES T/A CAFÚ VILA FRANCA","14.5","SUNCORP GENERAL INSURANCE","2012-03-12 00:00:00","4020"
diff --git a/ckanext/xloader/tests/test_loader.py b/ckanext/xloader/tests/test_loader.py
index f17e6c10..ffb3dcba 100644
--- a/ckanext/xloader/tests/test_loader.py
+++ b/ckanext/xloader/tests/test_loader.py
@@ -1204,6 +1204,18 @@ def test_with_quoted_commas(self, Session):
)
assert len(self._get_records(Session, resource_id)) == 3
+ def test_with_iso_8859_1(self, Session):
+ csv_filepath = get_sample_filepath("non_utf8_sample.csv")
+ resource = factories.Resource()
+ resource_id = resource['id']
+ loader.load_table(
+ csv_filepath,
+ resource_id=resource_id,
+ mimetype="text/csv",
+ logger=logger,
+ )
+ assert len(self._get_records(Session, resource_id)) == 266
+
def test_with_mixed_quotes(self, Session):
csv_filepath = get_sample_filepath("sample_with_mixed_quotes.csv")
resource = factories.Resource()
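The sample above is deliberately saved as ISO-8859-1: the 'CAFÚ' in its last row encodes to the byte 0xDA, which is valid Latin-1 but not valid UTF-8 in that position, so a loader that assumes UTF-8 fails partway through the file. A minimal sketch of the failure mode, independent of the test suite:

    # 0xDA ('Ú' in ISO-8859-1) is not followed by a UTF-8 continuation byte.
    raw = '"ANTHONY AUGUSTO HENRIQUES T/A CAFÚ VILA FRANCA"'.encode('iso-8859-1')

    try:
        raw.decode('utf-8')
    except UnicodeDecodeError as e:
        print('UTF-8 decode fails:', e)

    print(raw.decode('iso-8859-1'))  # round-trips cleanly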
From af6aea6d7fb57009c20041f667a13d66f3d84d6d Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Mon, 23 Oct 2023 15:53:27 +1000
Subject: [PATCH 035/102] [QOLSVC-2984] handle Latin-1 encoding if UTF-8 fails
---
ckanext/xloader/loader.py | 49 +++++++++++++++++++++++++++++++--------
1 file changed, 39 insertions(+), 10 deletions(-)
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 4da314a8..aabaefbe 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -10,7 +10,7 @@
import psycopg2
from six.moves import zip
-from tabulator import config as tabulator_config, Stream, TabulatorException
+from tabulator import config as tabulator_config, EncodingError, Stream, TabulatorException
from unidecode import unidecode
import ckan.plugins as p
@@ -31,18 +31,45 @@
tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES
+class UnknownEncodingStream(object):
+ """ Provides a context manager that wraps a Tabulator stream
+ and tries multiple encodings if one fails.
+
+ This is particularly relevant for encodings like Latin-1, which are
+ mostly ASCII-compatible: the sample can be sniffed as UTF-8, only to
+ hit decoding errors later in the file.
+ """
+
+ def __init__(self, filepath, file_format, **kwargs):
+ self.filepath = filepath
+ self.file_format = file_format
+ self.stream_args = kwargs
+
+ def __enter__(self):
+ try:
+ self.stream = Stream(self.filepath, format=self.file_format,
+ **self.stream_args).__enter__()
+ except (EncodingError, UnicodeDecodeError):
+ self.stream = Stream(self.filepath, format=self.file_format,
+ encoding='latin1', **self.stream_args).__enter__()
+ return self.stream
+
+ def __exit__(self, *args):
+ return self.stream.__exit__(*args)
+
+
def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
'''Loads a CSV into DataStore. Does not create the indexes.'''
# Determine the header row
try:
file_format = os.path.splitext(csv_filepath)[1].strip('.')
- with Stream(csv_filepath, format=file_format) as stream:
+ with UnknownEncodingStream(csv_filepath, file_format) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
- with Stream(csv_filepath, format=file_format) as stream:
+ with UnknownEncodingStream(csv_filepath, file_format) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
@@ -73,7 +100,8 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
- with Stream(csv_filepath, format=file_format, skip_rows=skip_rows) as stream:
+ with UnknownEncodingStream(csv_filepath, file_format,
+ skip_rows=skip_rows) as stream:
stream.save(target=f_write.name, format='csv', encoding='utf-8',
delimiter=delimiter)
csv_filepath = f_write.name
@@ -237,14 +265,14 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Determining column names and types')
try:
file_format = os.path.splitext(table_filepath)[1].strip('.')
- with Stream(table_filepath, format=file_format,
- post_parse=[TypeConverter().convert_types]) as stream:
+ with UnknownEncodingStream(table_filepath, file_format,
+ post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
- with Stream(table_filepath, format=file_format,
- post_parse=[TypeConverter().convert_types]) as stream:
+ with UnknownEncodingStream(table_filepath, file_format,
+ post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
@@ -281,8 +309,9 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
type_converter = TypeConverter(types=types)
- with Stream(table_filepath, format=file_format, skip_rows=skip_rows,
- post_parse=[type_converter.convert_types]) as stream:
+ with UnknownEncodingStream(table_filepath, file_format,
+ skip_rows=skip_rows,
+ post_parse=[type_converter.convert_types]) as stream:
def row_iterator():
for row in stream:
data_row = {}
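A minimal usage sketch of the new context manager, assuming a local data.csv; extra keyword arguments pass straight through to tabulator's Stream:

    from ckanext.xloader.loader import UnknownEncodingStream

    with UnknownEncodingStream('data.csv', 'csv') as stream:
        for row in stream:
            print(row)

Note the fallback only fires if the decode error surfaces while __enter__ reads the sample; errors that appear later, e.g. during stream.save(), are handled by the next patch.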
From 121592bfc7ae46ab16de2bcfac19cdfb29a3371c Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Tue, 31 Oct 2023 15:57:34 +1000
Subject: [PATCH 036/102] [QOLSVC-2984] handle Latin-1 encoding during 'save'
call
- Handling the encoding during stream initialisation isn't enough here: save() re-reads the whole file and can hit decoding errors past the sampled rows
---
ckanext/xloader/loader.py | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index aabaefbe..f9cb625b 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -30,6 +30,8 @@
MAX_COLUMN_LENGTH = 63
tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES
+ISO_8859_ENCODING = 'latin1'
+
class UnknownEncodingStream(object):
""" Provides a context manager that wraps a Tabulator stream
@@ -51,7 +53,7 @@ def __enter__(self):
**self.stream_args).__enter__()
except (EncodingError, UnicodeDecodeError):
self.stream = Stream(self.filepath, format=self.file_format,
- encoding='latin1', **self.stream_args).__enter__()
+ encoding=ISO_8859_ENCODING, **self.stream_args).__enter__()
return self.stream
def __exit__(self, *args):
@@ -100,11 +102,16 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info('Ensuring character coding is UTF8')
f_write = tempfile.NamedTemporaryFile(suffix=file_format, delete=False)
try:
- with UnknownEncodingStream(csv_filepath, file_format,
- skip_rows=skip_rows) as stream:
- stream.save(target=f_write.name, format='csv', encoding='utf-8',
- delimiter=delimiter)
- csv_filepath = f_write.name
+ save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
+ try:
+ with UnknownEncodingStream(csv_filepath, file_format,
+ skip_rows=skip_rows) as stream:
+ stream.save(**save_args)
+ except (EncodingError, UnicodeDecodeError):
+ with Stream(csv_filepath, format=file_format, encoding=ISO_8859_ENCODING,
+ skip_rows=skip_rows) as stream:
+ stream.save(**save_args)
+ csv_filepath = f_write.name
# datastore db connection
engine = get_write_engine()
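Isolated from load_csv, the same two-step fallback looks roughly like this (a sketch assuming tabulator's Stream and EncodingError as imported above; 'latin1' matches ISO_8859_ENCODING):

    from tabulator import EncodingError, Stream

    def save_as_utf8(src, fmt, target):
        save_args = {'target': target, 'format': 'csv', 'encoding': 'utf-8'}
        try:
            with Stream(src, format=fmt) as stream:
                stream.save(**save_args)
        except (EncodingError, UnicodeDecodeError):
            # save() re-reads the whole file, so it can fail even after the
            # sample decoded cleanly; retry with the single-byte encoding.
            with Stream(src, format=fmt, encoding='latin1') as stream:
                stream.save(**save_args)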
From 202d1305d78311c52eda6e07917b406bbea0325e Mon Sep 17 00:00:00 2001
From: William Dutton
Date: Thu, 2 Nov 2023 07:06:14 +1000
Subject: [PATCH 037/102] QOLSVC-2984 use the chardet library to guess the
 encoding and apply it when confidence exceeds 70%, keeping the fallback
---
ckanext/xloader/loader.py | 40 ++++++++++++++++++++++++++++++---------
requirements.txt | 1 +
2 files changed, 32 insertions(+), 9 deletions(-)
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index f9cb625b..8f73f67d 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -9,6 +9,7 @@
from decimal import Decimal
import psycopg2
+from chardet.universaldetector import UniversalDetector
from six.moves import zip
from tabulator import config as tabulator_config, EncodingError, Stream, TabulatorException
from unidecode import unidecode
@@ -42,15 +43,21 @@ class UnknownEncodingStream(object):
only to run into problems later in the file.
"""
- def __init__(self, filepath, file_format, **kwargs):
+ def __init__(self, filepath, file_format, decoding_result, **kwargs):
self.filepath = filepath
self.file_format = file_format
self.stream_args = kwargs
+ self.decoding_result = decoding_result # {'encoding': 'EUC-JP', 'confidence': 0.99}
def __enter__(self):
try:
- self.stream = Stream(self.filepath, format=self.file_format,
- **self.stream_args).__enter__()
+
+ if self.decoding_result and self.decoding_result['confidence'] and self.decoding_result['confidence'] > 0.7:
+ self.stream = Stream(self.filepath, format=self.file_format, encoding=self.decoding_result['encoding'],
+ **self.stream_args).__enter__()
+ else:
+ self.stream = Stream(self.filepath, format=self.file_format, **self.stream_args).__enter__()
+
except (EncodingError, UnicodeDecodeError):
self.stream = Stream(self.filepath, format=self.file_format,
encoding=ISO_8859_ENCODING, **self.stream_args).__enter__()
@@ -60,18 +67,31 @@ def __exit__(self, *args):
return self.stream.__exit__(*args)
+def detect_encoding(file_path):
+ detector = UniversalDetector()
+ with open(file_path, 'rb') as file:
+ for line in file:
+ detector.feed(line)
+ if detector.done:
+ break
+ detector.close()
+ return detector.result # i.e. {'encoding': 'EUC-JP', 'confidence': 0.99}
+
+
def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
'''Loads a CSV into DataStore. Does not create the indexes.'''
+ decoding_result = detect_encoding(csv_filepath)
+ logger.info("load_csv: Decoded encoding: %s", decoding_result)
# Determine the header row
try:
file_format = os.path.splitext(csv_filepath)[1].strip('.')
- with UnknownEncodingStream(csv_filepath, file_format) as stream:
+ with UnknownEncodingStream(csv_filepath, file_format, decoding_result=decoding_result) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
- with UnknownEncodingStream(csv_filepath, file_format) as stream:
+ with UnknownEncodingStream(csv_filepath, file_format, decoding_result=decoding_result) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
@@ -104,7 +124,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
try:
save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
- with UnknownEncodingStream(csv_filepath, file_format,
+ with UnknownEncodingStream(csv_filepath, file_format, decoding_result=decoding_result,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
@@ -270,15 +290,17 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
# Determine the header row
logger.info('Determining column names and types')
+ decoding_result = detect_encoding(table_filepath)
+ logger.info("load_table: Decoded encoding: %s", decoding_result)
try:
file_format = os.path.splitext(table_filepath)[1].strip('.')
- with UnknownEncodingStream(table_filepath, file_format,
+ with UnknownEncodingStream(table_filepath, file_format, decoding_result=decoding_result,
post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
- with UnknownEncodingStream(table_filepath, file_format,
+ with UnknownEncodingStream(table_filepath, file_format, decoding_result=decoding_result,
post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
@@ -317,7 +339,7 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
type_converter = TypeConverter(types=types)
with UnknownEncodingStream(table_filepath, file_format,
- skip_rows=skip_rows,
+ skip_rows=skip_rows, decoding_result=decoding_result,
post_parse=[type_converter.convert_types]) as stream:
def row_iterator():
for row in stream:
diff --git a/requirements.txt b/requirements.txt
index 58540beb..fe92b6d7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ six>=1.12.0
tabulator==1.53.5
Unidecode==1.0.22
python-dateutil>=2.8.2
+chardet==5.2.0
\ No newline at end of file
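A quick way to exercise the detection step on its own (a sketch assuming a local file path; UniversalDetector is chardet's incremental API, and the result dict is what UnknownEncodingStream now receives):

    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()
    with open('non_utf8_sample.csv', 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    print(detector.result)  # e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}

    # The stream only trusts the guess above the 70% threshold:
    result = detector.result
    if result and result['confidence'] and result['confidence'] > 0.7:
        print('passing encoding=%r to Stream' % result['encoding'])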
From cb54151a2dc4ea579bf1f107c82f501e4baf0636 Mon Sep 17 00:00:00 2001
From: William Dutton
Date: Thu, 2 Nov 2023 07:06:44 +1000
Subject: [PATCH 038/102] QOLSVC-2984 address review comments: pass decoding_result positionally
---
ckanext/xloader/loader.py | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index 8f73f67d..233a46e6 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -75,7 +75,7 @@ def detect_encoding(file_path):
if detector.done:
break
detector.close()
- return detector.result # i.e. {'encoding': 'EUC-JP', 'confidence': 0.99}
+ return detector.result # e.g. {'encoding': 'EUC-JP', 'confidence': 0.99}
def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
@@ -86,12 +86,12 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
# Determine the header row
try:
file_format = os.path.splitext(csv_filepath)[1].strip('.')
- with UnknownEncodingStream(csv_filepath, file_format, decoding_result=decoding_result) as stream:
+ with UnknownEncodingStream(csv_filepath, file_format, decoding_result) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
- with UnknownEncodingStream(csv_filepath, file_format, decoding_result=decoding_result) as stream:
+ with UnknownEncodingStream(csv_filepath, file_format, decoding_result) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
raise LoaderError('Tabulator error: {}'.format(e))
@@ -124,7 +124,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
try:
save_args = {'target': f_write.name, 'format': 'csv', 'encoding': 'utf-8', 'delimiter': delimiter}
try:
- with UnknownEncodingStream(csv_filepath, file_format, decoding_result=decoding_result,
+ with UnknownEncodingStream(csv_filepath, file_format, decoding_result,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
@@ -294,13 +294,13 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
logger.info("load_table: Decoded encoding: %s", decoding_result)
try:
file_format = os.path.splitext(table_filepath)[1].strip('.')
- with UnknownEncodingStream(table_filepath, file_format, decoding_result=decoding_result,
+ with UnknownEncodingStream(table_filepath, file_format, decoding_result,
post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException:
try:
file_format = mimetype.lower().split('/')[-1]
- with UnknownEncodingStream(table_filepath, file_format, decoding_result=decoding_result,
+ with UnknownEncodingStream(table_filepath, file_format, decoding_result,
post_parse=[TypeConverter().convert_types]) as stream:
header_offset, headers = headers_guess(stream.sample)
except TabulatorException as e:
@@ -338,8 +338,8 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
headers = [header.strip()[:MAX_COLUMN_LENGTH] for header in headers if header.strip()]
type_converter = TypeConverter(types=types)
- with UnknownEncodingStream(table_filepath, file_format,
- skip_rows=skip_rows, decoding_result=decoding_result,
+ with UnknownEncodingStream(table_filepath, file_format, decoding_result,
+ skip_rows=skip_rows,
post_parse=[type_converter.convert_types]) as stream:
def row_iterator():
for row in stream:
From cd32d61a7ec5e4808f04ccee509128314b8da81a Mon Sep 17 00:00:00 2001
From: ThrawnCA
Date: Thu, 2 Nov 2023 09:18:30 +1000
Subject: [PATCH 039/102] [QOLSVC-2984] sniff using Windows-1252 encoding
rather than Latin-1
- Windows-1252 is a superset of Latin-1's printable characters, which makes it more useful for this purpose
---
ckanext/xloader/loader.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/ckanext/xloader/loader.py b/ckanext/xloader/loader.py
index f9cb625b..fc5b4ae3 100644
--- a/ckanext/xloader/loader.py
+++ b/ckanext/xloader/loader.py
@@ -30,7 +30,7 @@
MAX_COLUMN_LENGTH = 63
tabulator_config.CSV_SAMPLE_LINES = CSV_SAMPLE_LINES
-ISO_8859_ENCODING = 'latin1'
+SINGLE_BYTE_ENCODING = 'cp1252'
class UnknownEncodingStream(object):
@@ -53,7 +53,7 @@ def __enter__(self):
**self.stream_args).__enter__()
except (EncodingError, UnicodeDecodeError):
self.stream = Stream(self.filepath, format=self.file_format,
- encoding=ISO_8859_ENCODING, **self.stream_args).__enter__()
+ encoding=SINGLE_BYTE_ENCODING, **self.stream_args).__enter__()
return self.stream
def __exit__(self, *args):
@@ -108,7 +108,7 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
skip_rows=skip_rows) as stream:
stream.save(**save_args)
except (EncodingError, UnicodeDecodeError):
- with Stream(csv_filepath, format=file_format, encoding=ISO_8859_ENCODING,
+ with Stream(csv_filepath, format=file_format, encoding=SINGLE_BYTE_ENCODING,
skip_rows=skip_rows) as stream:
stream.save(**save_args)
csv_filepath = f_write.name
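The 'superset' claim is about the 0x80-0x9F range: ISO-8859-1 maps those bytes to invisible control characters, while Windows-1252 maps most of them to printable punctuation that real-world files actually use. A short demonstration:

    smart_quotes = b'\x93hello\x94'             # Windows-style curly quotes
    print(smart_quotes.decode('cp1252'))        # “hello”
    print(repr(smart_quotes.decode('latin1')))  # '\x93hello\x94' control chars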
From ae354e57d100be8ca0c494e94bebc657373af56b Mon Sep 17 00:00:00 2001
From: Jesse Vickery
Date: Wed, 8 Nov 2023 20:12:14 +0000
Subject: [PATCH 040/102] feat(templates): added missing csrf field;
- Added csrf field to delete button form.
---
ckanext/xloader/templates/xloader/resource_data.html | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ckanext/xloader/templates/xloader/resource_data.html b/ckanext/xloader/templates/xloader/resource_data.html
index 0ae1d9b5..11ce3404 100644
--- a/ckanext/xloader/templates/xloader/resource_data.html
+++ b/ckanext/xloader/templates/xloader/resource_data.html
@@ -9,9 +9,11 @@
{% block delete_ds_button %}
From 340f629c28a1d39090e75c4a46eb2503cd1295d5 Mon Sep 17 00:00:00 2001
From: Jesse Vickery
Date: Wed, 8 Nov 2023 20:15:09 +0000
Subject: [PATCH 041/102] feat(templates): added missing form action;
- Added post action to delete button form.
---
ckanext/xloader/templates/xloader/resource_data.html | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/ckanext/xloader/templates/xloader/resource_data.html b/ckanext/xloader/templates/xloader/resource_data.html
index 11ce3404..85b334fd 100644
--- a/ckanext/xloader/templates/xloader/resource_data.html
+++ b/ckanext/xloader/templates/xloader/resource_data.html
@@ -5,13 +5,15 @@
{% block primary_content_inner %}
{% set action = h.url_for('xloader.resource_data', id=pkg.name, resource_id=res.id) %}
+ {% set delete_action = h.url_for('xloader.delete_datastore_table', id=pkg.id, resource_id=res.id) %}
{% set show_table = true %}
{% block delete_ds_button %}
-