Data Quality Framework provides by Jabar Digital Service
DataSae/tests/data/config.json
Lines 1 to 183 in 46ef800
{ | |
"test_gsheet": { | |
"type": "gsheet", | |
"client_secret_file": "tests/data/creds.json", | |
"gsheet_id": "gsheet_id", | |
"checker": [ | |
{ | |
"sheet_name": "Sheet1", | |
"column": { | |
"alphabet": { | |
"string": { | |
"is_lowercase": null, | |
"regex_contain": "[a-z]" | |
} | |
}, | |
"ALPHABET": { | |
"datasae.string.String": { | |
"is_uppercase": null, | |
"regex_contain": "[A-Z]" | |
} | |
} | |
} | |
} | |
] | |
}, | |
"test_local": { | |
"type": "local", | |
"checker": [ | |
{ | |
"file_path": "tests/data/data.csv", | |
"sep": ",", | |
"column": { | |
"alphabet": { | |
"string": { | |
"is_lowercase": null, | |
"regex_contain": "[a-z]" | |
} | |
}, | |
"ALPHABET": { | |
"datasae.string.String": { | |
"is_uppercase": null, | |
"regex_contain": "[A-Z]" | |
} | |
} | |
} | |
}, | |
{ | |
"file_path": "tests/data/data.xlsx", | |
"column": { | |
"alphabet": { | |
"string": { | |
"is_lowercase": null, | |
"regex_contain": "[a-z]" | |
} | |
}, | |
"ALPHABET": { | |
"datasae.string.String": { | |
"is_uppercase": null, | |
"regex_contain": "[A-Z]" | |
} | |
} | |
} | |
} | |
] | |
}, | |
"test_mariadb_or_mysql": { | |
"type": "sql", | |
"drivername": "mysql+pymysql", | |
"username": "root", | |
"password": "testpassword", | |
"host": "localhost", | |
"port": 3306, | |
"database": "mysql", | |
"checker": [ | |
{ | |
"query": "tests/data/query.sql", | |
"column": { | |
"column_name": { | |
"integer": { | |
"equal_to": 1, | |
"less_than": 3, | |
"in_range": { | |
"lower_limit": 0, | |
"upper_limit": 2 | |
} | |
} | |
}, | |
"another_column_name": { | |
"datasae.integer.Integer": { | |
"equal_to": 5, | |
"less_than": 10, | |
"in_range": [4, 6] | |
} | |
}, | |
"boolean_column_name": { | |
"datasae.boolean.Boolean": {"is_bool": null} | |
} | |
} | |
} | |
] | |
}, | |
"test_postgresql": { | |
"type": "datasae.converter.sql.Sql", | |
"drivername": "postgresql", | |
"username": "postgres", | |
"password": "testpassword", | |
"host": "localhost", | |
"port": 5432, | |
"database": "postgres", | |
"checker": [ | |
{ | |
"query": "select 1 column_name, 5 another_column_name, false boolean_column_name;", | |
"column": { | |
"column_name": { | |
"integer": { | |
"equal_to": 1, | |
"less_than": 3, | |
"in_range": { | |
"lower_limit": 0, | |
"upper_limit": 2 | |
} | |
} | |
}, | |
"another_column_name": { | |
"datasae.integer.Integer": { | |
"equal_to": 5, | |
"less_than": 10, | |
"in_range": [4, 6] | |
} | |
}, | |
"boolean_column_name": { | |
"boolean": {"is_bool": null} | |
} | |
} | |
} | |
] | |
}, | |
"test_s3": { | |
"type": "s3", | |
"endpoint": "play.min.io", | |
"access_key": "Q3AM3UQ867SPQQA43P2F", | |
"secret_key": "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG", | |
"bucket_name": "bucket_name", | |
"checker": [ | |
{ | |
"object_name": "data.csv", | |
"column": { | |
"alphabet": { | |
"string": { | |
"is_lowercase": null, | |
"regex_contain": "[a-z]" | |
} | |
}, | |
"ALPHABET": { | |
"datasae.string.String": { | |
"is_uppercase": null, | |
"regex_contain": "[A-Z]" | |
} | |
} | |
} | |
}, | |
{ | |
"object_name": "data.xlsx", | |
"bucket_name": "another_bucket_name", | |
"sheet_name": "Sheet1", | |
"column": { | |
"alphabet": { | |
"string": { | |
"is_lowercase": null, | |
"regex_contain": "[a-z]" | |
} | |
}, | |
"ALPHABET": { | |
"datasae.string.String": { | |
"is_uppercase": null, | |
"regex_contain": "[A-Z]" | |
} | |
} | |
} | |
} | |
] | |
} | |
} |
DataSae/tests/data/config.yaml
Lines 1 to 120 in 46ef800
test_gsheet: | |
type: gsheet | |
client_secret_file: tests/data/creds.json | |
gsheet_id: gsheet_id | |
checker: | |
- sheet_name: Sheet1 | |
column: | |
alphabet: | |
string: | |
is_lowercase: | |
regex_contain: '[a-z]' | |
ALPHABET: | |
datasae.string.String: | |
is_uppercase: | |
regex_contain: '[A-Z]' | |
test_local: | |
type: local | |
checker: | |
- file_path: tests/data/data.csv | |
sep: ',' | |
column: | |
alphabet: | |
string: | |
is_lowercase: | |
regex_contain: '[a-z]' | |
ALPHABET: | |
datasae.string.String: | |
is_uppercase: | |
regex_contain: '[A-Z]' | |
- file_path: tests/data/data.xlsx | |
column: | |
alphabet: | |
string: | |
is_lowercase: | |
regex_contain: '[a-z]' | |
ALPHABET: | |
datasae.string.String: | |
is_uppercase: | |
regex_contain: '[A-Z]' | |
test_mariadb_or_mysql: | |
type: sql | |
drivername: mysql+pymysql | |
username: root | |
password: testpassword | |
host: localhost | |
port: 3306 | |
database: mysql | |
checker: | |
- query: tests/data/query.sql | |
column: | |
column_name: | |
integer: | |
equal_to: 1 | |
less_than: 3 | |
in_range: | |
lower_limit: 0 | |
upper_limit: 2 | |
another_column_name: | |
datasae.integer.Integer: | |
equal_to: 5 | |
less_than: 10 | |
in_range: [4, 6] | |
boolean_column_name: | |
datasae.boolean.Boolean: | |
is_bool: | |
test_postgresql: | |
type: datasae.converter.sql.Sql | |
drivername: postgresql | |
username: postgres | |
password: testpassword | |
host: localhost | |
port: 5432 | |
database: postgres | |
checker: | |
- query: select 1 column_name, 5 another_column_name, false boolean_column_name; | |
column: | |
column_name: | |
integer: | |
equal_to: 1 | |
less_than: 3 | |
in_range: | |
lower_limit: 0 | |
upper_limit: 2 | |
another_column_name: | |
datasae.integer.Integer: | |
equal_to: 5 | |
less_than: 10 | |
in_range: [4, 6] | |
boolean_column_name: | |
boolean: | |
is_bool: | |
test_s3: | |
type: s3 | |
endpoint: play.min.io | |
access_key: Q3AM3UQ867SPQQA43P2F | |
secret_key: zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG | |
bucket_name: bucket_name | |
checker: | |
- object_name: data.csv | |
column: | |
alphabet: | |
string: | |
is_lowercase: | |
regex_contain: '[a-z]' | |
ALPHABET: | |
datasae.string.String: | |
is_uppercase: | |
regex_contain: '[A-Z]' | |
- object_name: data.xlsx | |
bucket_name: another_bucket_name | |
sheet_name: Sheet1 | |
column: | |
alphabet: | |
string: | |
is_lowercase: | |
regex_contain: '[a-z]' | |
ALPHABET: | |
datasae.string.String: | |
is_uppercase: | |
regex_contain: '[A-Z]' |
Note
You can use DataSae Column's Function Based on Data Type for adding column checker function data quality in the config file.
pip install 'DataSae[converter,gsheet,s3,sql]'
datasae --help
Usage: datasae [OPTIONS] FILE_PATH
Checker command.
Creates checker result based on the configuration provided in the checker section of the data source's configuration file.
â•â”€ Arguments ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ * file_path TEXT The source path of the .json or .yaml file [default: None] [required] │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
â•â”€ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --config-name TEXT If the config name is not set, it will create all of the checker results [default: None] │
│ --yaml-display --json-display [default: yaml-display] │
│ --save-to-file-path TEXT [default: None] │
│ --help Show this message and exit. │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
Example commands:
datasae DataSae/tests/data/config.yaml # Check all data qualities on configuration
datasae DataSae/tests/data/config.yaml --config-name test_local # Check data quality by config name
Tip
Actually, we have example for DataSae implementation in Apache Airflow, but for now it is for private use only. Internal developer can see it at this git repository.
Example results:
DataSae/tests/data/checker.json
Lines 1 to 432 in 46ef800
{ | |
"test_gsheet": [ | |
{ | |
"sheet_name": "Sheet1", | |
"column": { | |
"alphabet": { | |
"string": { | |
"is_lowercase": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"regex_contain": { | |
"params": "[a-z]", | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
}, | |
"ALPHABET": { | |
"datasae.string.String": { | |
"is_uppercase": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"regex_contain": { | |
"params": "[A-Z]", | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
} | |
} | |
} | |
], | |
"test_local": [ | |
{ | |
"file_path": "tests/data/data.csv", | |
"sep": ",", | |
"column": { | |
"alphabet": { | |
"string": { | |
"is_lowercase": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"regex_contain": { | |
"params": "[a-z]", | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
}, | |
"ALPHABET": { | |
"datasae.string.String": { | |
"is_uppercase": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"regex_contain": { | |
"params": "[A-Z]", | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
} | |
} | |
}, | |
{ | |
"file_path": "tests/data/data.xlsx", | |
"column": { | |
"alphabet": { | |
"string": { | |
"is_lowercase": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"regex_contain": { | |
"params": "[a-z]", | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
}, | |
"ALPHABET": { | |
"datasae.string.String": { | |
"is_uppercase": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"regex_contain": { | |
"params": "[A-Z]", | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
} | |
} | |
} | |
], | |
"test_mariadb_or_mysql": [ | |
{ | |
"query": "tests/data/query.sql", | |
"column": { | |
"column_name": { | |
"integer": { | |
"equal_to": { | |
"params": 1, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"less_than": { | |
"params": 3, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"in_range": { | |
"params": { | |
"lower_limit": 0, | |
"upper_limit": 2 | |
}, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
}, | |
"another_column_name": { | |
"datasae.integer.Integer": { | |
"equal_to": { | |
"params": 5, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"less_than": { | |
"params": 10, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"in_range": { | |
"params": [ | |
4, | |
6 | |
], | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
}, | |
"boolean_column_name": { | |
"datasae.boolean.Boolean": { | |
"is_bool": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
} | |
} | |
} | |
], | |
"test_postgresql": [ | |
{ | |
"query": "select 1 column_name, 5 another_column_name, false boolean_column_name;", | |
"column": { | |
"column_name": { | |
"integer": { | |
"equal_to": { | |
"params": 1, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"less_than": { | |
"params": 3, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"in_range": { | |
"params": { | |
"lower_limit": 0, | |
"upper_limit": 2 | |
}, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
}, | |
"another_column_name": { | |
"datasae.integer.Integer": { | |
"equal_to": { | |
"params": 5, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"less_than": { | |
"params": 10, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"in_range": { | |
"params": [ | |
4, | |
6 | |
], | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
}, | |
"boolean_column_name": { | |
"boolean": { | |
"is_bool": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 1, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
} | |
} | |
} | |
], | |
"test_s3": [ | |
{ | |
"object_name": "data.csv", | |
"column": { | |
"alphabet": { | |
"string": { | |
"is_lowercase": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"regex_contain": { | |
"params": "[a-z]", | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
}, | |
"ALPHABET": { | |
"datasae.string.String": { | |
"is_uppercase": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"regex_contain": { | |
"params": "[A-Z]", | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
} | |
} | |
}, | |
{ | |
"object_name": "data.xlsx", | |
"bucket_name": "another_bucket_name", | |
"sheet_name": "Sheet1", | |
"column": { | |
"alphabet": { | |
"string": { | |
"is_lowercase": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"regex_contain": { | |
"params": "[a-z]", | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
}, | |
"ALPHABET": { | |
"datasae.string.String": { | |
"is_uppercase": { | |
"params": null, | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
}, | |
"regex_contain": { | |
"params": "[A-Z]", | |
"result": { | |
"score": 1.0, | |
"valid": 26, | |
"invalid": 0, | |
"warning": {} | |
} | |
} | |
} | |
} | |
} | |
} | |
] | |
} |
from datasae.converter import Config
# From JSON
config = Config('DataSae/tests/data/config.json')
# From YAML
config = Config('DataSae/tests/data/config.yaml')
# Check all data qualities on configuration
config.checker # dict result
# Check data quality by config name
config('test_local').checker # list of dict result
config('test_gsheet').checker # list of dict result
config('test_s3').checker # list of dict result
config('test_mariadb_or_mysql').checker # list of dict result
config('test_postgresql').checker # list of dict result
Note
Currently support to convert from CSV, JSON, Parquet, Excel, Google Spreadsheet, and SQL.
pip install 'DataSae[converter]'
from datasae.converter import Config
# From JSON
config = Config('DataSae/tests/data/config.json')
# From YAML
config = Config('DataSae/tests/data/config.yaml')
# Local computer file to DataFrame
local = config('test_local')
df = local('path/file_name.csv', sep=',')
df = local('path/file_name.json')
df = local('path/file_name.parquet')
df = local('path/file_name.xlsx', sheet_name='Sheet1')
df = local('path/file_name.csv') # Default: sep = ','
df = local('path/file_name.json')
df = local('path/file_name.parquet')
df = local('path/file_name.xlsx') # Default: sheet_name = 'Sheet1'
Lines 1 to 12 in 4308324
{ | |
"type": "service_account", | |
"project_id": "project_id", | |
"private_key_id": "private_key_id", | |
"private_key": "-----BEGIN PRIVATE KEY----------END PRIVATE KEY-----\n", | |
"client_email": "[email protected]", | |
"client_id": "client_id", | |
"auth_uri": "https://accounts.google.com/o/oauth2/auth", | |
"token_uri": "https://oauth2.googleapis.com/token", | |
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", | |
"client_x509_cert_url": "client_x509_cert_url" | |
} |
pip install 'DataSae[converter,gsheet]'
from datasae.converter import Config
# From JSON
config = Config('DataSae/tests/data/config.json')
# From YAML
config = Config('DataSae/tests/data/config.yaml')
# Google Spreadsheet to DataFrame
gsheet = config('test_gsheet')
df = gsheet('Sheet1')
df = gsheet('Sheet1', 'gsheet_id')
pip install 'DataSae[converter,s3]'
from datasae.converter import Config
# From JSON
config = Config('DataSae/tests/data/config.json')
# From YAML
config = Config('DataSae/tests/data/config.yaml')
# S3 object to DataFrame
s3 = config('test_s3')
df = s3('path/file_name.csv', sep=',')
df = s3('path/file_name.json')
df = s3('path/file_name.parquet')
df = s3('path/file_name.xlsx', sheet_name='Sheet1')
df = s3('path/file_name.csv', 'bucket_name') # Default: sep = ','
df = s3('path/file_name.json', 'bucket_name')
df = s3('path/file_name.parquet', 'bucket_name')
df = s3('path/file_name.xlsx', 'bucket_name') # Default: sheet_name = 'Sheet1'
pip install 'DataSae[converter,sql]'
Important
For MacOS users, if pip install failed at mysqlclient
, please run this and retry to install again after that.
brew install mysql
from datasae.converter import Config
# From JSON
config = Config('DataSae/tests/data/config.json')
# From YAML
config = Config('DataSae/tests/data/config.yaml')
# MariaDB or MySQL to DataFrame
mariadb_or_mysql = config('test_mariadb_or_mysql')
df = mariadb_or_mysql('select 1 column_name from schema_name.table_name;')
df = mariadb_or_mysql('path/file_name.sql')
from datasae.converter import Config
# From JSON
config = Config('DataSae/tests/data/config.json')
# From YAML
config = Config('DataSae/tests/data/config.yaml')
# PostgreSQL to DataFrame
postgresql = config('test_postgresql')
df = postgresql('select 1 column_name from schema_name.table_name;')
df = postgresql('path/file_name.sql')