Skip to content

Commit

Permalink
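Use single quotes instead of double quotes around SQL string values, so that the Indonesian language code 'id' is treated as a string literal rather than being resolved to the `id` column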
Browse files Browse the repository at this point in the history
  • Loading branch information
miau1 committed Nov 20, 2023
1 parent bbd92db commit 8eef88c
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 10 deletions.
18 changes: 9 additions & 9 deletions opustools_pkg/opustools/db_operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ class DbOperations:

def __init__(self, db_file=None):
if db_file:
self.db_file=db_file
self.db_file = db_file
else:
self.db_file = os.environ.get('OPUSAPI_DB')
self.db_file = os.environ.get('OPUSAPI_DB')

def clean_up_parameters(self, parameters):
remove = []
Expand All @@ -34,20 +34,20 @@ def run_query(self, sql_command):
def run_default_query(self, parameters, suffix=''):
columns = ['alignment_pairs', 'corpus', 'documents', 'id', 'latest', 'preprocessing', 'size', 'source', 'source_tokens', 'target', 'target_tokens', 'url', 'version']
parameters = self.sort_source_target(parameters)
sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f'{k} = "{v}"' for k, v in parameters.items()]) + suffix
sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()]) + suffix
keys, value_list = self.run_query(sql_command)
ret = [{k: v for k, v in zip(keys,values)} for values in value_list]
if 'preprocessing' not in parameters.keys() and parameters.get('target'):
param_mono_src = parameters.copy()
param_mono_src['target'] = ''
sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f'{k} = "{v}"' for k, v in param_mono_src.items()]) + suffix
sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in param_mono_src.items()]) + suffix
keys, value_list = self.run_query(sql_command)
ret = ret + [{k: v for k, v in zip(keys,values)} for values in value_list]

param_mono_trg = parameters.copy()
param_mono_trg['source'] = parameters['target']
param_mono_trg['target'] = ''
sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f'{k} = "{v}"' for k, v in param_mono_trg.items()]) + suffix
sql_command = f'SELECT {", ".join(columns)} FROM opusfile WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in param_mono_trg.items()]) + suffix
keys, value_list = self.run_query(sql_command)
ret = ret + [{k: v for k, v in zip(keys,values)} for values in value_list]

Expand All @@ -61,7 +61,7 @@ def run_corpora_query(self, parameters):

sql_command = 'SELECT DISTINCT corpus FROM opusfile'
if len(parameters) > 0:
sql_command = sql_command+' WHERE '+' AND '.join([f'{k} = "{v}"' for k, v in parameters.items()])
sql_command = sql_command+' WHERE '+' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()])
_, value_list = self.run_query(sql_command)
values = [v[0] for v in value_list]
return values
Expand All @@ -76,11 +76,11 @@ def run_languages_query(self, parameters):
if len(parameters) > 0:
source = parameters.get('source')
if source:
sql_command = 'SELECT DISTINCT target FROM opusfile where '+' AND '.join([f'{k} = "{v}"' for k, v in parameters.items()]) + f' AND target != "{source}" AND target != "" UNION SELECT DISTINCT source FROM opusfile '
sql_command = 'SELECT DISTINCT target FROM opusfile where '+' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()]) + f" AND target != '{source}' AND target != '' UNION SELECT DISTINCT source FROM opusfile "
parameters['target'] = parameters['source']
del parameters['source']
sql_command = sql_command + 'WHERE '
sql_command = sql_command + ' AND '.join([f'{k} = "{v}"' for k, v in parameters.items()])
sql_command = sql_command + ' AND '.join([f"{k} = '{v}'" for k, v in parameters.items()])
_, value_list = self.run_query(sql_command)
values = [v[0] for v in value_list]
return values
Expand Down Expand Up @@ -119,7 +119,7 @@ def get_corpora(self, parameters):
# Get items where the queried language is on the target side
a_parameters['target'] = parameters['source']
del a_parameters['source']
ret = self.run_default_query(a_parameters, suffix=' AND source != ""') + ret
ret = self.run_default_query(a_parameters, suffix=" AND source != ''") + ret

if preprocessing in ['xml', 'raw', 'parsed']:
# Get sentence files
Expand Down
2 changes: 1 addition & 1 deletion opustools_pkg/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

setuptools.setup(
name="opustools",
version="1.6.0",
version="1.6.1",
author="Mikko Aulamo",
author_email="[email protected]",
description="Tools to read OPUS",
Expand Down
22 changes: 22 additions & 0 deletions opustools_pkg/tests/test_opus_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,28 @@ def test_download_everything_from_a_corpus(self):
sys.stdout = old_stdout
self.assertEqual(len(printout.getvalue().split('\n')), 18)

def test_indonesian(self):
    """Check that the language code 'id' (Indonesian) can be queried.

    'id' is also the name of a column selected from the ``opusfile``
    table, so this exercises the resource, language and corpus listings
    with ``source='id'`` against the local database.

    NOTE(review): the expected counts (6, 101, 24) presumably reflect the
    contents of the local test database -- confirm when the data changes.
    """
    import contextlib

    def captured_lines(**opus_get_args):
        # Capture into a fresh buffer each time so every assertion only
        # sees the output of its own query, and let the context manager
        # restore sys.stdout even if OpusGet raises.
        buffer = io.StringIO()
        with contextlib.redirect_stdout(buffer):
            OpusGet(**opus_get_args).get_files()
        return buffer.getvalue().split('\n')

    # Resource listing for the id-en pair.
    resource_lines = captured_lines(
        directory='bible-uedin', source='id', target='en',
        preprocess='xml', list_resources=True, local_db=True)
    self.assertEqual(len(resource_lines), 6)

    # Languages paired with 'id' in one corpus; the last printed line
    # (index -2, before the trailing empty string) is comma-separated.
    language_lines = captured_lines(
        list_languages=True, directory='bible-uedin', source='id',
        list_resources=True, local_db=True)
    self.assertEqual(len(language_lines[-2].split(',')), 101)

    # Corpora that contain 'id'; again the last printed line holds them.
    corpus_lines = captured_lines(
        list_corpora=True, source='id',
        list_resources=True, local_db=True)
    self.assertEqual(len(corpus_lines[-2].split(',')), 24)

if __name__ == '__main__':
unittest.main()

0 comments on commit 8eef88c

Please sign in to comment.