Skip to content

Commit

Permalink
add stats reports to tenant (#152)
Browse files Browse the repository at this point in the history
* add stats reports to tenant

* Update contanerize.yaml

* add reports

* update graph summary endpoint

* add csv

* Update tenants.py

* Update tenants.py

* Update tenants.py

* Update tenants.py

* update yamls

* update url

* add csv config
  • Loading branch information
ylyangtw authored Oct 11, 2024
1 parent 2248232 commit 2cddc33
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 4 deletions.
1 change: 1 addition & 0 deletions .github/workflows/contanerize.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ on:
- dev_eco
- v0_generated_code
- 133_dev_sitemaps
- 151-integrate-community-stats-codes
tags:
- "v*.*.*"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ services:
- GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml}
- GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml}
- GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH}
- GLEANERIO_CSV_CONFIG_URL=${GLEANERIO_CSV_CONFIG_URL:-https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv}
- ECRR_MINIO_BUCKET=${ECRR_MINIO_BUCKET}
- ECRR_GRAPH_NAMESPACE=${ECRR_GRAPH_NAMESPACE}
- PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
Expand Down
1 change: 1 addition & 0 deletions dagster/implnets/deployment/compose_project.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ services:
- GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml}
- GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml}
- GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH}
- GLEANERIO_CSV_CONFIG_URL=${GLEANERIO_CSV_CONFIG_URL:-https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv}
- PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
- SLACK_CHANNEL=${SLACK_CHANNEL:-"#twitterfeed"}
- SLACK_TOKEN=${SLACK_TOKEN}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ services:
- GLEANERIO_SOURCES_FILENAME=${GLEANERIO_SOURCES_FILENAME:-gleanerconfig.yaml}
- GLEANERIO_TENANT_FILENAME=${GLEANERIO_TENANT_FILENAME:-tenant.yaml}
- GLEANERIO_WORKSPACE_CONFIG_PATH=${GLEANERIO_WORKSPACE_CONFIG_PATH}
- GLEANERIO_CSV_CONFIG_URL=${GLEANERIO_CSV_CONFIG_URL:-https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv}
- ECRR_MINIO_BUCKET=${ECRR_MINIO_BUCKET}
- ECRR_GRAPH_NAMESPACE=${ECRR_GRAPH_NAMESPACE}
- PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
Expand Down
3 changes: 3 additions & 0 deletions dagster/implnets/deployment/envFile.env
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,6 @@ ECRR_MINIO_BUCKET=ecrr
SLACK_CHANNEL="#production_discussion"
#SLACK_CHANNEL="#twitterfeed"
SLACK_TOKEN=

GLEANERIO_CSV_CONFIG_URL=https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1340502269&single=true&output=csv

26 changes: 22 additions & 4 deletions dagster/implnets/workflows/tasks/tasks/assets/tenants.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,34 @@
from ec.datastore import s3
from distutils import util
from ..resources.gleanerS3 import _pythonMinioAddress
from ec.reporting.report import generateReportStats

# Runtime configuration, read once from environment variables when this module
# is imported. os.environ.get returns None for any variable that is not set,
# so every value below except GLEANER_MINIO_USE_SSL may be None.
GLEANER_MINIO_ADDRESS = os.environ.get('GLEANERIO_MINIO_ADDRESS')
GLEANER_MINIO_PORT = os.environ.get('GLEANERIO_MINIO_PORT')
# Falls back to the string 'true' when unset, then converted to a real bool.
# NOTE(review): strtobool comes from distutils, which was removed from the
# standard library in Python 3.12 (PEP 632) — confirm the target interpreter
# or replace the parser.
GLEANER_MINIO_USE_SSL = bool(util.strtobool(os.environ.get('GLEANERIO_MINIO_USE_SSL', 'true')))
GLEANER_MINIO_SECRET_KEY = os.environ.get('GLEANERIO_MINIO_SECRET_KEY')
GLEANER_MINIO_ACCESS_KEY = os.environ.get('GLEANERIO_MINIO_ACCESS_KEY')
GLEANER_MINIO_BUCKET = os.environ.get('GLEANERIO_MINIO_BUCKET')
# Base URL and namespace name used by _graphSummaryEndpoint to build the
# summary SPARQL endpoint.
GLEANERIO_GRAPH_URL = os.environ.get('GLEANERIO_GRAPH_URL')
GLEANERIO_GRAPH_SUMMARY_NAMESPACE = os.environ.get('GLEANERIO_GRAPH_SUMMARY_NAMESPACE')
# URL handed to generateReportStats when the per-community report is built.
GLEANERIO_CSV_CONFIG_URL = os.environ.get('GLEANERIO_CSV_CONFIG_URL')

# TLS flag and credentials bundled into a single dict; the consumer of this
# dict is not visible in this part of the file.
MINIO_OPTIONS={"secure":GLEANER_MINIO_USE_SSL

,"access_key": GLEANER_MINIO_ACCESS_KEY
,"secret_key": GLEANER_MINIO_SECRET_KEY
}

def _graphSummaryEndpoint(community):
    """Return the SPARQL endpoint URL of the summary namespace for *community*.

    The special community code "all" selects the namespace named by
    GLEANERIO_GRAPH_SUMMARY_NAMESPACE; any other code selects the
    "<community>_summary" namespace. Both are resolved against
    GLEANERIO_GRAPH_URL.
    """
    namespace = (
        GLEANERIO_GRAPH_SUMMARY_NAMESPACE
        if community == "all"
        else f"{community}_summary"
    )
    return f"{GLEANERIO_GRAPH_URL}/namespace/{namespace}/sparql"
@asset(group_name="community",key_prefix="task",
required_resource_keys={"triplestore"})
def task_tenant_sources(context) ->Any:
s3_resource = context.resources.triplestore.s3

t=s3_resource.getTennatInfo()
tenants = t['tenant']
listTenants = map (lambda a: {a['community']}, tenants)
Expand Down Expand Up @@ -144,11 +154,9 @@ def loadstatsCommunity(context, task_tenant_sources) -> str:
ts = task_tenant_sources
t =list(filter ( lambda a: a['community']== community_code, ts["tenant"] ))
s = t[0]["sources"]
for source in s:

for source in s:
dirs = s3Minio.listPath(GLEANER_MINIO_BUCKET,path=f"{REPORT_PATH}{source}/",recursive=False )


for d in dirs:
latestpath = f"{REPORT_PATH}{source}/latest/"
if (d.object_name.casefold() == latestpath.casefold()) or (d.is_dir == False):
Expand Down Expand Up @@ -210,4 +218,14 @@ def loadstatsCommunity(context, task_tenant_sources) -> str:
# s3.upload_fileobj(f, s3.GLEANERIO_MINIO_BUCKET, f"data/all/all_stats.csv")
context.log.info(f"all_stats.csv uploaded using ec.datastore.putReportFile {s3_config.GLEANERIO_MINIO_BUCKET}tenant/{community_code} ")
#return df_csv # now checking return types

context.log.info(f"GLEANERIO_CSV_CONFIG_URL {GLEANERIO_CSV_CONFIG_URL} ")

report = generateReportStats(GLEANERIO_CSV_CONFIG_URL, s3_config.GLEANERIO_MINIO_BUCKET, s3Minio,
_graphSummaryEndpoint(community_code), community_code)
bucket, object = s3Minio.putReportFile(s3_config.GLEANERIO_MINIO_BUCKET, f"tenant/{community_code}",
f"report_stats.json", report)
context.log.info(
f"report_stats.json uploaded using ec.datastore.putReportFile {s3_config.GLEANERIO_MINIO_BUCKET}tenant/{community_code} ")

return df_csv

0 comments on commit 2cddc33

Please sign in to comment.