Skip to content

Commit

Permalink
Merge pull request #23 from auburnsummer/22-add-source_metadata-field
Browse files Browse the repository at this point in the history
add source_metadata field
  • Loading branch information
auburnsummer authored Oct 26, 2022
2 parents f22e8d2 + 3c617c5 commit 9873522
Show file tree
Hide file tree
Showing 9 changed files with 126 additions and 25 deletions.
2 changes: 2 additions & 0 deletions orchard/db/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class Level(Model):
song_ct = MyJsonField()
source = TextField()
source_iid = TextField()
source_metadata = MyJsonField(null=True)
tags = MyJsonField()
thumb = TextField(null=True)
two_player = BooleanField()
Expand Down Expand Up @@ -92,6 +93,7 @@ class Combined(Model):
song_ct = MyJsonField()
source = TextField()
source_iid = TextField()
source_metadata = MyJsonField(null=True)
tags = MyJsonField()
thumb = TextField(null=True)
two_player = BooleanField()
Expand Down
40 changes: 31 additions & 9 deletions orchard/package/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,40 @@ def datetime_to_epoch(s):
if isinstance(s, datetime):
return floor(s.timestamp())
return None
# dict of keys to functions that transform the value.
transformers = {
"last_updated": datetime_to_epoch,
"indexed": datetime_to_epoch,
"song_ct": json.dumps,
"description_ct": json.dumps

def default_step(col_name, col_value, final_dict):
    """Fallback step: copy the column into the output dict unchanged."""
    final_dict.update({col_name: col_value})
    return final_dict

def with_func(func):
    """Build a step that stores func(col_value) under the column name."""
    def step(col_name, col_value, final_dict):
        transformed = func(col_value)
        final_dict[col_name] = transformed
        return final_dict
    return step

def source_metadata_step(col_name, col_value, final_dict):
    """Flatten discord source metadata into discord_metadata__* keys.

    A None value (source provided no metadata) leaves the output untouched.
    """
    if col_value is not None:
        final_dict["discord_metadata__user_id"] = col_value["user_id"]
        final_dict["discord_metadata__timestamp"] = col_value["timestamp"]
    return final_dict

# Maps column names to step functions (col_name, col_value, final_dict) -> final_dict.
# Columns not listed here fall through to default_step, which copies the value as-is;
# the final object is then JSON-serialised into the jsonl output.
steps = {
"last_updated": with_func(datetime_to_epoch),  # datetime -> unix epoch seconds
"indexed": with_func(datetime_to_epoch),  # datetime -> unix epoch seconds
"song_ct": with_func(json.dumps),  # nested object stored as a JSON string
"description_ct": with_func(json.dumps),  # nested object stored as a JSON string
"source_metadata": source_metadata_step  # flattened into discord_metadata__* keys
}

final_dict = {}
for col_name, col_value in combined_dict.items():
if col_name in transformers.keys():
combined_dict[col_name] = transformers[col_name](col_value)
step = steps[col_name] if col_name in steps else default_step
final_dict = step(col_name, col_value, final_dict)

return json.dumps(combined_dict, ensure_ascii=False)
return json.dumps(final_dict, ensure_ascii=False)


def package():
Expand Down
14 changes: 14 additions & 0 deletions orchard/package/schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,20 @@
"name": "sha1",
"type": "string",
"optional": true
},
{
"facet": false,
"index": true,
"name": "discord_metadata__user_id",
"type": "string",
"optional": true
},
{
"facet": false,
"index": true,
"name": "discord_metadata__timestamp",
"type": "string",
"optional": true
}
]
}
1 change: 1 addition & 0 deletions orchard/scan/scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ async def main(db: SqliteExtDatabase, sources):
"icon": f"{CODEX}/{icon_url}",
"source": source_id,
"source_iid": iid,
"source_metadata": await scraper.get_metadata(iid)
}

if vit["icon"]:
Expand Down
5 changes: 5 additions & 0 deletions orchard/scan/sources.test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- - rdl
- !!python/name:orchard.scan.sources.discord.DiscordScraper
- bot_token: !!python/object/apply:orchard.utils.env.const_from_env ["BOT_TOKEN"]
channel_id: 1034453263031992481
after: 0
41 changes: 30 additions & 11 deletions orchard/scan/sources/discord.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ def __init__(self, bot_token, channel_id, after):
self.bot_token = bot_token
self.channel_id = channel_id
self.after = after
self.iid_url_map = {}
self.iid_cache = {} # cache of iids to discord Message objects.
# self.iid_url_map = {}
# get our id.
resp = httpx.get(f"{DISCORD_API_URL}/users/@me", headers={
"user-agent": USER_AGENT,
"Authorization": f"Bot {self.bot_token}"
Expand All @@ -39,22 +41,39 @@ async def download_iid(self, iid):
url = await self.get_url(iid)
return httpx.get(url).content

async def get_message(self, iid):
    """Get the discord Message object relating to an iid. Has an internal cache.

    On a cache miss, fetches the message from the Discord API and caches it
    before returning. (Reconstructed: the diff render had interleaved lines
    from the superseded get_url implementation.)
    """
    if iid in self.iid_cache:
        return self.iid_cache[iid]
    else:
        # we need to do an API request to get the message.
        message_id, _ = get_iid_info(iid)
        async with Client() as client:
            headers = {
                "user-agent": USER_AGENT,
                "Authorization": f"Bot {self.bot_token}"
            }
            resp = await client.get(f"{DISCORD_API_URL}/channels/{self.channel_id}/messages/{message_id}",
                                    headers=headers)
            message = resp.json()
            # put it in the cache before we keep going.
            self.iid_cache[iid] = message
            return message

async def get_url(self, iid):
    """Resolve an iid to the download URL of its attachment."""
    _, wanted_attachment_id = get_iid_info(iid)
    message = await self.get_message(iid)

    # pick out the attachment whose id matches the iid's attachment part.
    matches = (a for a in message['attachments'] if int(a['id']) == wanted_attachment_id)
    return next(matches)['url']

async def get_metadata(self, iid):
    """Return discord-specific metadata (author id and post timestamp) for an iid."""
    message = await self.get_message(iid)
    author_id = message['author']['id']
    posted_at = message['timestamp']
    return {"user_id": author_id, "timestamp": posted_at}

async def get_iids(self):
iids = []
Expand Down Expand Up @@ -110,8 +129,8 @@ async def get_iids(self):
# note2: attachment id is required because it's possible to delete an attachment
# w/o deleting the post.
iid = f"{post['id']}|{attachment['id']}"
# the iid doesn't encode the actual download url.
self.iid_url_map[iid] = attachment['url']
# cache of message objects for later use, if needed.
self.iid_cache[iid] = post
iids.append(iid)

print('', end='') # <-- for a breakpoint
Expand Down
5 changes: 5 additions & 0 deletions orchard/scan/sources/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ async def download_iid(self, iid):
async def get_url(self, iid):
    """Return a direct download URL for the given iid. Interface stub; subclasses implement."""
    pass

# if any external metadata is available for this iid, return that, or None otherwise.
# note that the output of this should be JSON serializable.
async def get_metadata(self, iid):
    """Return JSON-serializable external metadata for this iid, or None when the source has none."""
    pass

# optional callback function that gets called whenever orchard indexes a level from this source.
# Note that on_index receives a full level object
async def on_index(self, level):
Expand Down
13 changes: 11 additions & 2 deletions orchard/scan/test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ def fake_vitals(f):
@pytest.fixture
def make_mock_scraper():
class MockLevelScraper(RDLevelScraper):
# NOTE(review): the diff render left the superseded one-argument signature line
# next to the new one; only the two-argument version is kept.
def __init__(self, iids, enable_metadata):
    """Store the canned iids and whether get_metadata should return data."""
    self.iids = iids
    self.enable_metadata = enable_metadata

async def get_iids(self):
    """Return the canned iid list supplied at construction."""
    canned = self.iids
    return canned
Expand All @@ -58,7 +59,15 @@ async def download_iid(self, iid):
return iid.encode('utf-8')

# NOTE(review): the diff render duplicated the return line (whitespace-only change);
# collapsed back to a single statement.
async def get_url(self, iid):
    """Return a deterministic fake URL for an iid."""
    return f"this.is.a.mock.url.com/{iid}"

async def get_metadata(self, iid):
    """Fake metadata derived from the iid, or None when metadata is disabled."""
    if not self.enable_metadata:
        return None
    return {
        "first_letter": iid[0],
        "last_letter": iid[-1]
    }

return MockLevelScraper

Expand Down
30 changes: 27 additions & 3 deletions orchard/scan/test/test_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ async def test_scan(empty_db, make_mock_scraper, caplog, patch_vitals):
'mock',
make_mock_scraper,
{
    # NOTE(review): diff render duplicated the old "iids" line; new version kept.
    "iids": ["a", "b", "c", "d", "e"],
    "enable_metadata": False
}
]
]
Expand All @@ -30,7 +31,8 @@ async def test_scan_removes_an_iid_if_the_source_no_longer_provides_it(empty_db,
'mock',
make_mock_scraper,
{
    # NOTE(review): diff render duplicated the old "iids" line; new version kept.
    "iids": ["a", "b", "c", "d", "e"],
    "enable_metadata": False
}
]
]
Expand All @@ -44,10 +46,32 @@ async def test_scan_removes_an_iid_if_the_source_no_longer_provides_it(empty_db,
'mock',
make_mock_scraper,
{
    # NOTE(review): diff render duplicated the old "iids" line; new version kept.
    "iids": ["a", "b", "c", "d"],  # no e!
    "enable_metadata": False
}
]
]
await main(empty_db, sources)
ids = set(level.id for level in Level.select())
assert "unittest_e" not in ids


@pytest.mark.asyncio
async def test_scan_inserts_source_metadata_if_given(empty_db, make_mock_scraper, caplog, patch_vitals):
    """A scraper that reports metadata should have it persisted on the Level row."""
    caplog.set_level(logging.INFO)
    scraper_kwargs = {
        "iids": ["az", "by", "cx", "dw", "ev"],
        "enable_metadata": True
    }
    await main(empty_db, [["mock", make_mock_scraper, scraper_kwargs]])
    stored = Level.get_by_id("unittest_az")
    expected = {"first_letter": "a", "last_letter": "z"}
    assert stored.source_metadata == expected

0 comments on commit 9873522

Please sign in to comment.