Skip to content

Commit

Permalink
Improved date fixing of fixer #70 and added Fixer for #211
Browse files Browse the repository at this point in the history
  • Loading branch information
tholzheim committed Nov 18, 2021
1 parent 96dbbb1 commit ae12536
Show file tree
Hide file tree
Showing 9 changed files with 311 additions and 63 deletions.
83 changes: 83 additions & 0 deletions migration/ormigrate/issue211_year.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
'''
Created on 2021-11-17
@author: th
'''
import re

from ormigrate.issue71_date import DateFixer
from ormigrate.smw.rating import RatingType, EntityRating
from ormigrate.fixer import ORFixer, Entity


class YearFixer(ORFixer):
    '''
    Fixer and rater for the year property of events (see purpose and issue).

    Notes:
        * Open question: which year should an event have if its startDate and
          endDate lie in different years? fix() currently takes the earliest
          year found.
    '''
    purpose="fixer for year property"
    issue="https://github.com/SmartDataAnalytics/OpenResearch/issues/211"

    worksOn = [Entity.EVENT]

    YEAR_PROP="year"
    # raw string: "\d" inside a normal string literal is an invalid escape sequence
    YEAR_REGEX=r"^(19|20)\d{2}$"
    # properties the year may be reconstructed from
    DATE_PROPS=("startDate", "endDate")

    def __init__(self,pageFixerManager):
        '''
        Constructor
        '''
        super(YearFixer, self).__init__(pageFixerManager)

    @classmethod
    def _parsableDates(cls, records) -> list:
        '''
        Collect the start/end dates of the record that parse without errors.

        Args:
            records: record of the entity (accessed via .get)

        Returns:
            list of the date strings returned by DateFixer.parseDate for
            every non-empty date property that was parsed with no errors
        '''
        parsedDates = []
        for prop in cls.DATE_PROPS:
            value = records.get(prop)
            if not value:
                # missing or empty values can not contribute a year
                continue
            date, errors = DateFixer.parseDate(value)
            if len(errors) == 0:
                parsedDates.append(date)
        return parsedDates

    def fix(self, rating:EntityRating):
        """
        Sets the year property if missing or incorrect.

        A set and valid year is left untouched. An invalid year is reset to
        None and, like a missing year, replaced by the earliest year of the
        parsable start/end dates if there is one.

        Args:
            rating: rating of the entity that should be fixed
        Returns:
            Nothing
        """
        records = rating.getRecord()
        year = records.get(self.YEAR_PROP)
        if year:
            if re.match(self.YEAR_REGEX, str(year)):
                # year property is set and valid → end fixing
                return
            # value is invalid → set to none
            records[self.YEAR_PROP] = None
        # year is missing or invalid → try to reconstruct it from start or end date
        parsedDates = self._parsableDates(records)
        if parsedDates:
            # depends on the returned iso format (YYYY-MM-DD) of the date
            earliestYear = min(int(date[:4]) for date in parsedDates)
            records[self.YEAR_PROP] = str(earliestYear) #ToDo: Is year in normalized form int or string?

    def rate(self, rating: EntityRating):
        """
        rates given entity for the quality of the year property

        Args:
            rating: rating of the entity that should be rated
        Returns:
            EntityRating: the given rating with pain, reason and hint set
        """
        records=rating.getRecord()
        year = records.get(self.YEAR_PROP)
        if year:
            # year property is present and set → check if valid
            if re.match(self.YEAR_REGEX, str(year)):
                rating.set(pain=0, reason=RatingType.ok, hint="Year property is set and valid")
            else:
                rating.set(pain=5, reason=RatingType.invalid, hint="Year property is set but invalid")
        elif self._parsableDates(records):
            # year property is missing but start or end date can be parsed
            rating.set(pain=3, reason=RatingType.missing, hint="Year property is missing but can be derived from other properties")
        else:
            rating.set(pain=8, reason=RatingType.missing,hint="Year property is missing")
        return rating
129 changes: 77 additions & 52 deletions migration/ormigrate/issue71_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,85 +24,109 @@ def __init__(self,pageFixerManager):
Constructor
'''
super(DateFixer, self).__init__(pageFixerManager)

@staticmethod
def parseDate(date:str):
    '''
    parses the date in any format to the format YYYY-MM-DD (iso-format see https://www.semantic-mediawiki.org/wiki/Help:Type_Date)

    Args:
        date: Given date in any format

    Returns:
        (date, errors): date is the date as string in YYYY-MM-DD format or
        None if the date cannot be converted. errors is a dict that is empty
        if parsing succeeded without remarks; it contains the key "onlyYear"
        if the input is just a year, and the key <date> with the raised
        exception if the input could not be parsed.
    '''
    errors= {}
    # a bare year is parsable, but month and day are then filled in by the parser
    if isinstance(date, str) and re.match(r"^(19|20)\d{2}$", date.strip()):
        errors["onlyYear"]="Only the year is given as input the parsed date will be the given year and month & day from the time of computation"
    try:
        parseToDatetime = dateutil.parser.parse(date)
    except (ValueError, TypeError, OverflowError) as _e:
        # a tuple is required here: "except ValueError or TypeError" only catches ValueError
        errors[date] = _e
        return None,errors
    datetimeToDate = parseToDatetime.date()
    datetimetoString = datetimeToDate.strftime("%Y-%m-%d")
    return datetimetoString,errors

def fix(self,rating:EntityRating):
    '''
    fix the startDate and endDate of the record of the given rating

    Args:
        rating: rating whose record is handed to fixEventRecord
    '''
    self.fixEventRecord(rating.getRecord(), datelist=["startDate", "endDate"])

def fixEventRecord(self, event, datelist=None, errors=None):
    '''
    fix the date properties of the given event record in place

    Args:
        event: record of the event (dict like)
        datelist: names of the date properties to fix (default: 'Start date' and 'End date')
        errors: dict to collect the errors in (a new dict is used if None)

    Returns:
        (event, errors): the modified event record and the collected errors
    '''
    if datelist is None:
        datelist = ['Start date', 'End date']
    if errors is None:
        errors={}
    for element in datelist:
        eventDate = event.get(element)
        if eventDate is None:
            errors[element+'notFound']= True
            continue
        fixedDate,parseError = self.parseDate(eventDate)
        if len(parseError)==0:
            if fixedDate != eventDate:
                event[element] = fixedDate
        elif "onlyYear" in parseError:
            # eventDate is only a year → move property value to the property year
            # stripped, since the year is detected on the stripped value and
            # surrounding whitespace would make the year value invalid
            event["year"]=str(eventDate).strip()
            event[element] = None
        else:
            errors['fixNotPossible']=True
            event[element]=None
    if self.debug and errors.get('fixNotPossible'): print(self.generateLink(event['pageTitle']))
    return event,errors

@classmethod
def isIso(cls, date:str):
    """
    checks if the given date is in iso format, i.e. whether
    dateutil.parser.isoparse accepts it

    Args:
        date: the date string to check
    Returns:
        bool: True if the date could be parsed as iso date, False otherwise
    """
    # NOTE(review): isoparse presumably also accepts ISO-8601 forms other than
    # yyyy-mm-dd (e.g. with time part) - confirm against the dateutil documentation
    try:
        dateutil.parser.isoparse(date)
    except Exception:
        return False
    return True

@classmethod
def checkDate(cls,dateStr)->(int,str):
    '''
    rate the given date string

    Args:
        dateStr: the date string to check

    Returns:
        (pain, message):
            5 - date is missing (None or blank)
            1 - date is valid and in iso format
            3 - date is valid but not in iso format
            4 - date is only a year
            7 - date can not be parsed
    '''
    if dateStr is None or dateStr.strip()=="":
        return 5,"Date is missing"
    _date,errors = cls.parseDate(dateStr)
    if len(errors)==0:
        # f-strings are required here, otherwise the literal "{dateStr}" is reported
        if cls.isIso(dateStr):
            return 1,f"Date {dateStr} is ok"
        return 3, f"Date {dateStr} is not in iso format but valid date"
    if "onlyYear" in errors:
        return 4, f"Date '{dateStr}' is only a year (property value can be moved over to year property)"
    return 7,f"Date '{dateStr}' can't be parsed: {errors[dateStr]}"

@classmethod
def durationValid(cls, startdate, enddate) -> bool:
    """
    Checks if the given dates form a valid duration for an event (0 - 100 days)

    Args:
        startdate: start date (parsed with dateutil.parser.parse)
        enddate: end date (parsed with dateutil.parser.parse)
    Returns:
        bool: True if the end date lies 0 to 100 days after the start date
    """
    start=dateutil.parser.parse(startdate)
    end=dateutil.parser.parse(enddate)
    days=(end-start).days
    return 0 <= days <= 100

def rate(self, rating: EntityRating):
aRating = self.getRating(rating.getRecord())
rating.set(pain=aRating.pain, reason=aRating.reason, hint=aRating.hint)
Expand All @@ -115,26 +139,27 @@ def getRating(self,eventRecord):
'''
# TODO add checks for invalid dates that are not properly formatted examples
painrating= None
startDate = None
endDate = None
if 'startDate' in eventRecord: startDate = eventRecord['startDate']
if 'endDate' in eventRecord: endDate = eventRecord['endDate']
startDate = eventRecord.get('startDate')
endDate = eventRecord.get('endDate')
painStartDate, messageStartDate = self.checkDate(startDate)
painEndDate, messageEndDate = self.checkDate(endDate)
maxPain=max(painStartDate, painEndDate)
additionalHint=""
if maxPain == 1 or maxPain == 3:
if not self.durationValid(startDate, endDate):
additionalHint+="The duration of the event is invalid! Please check start- and endDate"
maxPain=4
ratingType = RatingType.invalid
else:
ratingType = RatingType.ok

if painStartDate == 1 and painEndDate == 1:
painrating= Rating(1,RatingType.ok,f'Dates, {startDate} , {endDate} valid')
elif painStartDate == 7:
painrating= Rating(7,RatingType.invalid,f"{messageStartDate}")
elif painEndDate == 7:
painrating = Rating(7, RatingType.invalid,f"{messageEndDate}")
elif painStartDate != 1 and painEndDate != 1:
painrating=Rating(5,RatingType.missing,f'Dates not found')
elif painStartDate == 5 and painEndDate == 1:
painrating=Rating(7,RatingType.missing,f'Start Date missing for valid end date {endDate}')
elif painStartDate == 1 and painEndDate == 5:
painrating=Rating(3,RatingType.missing,f'End Date missing for valid start date {startDate}')
return painrating
elif maxPain == 4:
ratingType = RatingType.missing
elif maxPain == 5:
ratingType = RatingType.missing
else:
ratingType = RatingType.invalid
return Rating(maxPain,ratingType,f"{additionalHint} startdate: {messageStartDate} endDate: {messageEndDate}")

if __name__ == "__main__":
PageFixerManager.runCmdLine([DateFixer])
2 changes: 1 addition & 1 deletion migration/ormigrate/smw/pagefixer.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ def __init__(self,pageFixerManager:PageFixerManager,debug=False):
self.pageFixerManager=pageFixerManager
self.wikiFileManager=pageFixerManager.wikiFileManager

def fixEventRecord(self, *args, **kwargs):
    ''' abstract base function to be overwritten by fixing class

    Accepts arbitrary positional and keyword arguments so that overriding
    implementations can be called with their own parameters; the base
    implementation does nothing and returns None.
    '''
    return

Expand Down
2 changes: 1 addition & 1 deletion migration/ormigrate/smw/rating.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def __init__(self,entity:JSONAble, fixer:EntityFixer=None, pageTitle:str=None):
entity(JSONAble): entity to be rated/fixed
fixer(EntityFixer): fixer responsible for rating/fixing the entity
'''
pageTitle=getattr(entity, "pageTitle") if not pageTitle else pageTitle
pageTitle=getattr(entity, "pageTitle", None) if not pageTitle else pageTitle
super().__init__(pageTitle=pageTitle)
self.entity=entity
self.fixer = fixer
Expand Down
2 changes: 1 addition & 1 deletion migration/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ py-3rdparty-mediawiki>=0.4.10
# https://github.com/5j9/wikitextparser
wikitextparser>=0.47.4
# https://pypi.org/project/wikirender/
wikirender>=0.0.29
wikirender>=0.0.33
# https://github.com/somnathrakshit/geograpy3
geograpy3>=0.2.1
# https://github.com/SmartDataAnalytics/OpenResearch
Expand Down
1 change: 0 additions & 1 deletion migration/scripts/fixmyor
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ do
done

scriptbase=$(pwd)
targetWikiTextPath=~/.or/generated/orfixed
if [ ! -d $base ]
then
mkdir -p $base
Expand Down
10 changes: 9 additions & 1 deletion migration/tests/pagefixtoolbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,5 +156,13 @@ def getPageTitleLists(self,*pageTitles):
if self.testAll:
pageLists.append(None)
return pageLists


def getEntityRatingFromDict(self, records:dict):
    """
    returns an EntityRating for the given dict

    Args:
        records: dict the JSONAble entity of the rating is filled from
    Returns:
        EntityRating: rating wrapping the entity created from the dict
    """
    jsonAble = JSONAble()
    jsonAble.fromDict(records)
    return EntityRating(entity=jsonAble)

Loading

0 comments on commit ae12536

Please sign in to comment.