diff --git a/migration/ormigrate/issue211_year.py b/migration/ormigrate/issue211_year.py new file mode 100644 index 0000000..9592901 --- /dev/null +++ b/migration/ormigrate/issue211_year.py @@ -0,0 +1,83 @@ +''' +Created on 2021-11-17 + +@author: th +''' +import re + +from ormigrate.issue71_date import DateFixer +from ormigrate.smw.rating import RatingType, EntityRating +from ormigrate.fixer import ORFixer, Entity + + +class YearFixer(ORFixer): + ''' + see purpose and issue + Notes: + * Which year does an event have if its startDate and endDate have different years? + ''' + purpose="fixer for year property" + issue="https://github.com/SmartDataAnalytics/OpenResearch/issues/211" + + worksOn = [Entity.EVENT] + + YEAR_PROP="year" + YEAR_REGEX=r"^(19|20)\d{2}$" + + def __init__(self,pageFixerManager): + ''' + Constructor + ''' + super(YearFixer, self).__init__(pageFixerManager) + + + def fix(self, rating:EntityRating): + """ + Sets the year property if missing or incorrect. + Args: + rating: entity rating whose record should be fixed + + Returns: + Nothing + """ + records = rating.getRecord() + if self.YEAR_PROP in records and records.get(self.YEAR_PROP): + # year property is present and set → check if valid + if re.match(self.YEAR_REGEX, str(records.get(self.YEAR_PROP))): + # year property is set and valid → end fixing + return + else: + # value is invalid → set to None + records[self.YEAR_PROP] = None + # year property is missing → check if it could be reconstructed from start or end date + otherYearSources = [record for record in [records.get("startDate", None), records.get("endDate", None)] if record is not None] + possibleYears = [date[:4] for date, errors in [DateFixer.parseDate(d) for d in otherYearSources] if len(errors) == 0] # depends on returned iso format of the date + if any(possibleYears): + records[self.YEAR_PROP] = str(min([int(year) for year in possibleYears])) #ToDo: Is year in normalized form int or string? 
+ + + + def rate(self, rating: EntityRating): + """ + rates the given entity for the quality of the year property + Args: + rating: entity rating whose record should be rated + + Returns: + EntityRating + """ + records=rating.getRecord() + if self.YEAR_PROP in records and records.get(self.YEAR_PROP): + # year property is present and set → check if valid + if re.match(self.YEAR_REGEX, str(records.get(self.YEAR_PROP))): + rating.set(pain=0, reason=RatingType.ok, hint="Year property is set and valid") + else: + rating.set(pain=5, reason=RatingType.invalid, hint="Year property is set but invalid") + else: + # year property is missing → check if it could be reconstructed from start or end date + otherYearSources=[record for record in [records.get("startDate", None), records.get("endDate", None)] if record is not None] + if any([len(errors)==0 for data, errors in [DateFixer.parseDate(d) for d in otherYearSources]]): + rating.set(pain=3, reason=RatingType.missing, hint="Year property is missing but can be derived from other properties") + else: + rating.set(pain=8, reason=RatingType.missing,hint="Year property is missing") + return rating \ No newline at end of file diff --git a/migration/ormigrate/issue71_date.py b/migration/ormigrate/issue71_date.py index bec44cf..72617e7 100644 --- a/migration/ormigrate/issue71_date.py +++ b/migration/ormigrate/issue71_date.py @@ -24,85 +24,109 @@ def __init__(self,pageFixerManager): Constructor ''' super(DateFixer, self).__init__(pageFixerManager) - - def parseDate(self,date): + + @staticmethod + def parseDate(date:str): ''' - parses the date in any format to the format YYYY/MM/DD + parses the date in any format to the format YYYY-MM-DD (iso-format see https://www.semantic-mediawiki.org/wiki/Help:Type_Date) Args: date: Given date in any format Returns: - date(str): Date in YYYY/MM/DD format. None if date cannot be converted + date(str): Date in YYYY-MM-DD format. 
None if date cannot be converted ''' errors= {} + if date: + if re.match(r"^(19|20)\d{2}$", date.strip()): + errors["onlyYear"]="Only the year is given as input; the parsed date will be the given year with month & day taken from the time of computation" try: parseToDatetime = dateutil.parser.parse(date) - except ValueError as _e: + except (ValueError, TypeError) as _e: errors[date] = _e return None,errors datetimeToDate = parseToDatetime.date() - datetimetoString = datetimeToDate.strftime("%Y/%m/%d") + datetimetoString = datetimeToDate.strftime("%Y-%m-%d") return datetimetoString,errors def fix(self,rating:EntityRating): record=rating.getRecord() self.fixEventRecord(record, datelist=["startDate", "endDate"]) - def fixEventRecord(self, event, datelist:list=['Start date' , 'End date'], errors=None): + def fixEventRecord(self, event, datelist=None, errors=None): + if datelist is None: + datelist = ['Start date', 'End date'] if errors is None: errors={} for element in datelist: eventDate = event.get(element) if eventDate is not None: fixedDate,parseError = self.parseDate(eventDate) - if len(parseError) == 0: + if len(parseError)==0: if fixedDate != eventDate: event[element] = fixedDate + elif "onlyYear" in parseError: + # eventDate is only a year → move property value to the property year + event["year"]=eventDate + event[element] = None else: errors['fixNotPossible']=True + event[element]=None else: key= element+'notFound' errors[key]= True if self.debug and errors.get('fixNotPossible'): print(self.generateLink(event['pageTitle'])) return event,errors + @classmethod + def isIso(cls, date:str): + """ + checks if given date is in iso format yyyy-mm-dd + Args: + date: - def getFixedDateWikiFile(self, page, event, datetype='date'): - ''' - fix the date of the given page and event and mark unfixable pages Returns: - Fixed text of the page. 
- ''' - generateLink=False - change=False - dates = re.findall('|.*'+datetype+'.*=.*\n', event) - if len(dates) != 0: - for element in dates: - name,value=self.getNameValue(element) - if name is not None and value is not None: - fixedDate = self.parseDate(value) - if fixedDate is not None: - if fixedDate != value: - change = True - event = event.replace(element,'|'+name+'='+fixedDate) - else: - generateLink=True - if self.debug and generateLink: print(self.generateLink(page)) - if change: - return event - else: - return None + bool + """ + try: + parsedDate=dateutil.parser.isoparse(date) + return True + except Exception as e: + return False @classmethod def checkDate(cls,dateStr)->(int,str): if dateStr is None or dateStr.strip()=="": return 5,"Date is missing" else: - _date,errors = cls.parseDate(cls, dateStr) - if len(errors) == 0: - return 1,"Date is {dateStr} is ok" + _date,errors = cls.parseDate(dateStr) + if len(errors)==0: + if cls.isIso(dateStr): + return 1,f"Date {dateStr} is ok" + else: + return 3, f"Date {dateStr} is not in iso format but valid date" + elif "onlyYear" in errors: + return 4, f"Date '{dateStr}' is only a year (property value can be moved over to the year property)" else: return 7,f"Date '{dateStr}' can't be parsed: {errors[dateStr]}" + @classmethod + def durationValid(cls, startdate, enddate) -> bool: + """ + Checks if the given dates form a valid duration for an event (0 - 100 days) + Args: + startdate: start date + enddate: end date + + Returns: + bool + """ + startdate=dateutil.parser.parse(startdate) + enddate=dateutil.parser.parse(enddate) + duration=enddate-startdate + if duration.days<0 or duration.days>100: + return False + else: + return True + def rate(self, rating: EntityRating): aRating = self.getRating(rating.getRecord()) rating.set(pain=aRating.pain, reason=aRating.reason, hint=aRating.hint) @@ -115,26 +139,27 @@ def getRating(self,eventRecord): ''' # TODO add checks for invalid dates that are not properly formatted examples 
painrating= None - startDate = None - endDate = None - if 'startDate' in eventRecord: startDate = eventRecord['startDate'] - if 'endDate' in eventRecord: endDate = eventRecord['endDate'] + startDate = eventRecord.get('startDate') + endDate = eventRecord.get('endDate') painStartDate, messageStartDate = self.checkDate(startDate) painEndDate, messageEndDate = self.checkDate(endDate) + maxPain=max(painStartDate, painEndDate) + additionalHint="" + if maxPain == 1 or maxPain == 3: + if not self.durationValid(startDate, endDate): + additionalHint+="The duration of the event is invalid! Please check start- and endDate" + maxPain=4 + ratingType = RatingType.invalid + else: + ratingType = RatingType.ok - if painStartDate == 1 and painEndDate == 1: - painrating= Rating(1,RatingType.ok,f'Dates, {startDate} , {endDate} valid') - elif painStartDate == 7: - painrating= Rating(7,RatingType.invalid,f"{messageStartDate}") - elif painEndDate == 7: - painrating = Rating(7, RatingType.invalid,f"{messageEndDate}") - elif painStartDate != 1 and painEndDate != 1: - painrating=Rating(5,RatingType.missing,f'Dates not found') - elif painStartDate == 5 and painEndDate == 1: - painrating=Rating(7,RatingType.missing,f'Start Date missing for valid end date {endDate}') - elif painStartDate == 1 and painEndDate == 5: - painrating=Rating(3,RatingType.missing,f'End Date missing for valid start date {startDate}') - return painrating + elif maxPain == 4: + ratingType = RatingType.missing + elif maxPain == 5: + ratingType = RatingType.missing + else: + ratingType = RatingType.invalid + return Rating(maxPain,ratingType,f"{additionalHint} startdate: {messageStartDate} endDate: {messageEndDate}") if __name__ == "__main__": PageFixerManager.runCmdLine([DateFixer]) diff --git a/migration/ormigrate/smw/pagefixer.py b/migration/ormigrate/smw/pagefixer.py index db81e04..7c8ed2e 100644 --- a/migration/ormigrate/smw/pagefixer.py +++ b/migration/ormigrate/smw/pagefixer.py @@ -490,7 +490,7 @@ def 
__init__(self,pageFixerManager:PageFixerManager,debug=False): self.pageFixerManager=pageFixerManager self.wikiFileManager=pageFixerManager.wikiFileManager - def fixEventRecord(self): + def fixEventRecord(self, **kwargs): ''' abstract base function to be overwritten by fixing class''' return diff --git a/migration/ormigrate/smw/rating.py b/migration/ormigrate/smw/rating.py index 1565933..37a3c47 100644 --- a/migration/ormigrate/smw/rating.py +++ b/migration/ormigrate/smw/rating.py @@ -65,7 +65,7 @@ def __init__(self,entity:JSONAble, fixer:EntityFixer=None, pageTitle:str=None): entity(JSONAble): entity to be rated/fixed fixer(EntityFixer): fixer responsible for rating/fixing the entity ''' - pageTitle=getattr(entity, "pageTitle") if not pageTitle else pageTitle + pageTitle=getattr(entity, "pageTitle", None) if not pageTitle else pageTitle super().__init__(pageTitle=pageTitle) self.entity=entity self.fixer = fixer diff --git a/migration/requirements.txt b/migration/requirements.txt index 886188b..c3c1722 100644 --- a/migration/requirements.txt +++ b/migration/requirements.txt @@ -7,7 +7,7 @@ py-3rdparty-mediawiki>=0.4.10 # https://github.com/5j9/wikitextparser wikitextparser>=0.47.4 # https://pypi.org/project/wikirender/ -wikirender>=0.0.29 +wikirender>=0.0.33 # https://github.com/somnathrakshit/geograpy3 geograpy3>=0.2.1 # https://github.com/SmartDataAnalytics/OpenResearch diff --git a/migration/scripts/fixmyor b/migration/scripts/fixmyor index c9032bf..2fe43d1 100644 --- a/migration/scripts/fixmyor +++ b/migration/scripts/fixmyor @@ -128,7 +128,6 @@ do done scriptbase=$(pwd) -targetWikiTextPath=~/.or/generated/orfixed if [ ! 
-d $base ] then mkdir -p $base diff --git a/migration/tests/pagefixtoolbox.py b/migration/tests/pagefixtoolbox.py index b18e647..7cffe2f 100644 --- a/migration/tests/pagefixtoolbox.py +++ b/migration/tests/pagefixtoolbox.py @@ -156,5 +156,13 @@ def getPageTitleLists(self,*pageTitles): if self.testAll: pageLists.append(None) return pageLists - + + def getEntityRatingFromDict(self, records:dict): + """ + returns an EntityRating wrapping a JSONAble entity populated from the given dict + """ + entity = JSONAble() + entity.fromDict(records) + rating = EntityRating(entity=entity) + return rating \ No newline at end of file diff --git a/migration/tests/testIssue71_InvalidDates.py b/migration/tests/testIssue71_InvalidDates.py index f6da639..3bab6b3 100644 --- a/migration/tests/testIssue71_InvalidDates.py +++ b/migration/tests/testIssue71_InvalidDates.py @@ -23,7 +23,15 @@ def testDateParser(self): dateFixer=self.getPageFixer() sampledates=['2020-02-20','2020/02/20','2020.02.20','20/02/2020','02/20/2020','20.02.2020','02.20.2020','20 Feb, 2020','2020, Feb 20','2020 20 Feb','2020 Feb 20'] for date in sampledates: - self.assertEqual('2020/02/20',dateFixer.parseDate(date)[0]) + self.assertEqual('2020-02-20',dateFixer.parseDate(date)[0]) + + def testDateParserForYear(self): + ''' + tests the behavior of the date parser when only a year is given as input + ''' + dateFixer = self.getPageFixer() + parsedDate, errors = dateFixer.parseDate("2020") + self.assertTrue("onlyYear" in errors) def testIssue71Examples(self): ''' @@ -36,11 +44,12 @@ def testIssue71Examples(self): {'startDate': '20 Feb, 2020', 'endDate': None}, {'startDate': None, 'endDate': '20 Feb, 2020'}, {'startDate': '2010/03/22', 'endDate':'2011/03/226'}, + {'startDate': '2011-03-22', 'endDate':'2011-03-26'}, ] - expectedPainRatings=[1, 5, 3, 7,7] - expectedStartDates=['2020/02/20', None, '2020/02/20', None,'2010/03/22'] - expectedEndDates=['2020/02/20', None, None, '2020/02/20','2011/03/226'] - expectedErrors=[0, 2, 1, 1, 1] + expectedPainRatings=[3, 5, 5, 5,7,1] 
+ expectedStartDates=['2020-02-20', None, '2020-02-20', None,'2010-03-22','2011-03-22'] + expectedEndDates=['2020-02-20', None, None, '2020-02-20',None,'2011-03-26'] + expectedErrors=[0, 2, 1, 1, 1,0] painRatings=[] errors=[] fixedStartDates=[] @@ -68,14 +77,69 @@ def testIssue71Rating(self): for pageTitleList in pageTitleLists: counters=self.getRatingCounters(pageTitleList) painCounter=counters["pain"] + print(painCounter) if pageTitleList is None: - self.assertGreater(painCounter[self.pageFixerClass.__name__][3],100) + self.assertGreater(painCounter[self.pageFixerClass.__name__][1],500) + self.assertGreater(painCounter[self.pageFixerClass.__name__][3], 7000) + self.assertGreater(painCounter[self.pageFixerClass.__name__][4], 200) self.assertGreater(painCounter[self.pageFixerClass.__name__][5],500) self.assertGreater(painCounter[self.pageFixerClass.__name__][7],100) else: self.assertEqual(3,painCounter[self.pageFixerClass.__name__][7]) + def testPropertyShift(self): + ''' + tests that a date value consisting only of a year is moved to the year property + ''' + dateFixer = self.getPageFixer() + eventRecords = [ + { + "expected": {"year": "2020", "startDate":None}, + "raw": {"startDate": "2020"} + }, + { + "expected": {"year": "2020", "endDate":None}, + "raw": {"endDate": "2020"} + }, + { + "expected": {"year": "2020", "endDate": None, "startDate":"2020-01-05"}, + "raw": {"endDate": "2020", "startDate":"2020/01/05"} + } + ] + for record in eventRecords: + entityRating=self.getEntityRatingFromDict(record["raw"]) + dateFixer.fix(entityRating) + self.assertDictEqual(record["expected"], entityRating.getRecord()) + + def testDurationVerification(self): + """ + tests that the pain rating reflects whether the event duration is valid + """ + dateFixer = self.getPageFixer() + eventRecords = [ + { + "expected": {"startDate": "2020-02-01", "endDate": "2020-01-05"}, + "raw": {"startDate": "2020-02-01", "endDate": "2020-01-05"}, + "rating":4 + }, + { + "expected": {"startDate": "2020-02-01", "endDate": 
"2020-02-05"}, + "raw": {"startDate": "2020-02-01", "endDate": "2020-02-05"}, + "rating":1 + }, + { + "expected": {"startDate": "2020-02-01"}, + "raw": {"startDate": "2020-02-01"}, + "rating":5 + } + ] + for record in eventRecords: + entityRating = self.getEntityRatingFromDict(record["raw"]) + dateFixer.rate(entityRating) + self.assertEqual(record["rating"], entityRating.pain) + + if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.testName'] unittest.main() \ No newline at end of file diff --git a/migration/tests/test_issue211_year.py b/migration/tests/test_issue211_year.py new file mode 100644 index 0000000..9a1b610 --- /dev/null +++ b/migration/tests/test_issue211_year.py @@ -0,0 +1,69 @@ +''' +Created on 2021-11-17 + +@author: th +''' +from ormigrate.issue211_year import YearFixer +from tests.pagefixtoolbox import PageFixerTest + +class TestYearFixer(PageFixerTest): + ''' + tests fixing and rating of issue #211 (year property) + https://github.com/SmartDataAnalytics/OpenResearch/issues/211 + ''' + + def setUp(self, **kwargs): + PageFixerTest.setUp(self) + self.pageFixerClass=YearFixer + self.eventRecords=[ + { + "expected":{"year":"2020"}, + "raw":{"year":"2020"}, + "rating":0 + }, + { + "expected": {"year": 2020}, + "raw": {"year": 2020}, + "rating":0 + }, + { + "expected": {"year": None}, + "raw": {"year": "1404"}, # unrealistic year for a conference in or + "rating":5 + }, + { + "expected": {"year": "2020", "startDate":"2020-01-01"}, + "raw": {"year": None, "startDate":"2020-01-01"}, + "rating": 3 + }, + { + "expected": {"year": None}, + "raw": {"year": None}, + "rating": 8 + }, + { + "expected": {"year": "2020", "endDate": "2020-01-01"}, + "raw": {"year": None, "endDate": "2020-01-01"}, + "rating": 3 + } + ] + + def test_fix(self): + """ + tests fixing the year property + """ + fixer = self.getPageFixer() + for record in self.eventRecords: + entity = self.getEntityRatingFromDict(record["raw"]) + fixer.fix(entity) + 
self.assertDictEqual(record["expected"], entity.getRecord()) + + def test_rate(self): + """ + tests rating the year property + """ + fixer=self.getPageFixer() + for record in self.eventRecords: + entity=self.getEntityRatingFromDict(record["raw"]) + rating=fixer.rate(entity) + self.assertEqual(record["rating"], rating.pain)