diff --git a/TripitakaPlatform/settings.py b/TripitakaPlatform/settings.py index 4b64dc3..b3e7a94 100644 --- a/TripitakaPlatform/settings.py +++ b/TripitakaPlatform/settings.py @@ -20,7 +20,7 @@ # See https://docs.djangoproject.com/en/2.0/howto/deployment/checklist/ # SECURITY WARNING: keep the secret key used in production secret! -SECRET_KEY = 'hh-65qc%$-_4djzm%y8wp%!&va$nakjf_9d&r7z+v5(#77*u-i' +SECRET_KEY = '2dx3sbj0#=4k$xu=8h52to&a2zia%%lr(w2h4wf$zb(ux6v9az' # SECURITY WARNING: don't run with debug turned on in production! DEBUG = True @@ -155,4 +155,14 @@ DATA_UPLOAD_MAX_NUMBER_FIELDS = 10000 -IMAGE_URL_PREFIX = 'https://s3.cn-north-1.amazonaws.com.cn/lqdzj-image' \ No newline at end of file +IMAGE_URL_PREFIX = 'https://s3.cn-north-1.amazonaws.com.cn/lqdzj-image' + +REST_FRAMEWORK = { + 'DEFAULT_PERMISSION_CLASSES': ( + 'rest_framework.permissions.IsAuthenticated', + ), 'DEFAULT_AUTHENTICATION_CLASSES': ( + 'rest_framework.authentication.SessionAuthentication', + ) +} + +AUTH_USER_MODEL = 'auth.User' \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 178af8f..ea1a199 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ django-background-tasks Jinja2 djangorestframework django-filter +git+git://github.com/sshwsfc/xadmin.git@django2 diff --git a/sutradata/common.py b/sutradata/common.py index ef5788c..9490f79 100644 --- a/sutradata/common.py +++ b/sutradata/common.py @@ -229,7 +229,11 @@ def fetch_cut_file(reel, vol_page): def compute_accurate_cut(reel): sid = reel.sutra.sid - pagetexts = reel.text[2:].split('\np\n') + try: + reel_ocr_text = ReelOCRText.objects.get(reel_id = reel.id) + except: + return None + pagetexts = reel_ocr_text.text[2:].split('\np\n') reel_correct_texts = list(ReelCorrectText.objects.filter(reel=reel).order_by('-id')[0:1]) if not reel_correct_texts: return None diff --git a/sutradata/lib/__init__.py b/sutradata/lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sutradata/lib/fields.py b/sutradata/lib/fields.py new file mode 100644 index 0000000..07b1683 --- /dev/null +++ b/sutradata/lib/fields.py @@ -0,0 +1,88 @@ +import json +import six +import functools + +import django + +from django.core.exceptions import ValidationError +from django.conf import settings +from django.db import models + +try: + from django.utils.encoding import smart_unicode as smart_text + smart_text # placate pyflakes +except ImportError: + from django.utils.encoding import smart_text + +# SubfieldBase causes RemovedInDjango110Warning in 1.8 and 1.9, and +# will not work in 1.10 or later +if django.VERSION[:2] >= (1, 8): + field_metaclass = type +else: + from django.db.models import SubfieldBase + field_metaclass = SubfieldBase + +field_class = functools.partial(six.with_metaclass, field_metaclass) + +from django.contrib.postgres.fields import JSONField as JSONFieldBase + + +class JSONField(JSONFieldBase): + """Simple JSON field that stores python structures as JSON strings + on database. + """ + + def __init__(self, *args, **kwargs): + kwargs.setdefault('default', dict) + super(JSONField, self).__init__(*args, **kwargs) + + def from_db_value(self, value, expression, connection, context): + return self.to_python(value) + + def to_python(self, value): + """ + Convert the input JSON value into python structures, raises + django.core.exceptions.ValidationError if the data can't be converted. + """ + if self.blank and not value: + return {} + value = value or '{}' + if isinstance(value, six.binary_type): + value = six.text_type(value, 'utf-8') + if isinstance(value, six.string_types): + try: + # with django 1.6 i have '"{}"' as default value here + if value[0] == value[-1] == '"': + value = value[1:-1] + + return json.loads(value) + except Exception as err: + raise ValidationError(str(err)) + else: + return value + + def validate(self, value, model_instance): + """Check value is a valid JSON string, raise ValidationError on + error.""" + if isinstance(value, six.string_types): + super(JSONField, self).validate(value, model_instance) + try: + json.loads(value) + except Exception as err: + raise ValidationError(str(err)) + + def get_prep_value(self, value): + """Convert value to JSON string before save""" + try: + return json.dumps(value) + except Exception as err: + raise ValidationError(str(err)) + + def value_to_string(self, obj): + """Return value from object converted to string properly""" + return smart_text(self.value_from_object(obj)) + + def value_from_object(self, obj): + """Return value dumped to string.""" + orig_val = super(JSONField, self).value_from_object(obj) + return self.get_prep_value(orig_val) diff --git a/sutradata/management/commands/init.py b/sutradata/management/commands/init.py index 0f41f72..8787c41 100644 --- a/sutradata/management/commands/init.py +++ b/sutradata/management/commands/init.py @@ -44,12 +44,13 @@ def handle(self, *args, **options): huayan_yb_1 = Reel(sutra=huayan_yb, reel_no=1, start_vol=27, start_vol_page=1, end_vol=27, end_vol_page=23, edition_type=Reel.EDITION_TYPE_CHECKED, path1='27') + huayan_yb_1.save() text = get_reel_text(huayan_yb_1) #filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001.txt' % huayan_yb.sid) #with open(filename, 'r') as f: # huayan_yb_1.text = f.read() - huayan_yb_1.text = text - huayan_yb_1.save() + reel_ocr_text_yb_1 = ReelOCRText(reel=huayan_yb_1, text = text) + reel_ocr_text_yb_1.save() filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001_fixed.txt' % huayan_yb.sid) with open(filename, 'r') as f: @@ -79,8 +80,9 @@ def handle(self, *args, **options): filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001.txt' % huayan_gl.sid) with open(filename, 'r') as f: text = f.read() - huayan_gl_1.text = text huayan_gl_1.save() + reel_ocr_text = ReelOCRText(reel=huayan_gl_1, text = text) + reel_ocr_text.save() reelcorrecttext = ReelCorrectText(reel=huayan_gl_1, text=text) reelcorrecttext.save() @@ -96,11 +98,11 @@ def handle(self, *args, **options): # create Tasks # Correct Task - separators = extract_page_line_separators(huayan_yb_1.text) + separators = extract_page_line_separators(reel_ocr_text_yb_1.text) separators_json = json.dumps(separators, separators=(',', ':')) # 文字校对 - diff_lst, base_text = CompareReel.generate_compare_reel(reelcorrecttext.text, huayan_yb_1.text) + diff_lst, base_text = CompareReel.generate_compare_reel(reelcorrecttext.text, reel_ocr_text_yb_1.text) compare_reel = CompareReel(reel=huayan_yb_1, base_reel=huayan_gl_1, base_text=base_text) compare_reel.save() diff --git a/sutradata/management/commands/initjudge.py b/sutradata/management/commands/initjudge.py index 1c4fe0c..12d975e 100644 --- a/sutradata/management/commands/initjudge.py +++ b/sutradata/management/commands/initjudge.py @@ -37,10 +37,12 @@ def handle(self, *args, **options): huayan_ql_1 = Reel(sutra=huayan_ql, reel_no=1, start_vol=24, start_vol_page=2, end_vol=24, end_vol_page=17, edition_type=Reel.EDITION_TYPE_CHECKED, path1='24') + huayan_ql_1.save() filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001.txt' % huayan_ql.sid) with open(filename, 'r') as f: - huayan_ql_1.text = f.read() - huayan_ql_1.save() + text = f.read() + reel_ocr_text = ReelOCRText(reel=huayan_ql_1, text = text) + reel_ocr_text.save() filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001_fixed.txt' % huayan_ql.sid) with open(filename, 'r') as f: text = f.read() @@ -59,12 +61,15 @@ def handle(self, *args, **options): except: huayan_cb_1 = Reel(sutra=huayan_cb, reel_no=1, start_vol=14, start_vol_page=31, end_vol=14, end_vol_page=37, edition_type=Reel.EDITION_TYPE_BASE) + huayan_cb_1.save() filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001_punct.txt' % huayan_cb.sid) with open(filename, 'r') as f: text = f.read() - punctuation, huayan_cb_1.text = extract_punct(text) - huayan_cb_1.save() - reelcorrecttext = ReelCorrectText(reel=huayan_cb_1, text=huayan_cb_1.text) + punctuation, text = extract_punct(text) + reel_ocr_text = ReelOCRText(reel=huayan_cb_1, text = text) + reel_ocr_text.save() + + reelcorrecttext = ReelCorrectText(reel=huayan_cb_1, text=text) reelcorrecttext.save() punct = Punct(reel=huayan_cb_1, reeltext=reelcorrecttext, punctuation=json.dumps(punctuation)) punct.save() diff --git a/sutradata/models.py b/sutradata/models.py index 85c31d5..6d27537 100644 --- a/sutradata/models.py +++ b/sutradata/models.py @@ -2,35 +2,29 @@ from django.db import models from django.utils import timezone from django.core.exceptions import ValidationError +from .lib.fields import JSONField import json class SutraTextField(models.TextField): description = '存储经文内容,换行用\n,每页前有换页标记p\n' - def __init__(self, *args, **kwargs): - kwargs['blank'] = True - super().__init__(*args, **kwargs) - - def deconstruct(self): - name, path, args, kwargs = super().deconstruct() - del kwargs["blank"] - return name, path, args, kwargs - def get_prep_value(self, value): value = value.replace('\r\n', '\n') value = super().get_prep_value(value) return self.to_python(value) -class TripiMixin(object): - def __str__(self): - return self.name - -class Tripitaka(models.Model, TripiMixin): +class Tripitaka(models.Model): code = models.CharField(verbose_name='实体藏经版本编码', max_length=2, blank=False) name = models.CharField(verbose_name='实体藏经名称', max_length=32, blank=False) shortname = models.CharField(verbose_name='简称(用于校勘记)', max_length=32, blank=False) remark = models.TextField('备注', blank=True, default='') + path1_char = models.CharField('存储层次1字母', max_length=1, blank=True, default='') + path1_name = models.CharField('存储层次1中文名', max_length=16, blank=True, default='') + path2_char = models.CharField('存储层次2字母', max_length=1, blank=True, default='') + path2_name = models.CharField('存储层次2中文名', max_length=16, blank=True, default='') + path3_char = models.CharField('存储层次3字母', max_length=1, blank=True, default='') + path3_name = models.CharField('存储层次3中文名', max_length=16, blank=True, default='') class Meta: verbose_name = '实体藏经' @@ -52,7 +46,7 @@ class Meta: def __str__(self): return '%s: 第%s册' % (self.tripitaka.name, self.vol_no) -class LQSutra(models.Model, TripiMixin): +class LQSutra(models.Model): sid = models.CharField(verbose_name='龙泉经目经号编码', max_length=8) #(为"LQ"+ 经序号 + 别本号) code = models.CharField(verbose_name='龙泉经目编码', max_length=5, blank=False) variant_code = models.CharField(verbose_name='龙泉经目别本编码', max_length=1, default='0') @@ -67,7 +61,7 @@ class Meta: def __str__(self): return '%s: %s' % (self.sid, self.name) -class Sutra(models.Model, TripiMixin): +class Sutra(models.Model): sid = models.CharField(verbose_name='实体藏经|唯一经号编码', editable=True, max_length=8) tripitaka = models.ForeignKey(Tripitaka, on_delete=models.CASCADE) code = models.CharField(verbose_name='实体经目编码', max_length=5, blank=False) @@ -89,7 +83,6 @@ def __str__(self): class LQReel(models.Model): lqsutra = models.ForeignKey(LQSutra, verbose_name='龙泉经目编码', on_delete=models.CASCADE) reel_no = models.SmallIntegerField('卷序号') - text = SutraTextField('经文', default='') remark = models.TextField('备注', blank=True, default='') class Meta: @@ -120,18 +113,11 @@ class Reel(models.Model): path1 = models.CharField('存储层次1', max_length=16, default='') path2 = models.CharField('存储层次2', max_length=16, default='') path3 = models.CharField('存储层次3', max_length=16, default='') - text = SutraTextField('经文', default='') #按实际行加了换行符,换页标记为p\n - fixed = models.BooleanField('是否有调整', default=False) - f_start_page = models.CharField('起始页ID', max_length=18, default='', blank=True, null=True) - f_start_line_no = models.IntegerField('起始页行序号', default=-1) - f_start_char_no = models.IntegerField('起始页的行中字序号', default=-1) - f_end_page = models.CharField('终止页ID', max_length=18, default='', blank=True, null=True) - f_end_line_no = models.IntegerField('终止页行序号', default=-1) - f_end_char_no = models.IntegerField('终止页的行中字序号', default=-1) - f_text = SutraTextField('调整经文', default='', blank=True, null=True) - correct_text = SutraTextField('文字校对后的经文', default='') #按实际行加了换行符,换页标记为p\n edition_type = models.SmallIntegerField('版本类型', choices=EDITION_TYPE_CHOICES, default=0) remark = models.TextField('备注', blank=True, default='') + image_ready = models.BooleanField(verbose_name='图片状态', default=False) + cut_ready = models.BooleanField(verbose_name='切分数据状态', default=False) + column_ready = models.BooleanField(verbose_name='切列图状态', default=False) class Meta: verbose_name = '实体藏经卷' @@ -155,37 +141,71 @@ def url_prefix(self): s = '/%s/%s/%s_%s_' % (tcode, path_str, tcode, filename_str) return s - def image_prefix(self): - tcode = self.sutra.sid[0:2] - path_lst = [] - if self.path1: - path_lst.append(self.path1) - if self.path2: - path_lst.append(self.path2) - if self.path3: - path_lst.append(self.path3) - filename_str = '_'.join(path_lst) - s = '%s_%s_' % (tcode, filename_str) - return s +class ReelOCRText(models.Model): + reel = models.OneToOneField(Reel, verbose_name='实体藏经卷', on_delete=models.CASCADE, primary_key=True) + text = SutraTextField('经文', blank=True, default='') #按实际行加了换行符,换页标记为p\n + fixed = models.BooleanField('是否有调整', default=False) + f_start_page = models.CharField('起始页ID', max_length=18, default='', blank=True, null=True) + f_start_line_no = models.IntegerField('起始页行序号', default=-1) + f_start_char_no = models.IntegerField('起始页的行中字序号', default=-1) + f_end_page = models.CharField('终止页ID', max_length=18, default='', blank=True, null=True) + f_end_line_no = models.IntegerField('终止页行序号', default=-1) + f_end_char_no = models.IntegerField('终止页的行中字序号', default=-1) + f_text = SutraTextField('调整经文', blank=True, default='') + + class Meta: + verbose_name = '实体藏经卷OCR经文' + verbose_name_plural = '实体藏经卷OCR经文' + +class PageStatus: + INITIAL = 0 + RECT_NOTFOUND = 1 + PARSE_FAILED = 2 + RECT_NOTREADY = 3 + CUT_PIC_NOTFOUND = 4 + COL_PIC_NOTFOUND = 5 + COL_POS_NOTFOUND = 6 + RECT_COL_NOTREADY = 7 + RECT_COL_NOTFOUND = 8 + READY = 9 + MARKED = 10 + + CHOICES = ( + (INITIAL, u'初始化'), + (RECT_NOTFOUND, u'切分数据未上传'), + (PARSE_FAILED, u'数据解析失败'), + (RECT_NOTREADY, u'字块数据未展开'), + (CUT_PIC_NOTFOUND, u'图片不存在'), + (COL_PIC_NOTFOUND, u'列图不存在'), + (COL_POS_NOTFOUND, u'列图坐标不存在'), + (RECT_COL_NOTREADY, u'字块对应列图未准备'), + (RECT_COL_NOTFOUND, u'字块对应列图不存在'), + (READY, u'已准备好'), + (MARKED, u'已入卷标记'), + ) class Page(models.Model): - pid = models.CharField('页ID', editable=True, max_length=13, primary_key=True) #sid + 3位卷号 + 2位页序号,页序号从1计数。如:YB00086000101 + pid = models.CharField(verbose_name='实体藏经页级总编码', max_length=21, blank=False, primary_key=True) reel = models.ForeignKey(Reel, verbose_name='实体藏经卷', on_delete=models.CASCADE) reel_page_no = models.SmallIntegerField('卷中页序号') - vol_no = models.SmallIntegerField('册序号') - page_no = models.SmallIntegerField('页序号') - text = SutraTextField('经文') # 文字校对后的经文 + page_no = models.SmallIntegerField('页序号') + bar_no = models.CharField('栏序号', max_length=1, default='0') + status = models.PositiveSmallIntegerField(db_index=True, verbose_name=u'操作类型', + choices=PageStatus.CHOICES, default=PageStatus.INITIAL) + bar_info = JSONField(verbose_name='栏信息', default=dict) + text = SutraTextField('经文', blank=True) # 文字校对后的经文 cut_info = models.TextField('切分信息') cut_updated_at = models.DateTimeField('更新时间', null=True) cut_add_count = models.SmallIntegerField('切分信息增加字数', default=0) cut_wrong_count = models.SmallIntegerField('切分信息识别错的字数', default=0) cut_confirm_count = models.SmallIntegerField('切分信息需要确认的字数', default=0) cut_verify_count = models.SmallIntegerField('切分信息需要确认的字数', default=0) + s3_id = models.CharField(verbose_name='图片路径', max_length=128, default='', blank=False) class Meta: verbose_name = '实体藏经页' verbose_name_plural = '实体藏经页' def __str__(self): - return '%s第%s册第%s页' % (self.reel, self.vol_no, self.page_no) + return '%s第%s页' % (self.reel, self.reel_page_no) diff --git a/tasks/models.py b/tasks/models.py index 8f23ad2..d6d1d56 100644 --- a/tasks/models.py +++ b/tasks/models.py @@ -7,10 +7,6 @@ from difflib import SequenceMatcher import re -class TripiMixin(object): - def __str__(self): - return self.name - class CompareReel(models.Model): reel = models.ForeignKey(Reel, on_delete=models.CASCADE, related_name='compare_set') base_reel = models.ForeignKey(Reel, on_delete=models.CASCADE, verbose_name='底本') @@ -88,7 +84,7 @@ class TaskBase(object): (3, '高'), ) -class BatchTask(models.Model, TripiMixin): +class BatchTask(models.Model): priority = models.SmallIntegerField('优先级', choices=TaskBase.PRIORITY_CHOICES, default=2) # 1,2,3分别表示低,中,高 created_at = models.DateTimeField('创建时间', default=timezone.now) publisher = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.PROTECT, @@ -101,7 +97,7 @@ def batch_no(self): self.created_at.month, self.created_at.day, self.created_at.hour, self.created_at.minute, self.created_at.second, self.created_at.microsecond) -class Task(models.Model, TripiMixin): +class Task(models.Model): TYPE_CORRECT = 1 TYPE_CORRECT_VERIFY = 2 TYPE_JUDGE = 3 @@ -171,7 +167,7 @@ class Task(models.Model, TripiMixin): # 标点相关 reeltext = models.ForeignKey('ReelCorrectText', related_name='punct_tasks', on_delete=models.SET_NULL, blank=True, null=True) - result = SutraTextField('结果') + result = SutraTextField('结果', blank=True) started_at = models.DateTimeField('开始时间', blank=True, null=True) finished_at = models.DateTimeField('完成时间', blank=True, null=True) created_at = models.DateTimeField('创建时间', default=timezone.now) @@ -198,7 +194,7 @@ class CorrectSeg(models.Model): class ReelCorrectText(models.Model): reel = models.ForeignKey(Reel, verbose_name='实体藏经卷', on_delete=models.CASCADE) - text = SutraTextField('经文') # 文字校对或文字校对审定后得到的经文 + text = SutraTextField('经文', blank=True) # 文字校对或文字校对审定后得到的经文 task = models.OneToOneField(Task, verbose_name='发布任务', on_delete=models.SET_NULL, blank=True, null=True, default=None) publisher = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.SET_NULL, null=True, verbose_name='发布用户') created_at = models.DateTimeField('创建时间', default=timezone.now) @@ -209,7 +205,7 @@ class Meta: class LQReelText(models.Model): lqreel = models.ForeignKey(LQReel, verbose_name='龙泉藏经卷', on_delete=models.CASCADE) - text = SutraTextField('经文') # 校勘判取审定后得到的经文 + text = SutraTextField('经文', blank=True) # 校勘判取审定后得到的经文 task = models.OneToOneField(Task, verbose_name='发布任务', on_delete=models.SET_NULL, blank=True, null=True, default=None) publisher = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.SET_NULL, null=True, verbose_name='发布用户') created_at = models.DateTimeField('创建时间', default=timezone.now)