Skip to content

Commit

Permalink
Merge pull request #26 from CoinLQ/bu
Browse files Browse the repository at this point in the history
Bu
  • Loading branch information
dang-xia authored Feb 3, 2018
2 parents 79e852b + f4cbdd7 commit 9afdf12
Show file tree
Hide file tree
Showing 9 changed files with 192 additions and 66 deletions.
14 changes: 12 additions & 2 deletions TripitakaPlatform/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
# See https://docs.djangoproject.com/en/2.0/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = 'hh-65qc%$-_4djzm%y8wp%!&va$nakjf_9d&r7z+v5(#77*u-i'
SECRET_KEY = '2dx3sbj0#=4k$xu=8h52to&a2zia%%lr(w2h4wf$zb(ux6v9az'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
Expand Down Expand Up @@ -155,4 +155,14 @@

DATA_UPLOAD_MAX_NUMBER_FIELDS = 10000

IMAGE_URL_PREFIX = 'https://s3.cn-north-1.amazonaws.com.cn/lqdzj-image'
IMAGE_URL_PREFIX = 'https://s3.cn-north-1.amazonaws.com.cn/lqdzj-image'

REST_FRAMEWORK = {
'DEFAULT_PERMISSION_CLASSES': (
'rest_framework.permissions.IsAuthenticated',
), 'DEFAULT_AUTHENTICATION_CLASSES': (
'rest_framework.authentication.SessionAuthentication',
)
}

AUTH_USER_MODEL = 'auth.User'
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ django-background-tasks
Jinja2
djangorestframework
django-filter
git+git://github.com/sshwsfc/xadmin.git@django2
6 changes: 5 additions & 1 deletion sutradata/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,11 @@ def fetch_cut_file(reel, vol_page):

def compute_accurate_cut(reel):
sid = reel.sutra.sid
pagetexts = reel.text[2:].split('\np\n')
try:
reel_ocr_text = ReelOCRText.objects.get(reel_id = reel.id)
except:
return None
pagetexts = reel_ocr_text.text[2:].split('\np\n')
reel_correct_texts = list(ReelCorrectText.objects.filter(reel=reel).order_by('-id')[0:1])
if not reel_correct_texts:
return None
Expand Down
Empty file added sutradata/lib/__init__.py
Empty file.
88 changes: 88 additions & 0 deletions sutradata/lib/fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import json
import six
import functools

import django

from django.core.exceptions import ValidationError
from django.conf import settings
from django.db import models

try:
from django.utils.encoding import smart_unicode as smart_text
smart_text # placate pyflakes
except ImportError:
from django.utils.encoding import smart_text

# SubfieldBase causes RemovedInDjango110Warning in 1.8 and 1.9, and
# will not work in 1.10 or later
if django.VERSION[:2] >= (1, 8):
    # From Django 1.8 onwards the plain ``type`` metaclass is used.
    field_metaclass = type
else:
    # Older Django: fall back to the SubfieldBase metaclass.
    from django.db.models import SubfieldBase
    field_metaclass = SubfieldBase

# ``field_class(base)`` is shorthand for
# ``six.with_metaclass(field_metaclass, base)``.
field_class = functools.partial(six.with_metaclass, field_metaclass)

from django.contrib.postgres.fields import JSONField as JSONFieldBase


class JSONField(JSONFieldBase):
    """JSON field that stores Python structures as JSON strings in the database.

    Values are serialised with ``json.dumps`` before saving and decoded with
    ``json.loads`` whenever a text or bytes value is converted back to
    Python. Unless the caller supplies one, the field default is the ``dict``
    callable.
    """

    def __init__(self, *args, **kwargs):
        # Default to the ``dict`` callable rather than a shared ``{}`` literal.
        kwargs.setdefault('default', dict)
        super(JSONField, self).__init__(*args, **kwargs)

    def from_db_value(self, value, expression, connection, context=None):
        """Convert a value loaded from the database via ``to_python``.

        ``context`` is optional and ignored: Django 2.0 deprecated that
        argument and Django 3.0 no longer passes it, so requiring it would
        raise ``TypeError`` on newer versions while older versions still
        supply it.
        """
        return self.to_python(value)

    def to_python(self, value):
        """Convert the input JSON value into Python structures.

        Returns ``{}`` for any falsy value when the field is ``blank``.
        Bytes are decoded as UTF-8, strings are parsed as JSON, and any
        other value is returned unchanged.

        Raises:
            django.core.exceptions.ValidationError: if a string value cannot
                be parsed as JSON.
        """
        if self.blank and not value:
            return {}
        # A falsy value on a non-blank field is treated as an empty object.
        value = value or '{}'
        if isinstance(value, six.binary_type):
            value = six.text_type(value, 'utf-8')
        if not isinstance(value, six.string_types):
            # Already a Python structure; nothing to decode.
            return value
        try:
            # with django 1.6 i have '"{}"' as default value here
            # NOTE(review): this also strips the quotes from any top-level
            # JSON string literal, so such input fails to parse below.
            if value[0] == value[-1] == '"':
                value = value[1:-1]

            return json.loads(value)
        except Exception as err:
            raise ValidationError(str(err))

    def validate(self, value, model_instance):
        """Check that a string value is valid JSON.

        Non-string values are accepted without further checks.

        Raises:
            django.core.exceptions.ValidationError: if the parent validation
                fails or the string cannot be parsed as JSON.
        """
        if isinstance(value, six.string_types):
            super(JSONField, self).validate(value, model_instance)
            try:
                json.loads(value)
            except Exception as err:
                raise ValidationError(str(err))

    def get_prep_value(self, value):
        """Convert value to a JSON string before save.

        Raises:
            django.core.exceptions.ValidationError: if the value cannot be
                serialised by ``json.dumps``.
        """
        try:
            return json.dumps(value)
        except Exception as err:
            raise ValidationError(str(err))

    def value_to_string(self, obj):
        """Return the field value of ``obj`` as text (the dumped JSON string)."""
        return smart_text(self.value_from_object(obj))

    def value_from_object(self, obj):
        """Return the field value of ``obj`` dumped to a JSON string."""
        orig_val = super(JSONField, self).value_from_object(obj)
        return self.get_prep_value(orig_val)
12 changes: 7 additions & 5 deletions sutradata/management/commands/init.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,13 @@ def handle(self, *args, **options):
huayan_yb_1 = Reel(sutra=huayan_yb, reel_no=1, start_vol=27,
start_vol_page=1, end_vol=27, end_vol_page=23, edition_type=Reel.EDITION_TYPE_CHECKED,
path1='27')
huayan_yb_1.save()
text = get_reel_text(huayan_yb_1)
#filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001.txt' % huayan_yb.sid)
#with open(filename, 'r') as f:
# huayan_yb_1.text = f.read()
huayan_yb_1.text = text
huayan_yb_1.save()
reel_ocr_text_yb_1 = ReelOCRText(reel=huayan_yb_1, text = text)
reel_ocr_text_yb_1.save()

filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001_fixed.txt' % huayan_yb.sid)
with open(filename, 'r') as f:
Expand Down Expand Up @@ -79,8 +80,9 @@ def handle(self, *args, **options):
filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001.txt' % huayan_gl.sid)
with open(filename, 'r') as f:
text = f.read()
huayan_gl_1.text = text
huayan_gl_1.save()
reel_ocr_text = ReelOCRText(reel=huayan_gl_1, text = text)
reel_ocr_text.save()
reelcorrecttext = ReelCorrectText(reel=huayan_gl_1, text=text)
reelcorrecttext.save()

Expand All @@ -96,11 +98,11 @@ def handle(self, *args, **options):

# create Tasks
# Correct Task
separators = extract_page_line_separators(huayan_yb_1.text)
separators = extract_page_line_separators(reel_ocr_text_yb_1.text)
separators_json = json.dumps(separators, separators=(',', ':'))

# 文字校对
diff_lst, base_text = CompareReel.generate_compare_reel(reelcorrecttext.text, huayan_yb_1.text)
diff_lst, base_text = CompareReel.generate_compare_reel(reelcorrecttext.text, reel_ocr_text_yb_1.text)
compare_reel = CompareReel(reel=huayan_yb_1, base_reel=huayan_gl_1, base_text=base_text)
compare_reel.save()

Expand Down
15 changes: 10 additions & 5 deletions sutradata/management/commands/initjudge.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,12 @@ def handle(self, *args, **options):
huayan_ql_1 = Reel(sutra=huayan_ql, reel_no=1, start_vol=24,
start_vol_page=2, end_vol=24, end_vol_page=17, edition_type=Reel.EDITION_TYPE_CHECKED,
path1='24')
huayan_ql_1.save()
filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001.txt' % huayan_ql.sid)
with open(filename, 'r') as f:
huayan_ql_1.text = f.read()
huayan_ql_1.save()
text = f.read()
reel_ocr_text = ReelOCRText(reel=huayan_ql_1, text = text)
reel_ocr_text.save()
filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001_fixed.txt' % huayan_ql.sid)
with open(filename, 'r') as f:
text = f.read()
Expand All @@ -59,12 +61,15 @@ def handle(self, *args, **options):
except:
huayan_cb_1 = Reel(sutra=huayan_cb, reel_no=1, start_vol=14,
start_vol_page=31, end_vol=14, end_vol_page=37, edition_type=Reel.EDITION_TYPE_BASE)
huayan_cb_1.save()
filename = os.path.join(BASE_DIR, 'data/sutra_text/%s_001_punct.txt' % huayan_cb.sid)
with open(filename, 'r') as f:
text = f.read()
punctuation, huayan_cb_1.text = extract_punct(text)
huayan_cb_1.save()
reelcorrecttext = ReelCorrectText(reel=huayan_cb_1, text=huayan_cb_1.text)
punctuation, text = extract_punct(text)
reel_ocr_text = ReelOCRText(reel=huayan_cb_1, text = text)
reel_ocr_text.save()

reelcorrecttext = ReelCorrectText(reel=huayan_cb_1, text=text)
reelcorrecttext.save()
punct = Punct(reel=huayan_cb_1, reeltext=reelcorrecttext, punctuation=json.dumps(punctuation))
punct.save()
Expand Down
108 changes: 64 additions & 44 deletions sutradata/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,29 @@
from django.db import models
from django.utils import timezone
from django.core.exceptions import ValidationError
from .lib.fields import JSONField
import json

class SutraTextField(models.TextField):

description = '存储经文内容,换行用\n,每页前有换页标记p\n'

def __init__(self, *args, **kwargs):
kwargs['blank'] = True
super().__init__(*args, **kwargs)

def deconstruct(self):
name, path, args, kwargs = super().deconstruct()
del kwargs["blank"]
return name, path, args, kwargs

def get_prep_value(self, value):
value = value.replace('\r\n', '\n')
value = super().get_prep_value(value)
return self.to_python(value)

class TripiMixin(object):
def __str__(self):
return self.name

class Tripitaka(models.Model, TripiMixin):
class Tripitaka(models.Model):
code = models.CharField(verbose_name='实体藏经版本编码', max_length=2, blank=False)
name = models.CharField(verbose_name='实体藏经名称', max_length=32, blank=False)
shortname = models.CharField(verbose_name='简称(用于校勘记)', max_length=32, blank=False)
remark = models.TextField('备注', blank=True, default='')
path1_char = models.CharField('存储层次1字母', max_length=1, blank=True, default='')
path1_name = models.CharField('存储层次1中文名', max_length=16, blank=True, default='')
path2_char = models.CharField('存储层次2字母', max_length=1, blank=True, default='')
path2_name = models.CharField('存储层次2中文名', max_length=16, blank=True, default='')
path3_char = models.CharField('存储层次3字母', max_length=1, blank=True, default='')
path3_name = models.CharField('存储层次3中文名', max_length=16, blank=True, default='')

class Meta:
verbose_name = '实体藏经'
Expand All @@ -52,7 +46,7 @@ class Meta:
def __str__(self):
return '%s: 第%s册' % (self.tripitaka.name, self.vol_no)

class LQSutra(models.Model, TripiMixin):
class LQSutra(models.Model):
sid = models.CharField(verbose_name='龙泉经目经号编码', max_length=8) #(为"LQ"+ 经序号 + 别本号)
code = models.CharField(verbose_name='龙泉经目编码', max_length=5, blank=False)
variant_code = models.CharField(verbose_name='龙泉经目别本编码', max_length=1, default='0')
Expand All @@ -67,7 +61,7 @@ class Meta:
def __str__(self):
return '%s: %s' % (self.sid, self.name)

class Sutra(models.Model, TripiMixin):
class Sutra(models.Model):
sid = models.CharField(verbose_name='实体藏经|唯一经号编码', editable=True, max_length=8)
tripitaka = models.ForeignKey(Tripitaka, on_delete=models.CASCADE)
code = models.CharField(verbose_name='实体经目编码', max_length=5, blank=False)
Expand All @@ -89,7 +83,6 @@ def __str__(self):
class LQReel(models.Model):
lqsutra = models.ForeignKey(LQSutra, verbose_name='龙泉经目编码', on_delete=models.CASCADE)
reel_no = models.SmallIntegerField('卷序号')
text = SutraTextField('经文', default='')
remark = models.TextField('备注', blank=True, default='')

class Meta:
Expand Down Expand Up @@ -120,18 +113,11 @@ class Reel(models.Model):
path1 = models.CharField('存储层次1', max_length=16, default='')
path2 = models.CharField('存储层次2', max_length=16, default='')
path3 = models.CharField('存储层次3', max_length=16, default='')
text = SutraTextField('经文', default='') #按实际行加了换行符,换页标记为p\n
fixed = models.BooleanField('是否有调整', default=False)
f_start_page = models.CharField('起始页ID', max_length=18, default='', blank=True, null=True)
f_start_line_no = models.IntegerField('起始页行序号', default=-1)
f_start_char_no = models.IntegerField('起始页的行中字序号', default=-1)
f_end_page = models.CharField('终止页ID', max_length=18, default='', blank=True, null=True)
f_end_line_no = models.IntegerField('终止页行序号', default=-1)
f_end_char_no = models.IntegerField('终止页的行中字序号', default=-1)
f_text = SutraTextField('调整经文', default='', blank=True, null=True)
correct_text = SutraTextField('文字校对后的经文', default='') #按实际行加了换行符,换页标记为p\n
edition_type = models.SmallIntegerField('版本类型', choices=EDITION_TYPE_CHOICES, default=0)
remark = models.TextField('备注', blank=True, default='')
image_ready = models.BooleanField(verbose_name='图片状态', default=False)
cut_ready = models.BooleanField(verbose_name='切分数据状态', default=False)
column_ready = models.BooleanField(verbose_name='切列图状态', default=False)

class Meta:
verbose_name = '实体藏经卷'
Expand All @@ -155,37 +141,71 @@ def url_prefix(self):
s = '/%s/%s/%s_%s_' % (tcode, path_str, tcode, filename_str)
return s

def image_prefix(self):
tcode = self.sutra.sid[0:2]
path_lst = []
if self.path1:
path_lst.append(self.path1)
if self.path2:
path_lst.append(self.path2)
if self.path3:
path_lst.append(self.path3)
filename_str = '_'.join(path_lst)
s = '%s_%s_' % (tcode, filename_str)
return s
class ReelOCRText(models.Model):
reel = models.OneToOneField(Reel, verbose_name='实体藏经卷', on_delete=models.CASCADE, primary_key=True)
text = SutraTextField('经文', blank=True, default='') #按实际行加了换行符,换页标记为p\n
fixed = models.BooleanField('是否有调整', default=False)
f_start_page = models.CharField('起始页ID', max_length=18, default='', blank=True, null=True)
f_start_line_no = models.IntegerField('起始页行序号', default=-1)
f_start_char_no = models.IntegerField('起始页的行中字序号', default=-1)
f_end_page = models.CharField('终止页ID', max_length=18, default='', blank=True, null=True)
f_end_line_no = models.IntegerField('终止页行序号', default=-1)
f_end_char_no = models.IntegerField('终止页的行中字序号', default=-1)
f_text = SutraTextField('调整经文', blank=True, default='')

class Meta:
verbose_name = '实体藏经卷OCR经文'
verbose_name_plural = '实体藏经卷OCR经文'

class PageStatus:
INITIAL = 0
RECT_NOTFOUND = 1
PARSE_FAILED = 2
RECT_NOTREADY = 3
CUT_PIC_NOTFOUND = 4
COL_PIC_NOTFOUND = 5
COL_POS_NOTFOUND = 6
RECT_COL_NOTREADY = 7
RECT_COL_NOTFOUND = 8
READY = 9
MARKED = 10

CHOICES = (
(INITIAL, u'初始化'),
(RECT_NOTFOUND, u'切分数据未上传'),
(PARSE_FAILED, u'数据解析失败'),
(RECT_NOTREADY, u'字块数据未展开'),
(CUT_PIC_NOTFOUND, u'图片不存在'),
(COL_PIC_NOTFOUND, u'列图不存在'),
(COL_POS_NOTFOUND, u'列图坐标不存在'),
(RECT_COL_NOTREADY, u'字块对应列图未准备'),
(RECT_COL_NOTFOUND, u'字块对应列图不存在'),
(READY, u'已准备好'),
(MARKED, u'已入卷标记'),
)

class Page(models.Model):
pid = models.CharField('页ID', editable=True, max_length=13, primary_key=True) #sid + 3位卷号 + 2位页序号,页序号从1计数。如:YB00086000101
pid = models.CharField(verbose_name='实体藏经页级总编码', max_length=21, blank=False, primary_key=True)
reel = models.ForeignKey(Reel, verbose_name='实体藏经卷', on_delete=models.CASCADE)
reel_page_no = models.SmallIntegerField('卷中页序号')
vol_no = models.SmallIntegerField('册序号')
page_no = models.SmallIntegerField('页序号')
text = SutraTextField('经文') # 文字校对后的经文
page_no = models.SmallIntegerField('页序号')
bar_no = models.CharField('栏序号', max_length=1, default='0')
status = models.PositiveSmallIntegerField(db_index=True, verbose_name=u'操作类型',
choices=PageStatus.CHOICES, default=PageStatus.INITIAL)
bar_info = JSONField(verbose_name='栏信息', default=dict)
text = SutraTextField('经文', blank=True) # 文字校对后的经文
cut_info = models.TextField('切分信息')
cut_updated_at = models.DateTimeField('更新时间', null=True)
cut_add_count = models.SmallIntegerField('切分信息增加字数', default=0)
cut_wrong_count = models.SmallIntegerField('切分信息识别错的字数', default=0)
cut_confirm_count = models.SmallIntegerField('切分信息需要确认的字数', default=0)
cut_verify_count = models.SmallIntegerField('切分信息需要确认的字数', default=0)
s3_id = models.CharField(verbose_name='图片路径', max_length=128, default='', blank=False)

class Meta:
verbose_name = '实体藏经页'
verbose_name_plural = '实体藏经页'

def __str__(self):
return '%s第%s册第%s页' % (self.reel, self.vol_no, self.page_no)
return '%s第%s页' % (self.reel, self.reel_page_no)

Loading

0 comments on commit 9afdf12

Please sign in to comment.