Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for Python 3 and reorganization #1

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.idea/
*.pyc
Background/
*.ttf
*.otf
11 changes: 0 additions & 11 deletions .idea/DocEmul.iml

This file was deleted.

4 changes: 0 additions & 4 deletions .idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions .idea/modules.xml

This file was deleted.

6 changes: 0 additions & 6 deletions .idea/vcs.xml

This file was deleted.

20 changes: 10 additions & 10 deletions branden.xml → Data/Brandenburg/branden.xml
Original file line number Diff line number Diff line change
Expand Up @@ -169,21 +169,21 @@


<dictonaries>
<Dictonary path="divina.txt" number_words="-1"/>
<Dictonary path="extra2.txt" number_words="1"/>
<Dictonary path="divina.txt" number_words="1"/>
<Dictonary path="numbers.txt" number_words="1"/>
<Dictonary path="Dictionaries/divina.txt" number_words="-1"/>
<Dictonary path="Dictionaries/extra2.txt" number_words="1"/>
<Dictonary path="Dictionaries/divina.txt" number_words="1"/>
<Dictonary path="Dictionaries/numbers.txt" number_words="1"/>
</dictonaries>
<fonts>
<Font path="handwritten/A_Glitch_In_Time.ttf" />
<Font path="fonts/A_Glitch_In_Time.ttf" />

<Font path="handwritten/Lovelt__.ttf" />
<Font path="handwritten/WankstabergBattles.ttf" />
<Font path="handwritten/SCRIPTIN.ttf" />
<Font path="fonts/Lovelt__.ttf" />
<Font path="fonts/WankstabergBattles.ttf" />
<Font path="fonts/SCRIPTIN.ttf" />


<Font path="handwritten/kevinwildfont.ttf" />
<Font path="handwritten/Mr_Fisherman_and_the_Shoemaker.ttf" />
<Font path="fonts/kevinwildfont.ttf" />
<Font path="fonts/Mr_Fisherman_and_the_Shoemaker.ttf" />

</fonts>
</Documents>
20 changes: 10 additions & 10 deletions branden2.xml → Data/Brandenburg/branden2.xml
Original file line number Diff line number Diff line change
Expand Up @@ -694,24 +694,24 @@


<dictonaries>
<Dictonary path="divina.txt" number_words="-1"/>
<Dictonary path="extra2.txt" number_words="1"/>
<Dictonary path="divina.txt" number_words="1"/>
<Dictonary path="numbers.txt" number_words="1"/>
<Dictonary path="extra3.txt" number_words="1"/>
<Dictonary path="Dictionaries/divina.txt" number_words="-1"/>
<Dictonary path="Dictionaries/extra2.txt" number_words="1"/>
<Dictonary path="Dictionaries/divina.txt" number_words="1"/>
<Dictonary path="Dictionaries/numbers.txt" number_words="1"/>
<Dictonary path="Dictionaries/extra3.txt" number_words="1"/>
</dictonaries>
<fonts>
<Font path="handwritten/A_Glitch_In_Time.ttf" />
<Font path="fonts/A_Glitch_In_Time.ttf" />

<Font path="handwritten/FountainPen.ttf" />
<Font path="fonts/FountainPen.ttf" />

<Font path="handwritten/Taken_by_Vultures_Demo.otf" />
<Font path="fonts/Taken_by_Vultures_Demo.otf" />

<Font path="handwritten/SCRIPTIN.ttf" />
<Font path="fonts/SCRIPTIN.ttf" />
<!-- not yet used -->


<Font path="handwritten/Lemon_Tuesday.otf" />
<Font path="fonts/Lemon_Tuesday.otf" />



Expand Down
18 changes: 9 additions & 9 deletions esposalles.xml → Data/Esposalles/esposalles.xml
Original file line number Diff line number Diff line change
Expand Up @@ -387,20 +387,20 @@
</Document>

<dictonaries>
<Dictonary path="divina.txt" number_words="-1"/>
<Dictonary path="extra.txt" number_words="1"/>
<Dictonary path="numbers.txt" number_words="1"/>
<Dictonary path="Dictionaries/divina.txt" number_words="-1"/>
<Dictonary path="Dictionaries/extra.txt" number_words="1"/>
<Dictonary path="Dictionaries/numbers.txt" number_words="1"/>
</dictonaries>
<fonts>
<Font path="handwritten/A_Glitch_In_Time.ttf" />
<Font path="fonts/A_Glitch_In_Time.ttf" />

<Font path="handwritten/Lovelt__.ttf" />
<Font path="handwritten/WankstabergBattles.ttf" />
<Font path="handwritten/SCRIPTIN.ttf" />
<Font path="fonts/Lovelt__.ttf" />
<Font path="fonts/WankstabergBattles.ttf" />
<Font path="fonts/SCRIPTIN.ttf" />


<Font path="handwritten/kevinwildfont.ttf" />
<Font path="handwritten/Mr_Fisherman_and_the_Shoemaker.ttf" />
<Font path="fonts/kevinwildfont.ttf" />
<Font path="fonts/Mr_Fisherman_and_the_Shoemaker.ttf" />



Expand Down
18 changes: 9 additions & 9 deletions esposalles_big.xml → Data/Esposalles/esposalles_big.xml
Original file line number Diff line number Diff line change
Expand Up @@ -387,20 +387,20 @@
</Document>

<dictonaries>
<Dictonary path="divina.txt" number_words="-1"/>
<Dictonary path="extra.txt" number_words="1"/>
<Dictonary path="numbers.txt" number_words="1"/>
<Dictonary path="Dictionaries/divina.txt" number_words="-1"/>
<Dictonary path="Dictionaries/extra.txt" number_words="1"/>
<Dictonary path="Dictionaries/numbers.txt" number_words="1"/>
</dictonaries>
<fonts>
<Font path="handwritten/A_Glitch_In_Time.ttf" />
<Font path="fonts/A_Glitch_In_Time.ttf" />

<Font path="handwritten/Lovelt__.ttf" />
<Font path="handwritten/WankstabergBattles.ttf" />
<Font path="handwritten/SCRIPTIN.ttf" />
<Font path="fonts/Lovelt__.ttf" />
<Font path="fonts/WankstabergBattles.ttf" />
<Font path="fonts/SCRIPTIN.ttf" />


<Font path="handwritten/kevinwildfont.ttf" />
<Font path="handwritten/Mr_Fisherman_and_the_Shoemaker.ttf" />
<Font path="fonts/kevinwildfont.ttf" />
<Font path="fonts/Mr_Fisherman_and_the_Shoemaker.ttf" />



Expand Down
22 changes: 9 additions & 13 deletions esposalles_small.xml → Data/Esposalles/esposalles_small.xml
Original file line number Diff line number Diff line change
Expand Up @@ -387,24 +387,20 @@
</Document>

<dictonaries>
<Dictonary path="divina.txt" number_words="-1"/>
<Dictonary path="extra.txt" number_words="1"/>
<Dictonary path="numbers.txt" number_words="1"/>
<Dictonary path="Dictionaries/divina.txt" number_words="-1"/>
<Dictonary path="Dictionaries/extra.txt" number_words="1"/>
<Dictonary path="Dictionaries/numbers.txt" number_words="1"/>
</dictonaries>
<fonts>
<Font path="handwritten/A_Glitch_In_Time.ttf" />

<Font path="handwritten/Lovelt__.ttf" />
<Font path="handwritten/WankstabergBattles.ttf" />
<Font path="handwritten/SCRIPTIN.ttf" />


<Font path="handwritten/kevinwildfont.ttf" />
<Font path="handwritten/Mr_Fisherman_and_the_Shoemaker.ttf" />

<Font path="fonts/A_Glitch_In_Time.ttf" />

<Font path="fonts/Lovelt__.ttf" />
<Font path="fonts/WankstabergBattles.ttf" />
<Font path="fonts/SCRIPTIN.ttf" />


<Font path="fonts/kevinwildfont.ttf" />
<Font path="fonts/Mr_Fisherman_and_the_Shoemaker.ttf" />

</fonts>
</Documents>
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,20 @@ in `<repo>/Data` you can find some generated files for two different datasets (E

### Document structure
There are several XML files that describe the document structure. These files are also used in the experiments proposed in the research.

### Text data
The text files contain the data used to write text during the production process.

### Fonts
Before starting the generation process, you need to download the fonts used for the experiments from `http://www.dafont.com/`.
Run the script `python download_font.py' which creates the directory `handwritten`. Here you can find some fonts downloaded from `http://www.dafont.com/` and used to generate the synthetic data for the experiments.
Run the script `python download_font.py` which creates the directory `fonts`. Here you can find some fonts downloaded from `http://www.dafont.com/` and used to generate the synthetic data for the experiments.

# Generation process

## Pre-requisites
- Python 3
- `pip install -r py-requirements.txt`

## Esposalles
It is possible to emulate the generation of documents for the dataset Esposalles.

Expand All @@ -46,7 +53,7 @@ Try to modify the script to generate more pages.
If you need more instructions to define the document structure, please contact us and we will be glad to help you.

## Brandenburg
It is possible to emulate the generation of documents for the dataset Brandenburg. Running the script `python test_generate_brandenburg.py` it will be possible to generate documents following the Brandenburg model (`branden2.xml`). It will build the synthetic dataset (only text over a transparent background) into the local directory `GENERATED/Brandenburg/test`.
It is possible to emulate the generation of documents for the dataset Brandenburg. Running the script `python generate_brandenburg.py` it will be possible to generate documents following the Brandenburg model (`branden2.xml`). It will build the synthetic dataset (only text over a transparent background) into the local directory `GENERATED/Brandenburg/test`.

# Citing DocEmul

Expand Down
6 changes: 3 additions & 3 deletions branden_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@



def create_bf(dir='/home/scstech/PycharmProjects/synthetic_handwritten/CNN_MODELS/test_set/F2-1'):
def create_bf(dir='/home/scstech/PycharmProjects/synthetic_fonts/CNN_MODELS/test_set/F2-1'):
files = [os.path.join(dir,f) for f in os.listdir(dir) if f.split('.')[-1] == 'jpg']
np.random.shuffle(files)
print len(files)
Expand All @@ -16,7 +16,7 @@ def create_bf(dir='/home/scstech/PycharmProjects/synthetic_handwritten/CNN_MODEL



def run_realbackground(dir, path_background,num=9, size=None, greyscale=False,type='TXT',model='branden.xml'):
def run_realbackground(dir, path_background,num=9, size=None, greyscale=False,type='TXT',model='Data/Brandenburg/branden.xml'):
RealBackGound.dirname = path_background
realSampler = RealBackGound.load_examples()

Expand All @@ -26,7 +26,7 @@ def run_realbackground(dir, path_background,num=9, size=None, greyscale=False,ty
#solidSampler = SolidBackGroundback(colors=[255])
#realSampler = RealBackGound.load_examples()

def run(dir, num=5, size=None, sampler=None, greyscale=False,type='TXT',model='branden2.xml',seed=2):
def run(dir, num=5, size=None, sampler=None, greyscale=False,type='TXT',model='Data/Brandenburg/branden2.xml',seed=2):
generate(dir, num=num, size=size,sampler=sampler, greyscale=greyscale,type=type,model=model,seed=seed)


Expand Down
2 changes: 1 addition & 1 deletion docemul/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from extract_background import create_background_dataset
from .extract_background import create_background_dataset
Binary file removed docemul/__init__.pyc
Binary file not shown.
40 changes: 23 additions & 17 deletions docemul/augment.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
import PIL,Image
from skimage.color import rgb2grey
import PIL
from PIL import Image
from skimage.color import rgb2gray
import os
import csv
from scipy.misc import imread,imsave
from imageio import imread, imsave
from skimage.transform import resize as imresize
import numpy as np

def augment_img(img, rotate=3, rotate_time=2, noise=2):
imgs = [img]

from scipy.misc import imrotate
for _ in range(rotate_time):

rotation = np.random.random()*rotate*2 - rotate
print rotation
print(rotation)
M = 255

im = PIL.Image.fromarray(np.uint8(img))
img = (img * 255).astype(np.uint8)
im = PIL.Image.fromarray(img)
# converted to have an alpha layer
im2 = im.convert('RGBA')
# rotated image
Expand All @@ -27,7 +29,8 @@ def augment_img(img, rotate=3, rotate_time=2, noise=2):
out = out.convert(im.mode)

im = np.asarray(out).copy()
im = rgb2grey(im)
if len(im.shape) != 2:
im = rgb2gray(im)
part = tuple(map(int,0.05 * np.array(im.shape[:2])))

top = im[:part[0],:]
Expand Down Expand Up @@ -64,33 +67,36 @@ def augment_img(img, rotate=3, rotate_time=2, noise=2):
def generate_noise(img, rand_no_noise=0.9, max_rand_distr=0.03, step=16):

mod = np.zeros_like(img)
for w in range(0, img.shape[0], img.shape[0] / step):
for w in range(0, img.shape[0], img.shape[0] // step):

for h in range(0, img.shape[1], img.shape[1] / step):
for h in range(0, img.shape[1], img.shape[1] // step):
if np.random.random() <= rand_no_noise:
v = 0

else:
v = np.random.random() * max_rand_distr

m = mod[w:w + img.shape[0] / (step / 2), h:h + img.shape[1] / (step / 2)]
m = mod[w:w + img.shape[0] // (step // 2), h:h + img.shape[1] // (step // 2)]
noise = (np.random.rand(m.shape[0], m.shape[1]) < v).astype(np.uint8)

mod[w:w + img.shape[0] / (step / 2), h:h + img.shape[1] / (step / 2)] += noise
mod[w:w + img.shape[0] // (step // 2), h:h + img.shape[1] // (step // 2)] += noise

return (mod>0).astype(np.uint8)

def data_augment(fcsv, dir_target,f_output = 'gt_augment.csv',resize=(450,190),rotate=2, rotate_time=1, noise=1):
os.makedirs(dir_target)
os.makedirs(dir_target, exist_ok=True)
img_dir = os.path.join(dir_target, 'imgs')
os.makedirs(img_dir)
os.makedirs(img_dir, exist_ok=True)
f_csv_o = os.path.join(dir_target, f_output)


with open(fcsv, 'r') as csv_in:
with open(f_csv_o, 'w') as csv_out:
writer = csv.writer(csv_out, delimiter=' ')
for f, r in csv.reader(csv_in, delimiter=' '):
for row in csv.reader(csv_in, delimiter=' '):
if not row:
continue
f, r = row
if os.path.isfile(f):
img = imread(f)
if resize:
Expand All @@ -107,8 +113,8 @@ def data_augment(fcsv, dir_target,f_output = 'gt_augment.csv',resize=(450,190),r
for j, ii in enumerate(imgs):
im_name = os.path.join(img_dir, fname+'_'+str(j)+'.'+ext)
imsave(im_name, ii)
print im_name
print(im_name)
writer.writerow([im_name, r])
else:
print 'file non valid:', f
print('file non valid:', f)

Binary file removed docemul/augment.pyc
Binary file not shown.
Loading