scstech85 · GokulNC · Jun 25, 2020 · Jun 25, 2020 · Jun 26, 2020 · Jun 26, 2020
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+.idea/
+*.pyc
+Background/
+*.ttf
+*.otf
diff --git a/.idea/DocEmul.iml b/.idea/DocEmul.iml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/branden.xml → Data/Brandenburg/branden.xml b/branden.xml → Data/Brandenburg/branden.xml
@@ -169,21 +169,21 @@
 
 
     <dictonaries>
-        <Dictonary path="divina.txt" number_words="-1"/>
-        <Dictonary path="extra2.txt" number_words="1"/>
-        <Dictonary path="divina.txt" number_words="1"/>
-        <Dictonary path="numbers.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/divina.txt" number_words="-1"/>
+        <Dictonary path="Dictionaries/extra2.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/divina.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/numbers.txt" number_words="1"/>
     </dictonaries>
     <fonts>
-        <Font path="handwritten/A_Glitch_In_Time.ttf" />
+        <Font path="fonts/A_Glitch_In_Time.ttf" />
 
-        <Font path="handwritten/Lovelt__.ttf" />
-        <Font path="handwritten/WankstabergBattles.ttf" />
-        <Font path="handwritten/SCRIPTIN.ttf" />
+        <Font path="fonts/Lovelt__.ttf" />
+        <Font path="fonts/WankstabergBattles.ttf" />
+        <Font path="fonts/SCRIPTIN.ttf" />
 
 
-        <Font path="handwritten/kevinwildfont.ttf" />
-        <Font path="handwritten/Mr_Fisherman_and_the_Shoemaker.ttf" />
+        <Font path="fonts/kevinwildfont.ttf" />
+        <Font path="fonts/Mr_Fisherman_and_the_Shoemaker.ttf" />
 
     </fonts>
 </Documents>
diff --git a/branden2.xml → Data/Brandenburg/branden2.xml b/branden2.xml → Data/Brandenburg/branden2.xml
@@ -694,24 +694,24 @@
 
 
     <dictonaries>
-        <Dictonary path="divina.txt" number_words="-1"/>
-        <Dictonary path="extra2.txt" number_words="1"/>
-        <Dictonary path="divina.txt" number_words="1"/>
-        <Dictonary path="numbers.txt" number_words="1"/>
-        <Dictonary path="extra3.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/divina.txt" number_words="-1"/>
+        <Dictonary path="Dictionaries/extra2.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/divina.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/numbers.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/extra3.txt" number_words="1"/>
     </dictonaries>
     <fonts>
-        <Font path="handwritten/A_Glitch_In_Time.ttf" />
+        <Font path="fonts/A_Glitch_In_Time.ttf" />
 
-        <Font path="handwritten/FountainPen.ttf" />
+        <Font path="fonts/FountainPen.ttf" />
 
-        <Font path="handwritten/Taken_by_Vultures_Demo.otf" />
+        <Font path="fonts/Taken_by_Vultures_Demo.otf" />
 
-        <Font path="handwritten/SCRIPTIN.ttf" />
+        <Font path="fonts/SCRIPTIN.ttf" />
         <!-- not yet used -->
 
 
-        <Font path="handwritten/Lemon_Tuesday.otf" />
+        <Font path="fonts/Lemon_Tuesday.otf" />
 
 
 

diff --git a/esposalles.xml → Data/Esposalles/esposalles.xml b/esposalles.xml → Data/Esposalles/esposalles.xml
@@ -387,20 +387,20 @@
     </Document>
 
     <dictonaries>
-        <Dictonary path="divina.txt" number_words="-1"/>
-        <Dictonary path="extra.txt" number_words="1"/>
-        <Dictonary path="numbers.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/divina.txt" number_words="-1"/>
+        <Dictonary path="Dictionaries/extra.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/numbers.txt" number_words="1"/>
     </dictonaries>
     <fonts>
-        <Font path="handwritten/A_Glitch_In_Time.ttf" />
+        <Font path="fonts/A_Glitch_In_Time.ttf" />
 
-        <Font path="handwritten/Lovelt__.ttf" />
-        <Font path="handwritten/WankstabergBattles.ttf" />
-        <Font path="handwritten/SCRIPTIN.ttf" />
+        <Font path="fonts/Lovelt__.ttf" />
+        <Font path="fonts/WankstabergBattles.ttf" />
+        <Font path="fonts/SCRIPTIN.ttf" />
 
 
-        <Font path="handwritten/kevinwildfont.ttf" />
-        <Font path="handwritten/Mr_Fisherman_and_the_Shoemaker.ttf" />
+        <Font path="fonts/kevinwildfont.ttf" />
+        <Font path="fonts/Mr_Fisherman_and_the_Shoemaker.ttf" />
 
 
 

diff --git a/esposalles_big.xml → Data/Esposalles/esposalles_big.xml b/esposalles_big.xml → Data/Esposalles/esposalles_big.xml
@@ -387,20 +387,20 @@
     </Document>
 
     <dictonaries>
-        <Dictonary path="divina.txt" number_words="-1"/>
-        <Dictonary path="extra.txt" number_words="1"/>
-        <Dictonary path="numbers.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/divina.txt" number_words="-1"/>
+        <Dictonary path="Dictionaries/extra.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/numbers.txt" number_words="1"/>
     </dictonaries>
     <fonts>
-        <Font path="handwritten/A_Glitch_In_Time.ttf" />
+        <Font path="fonts/A_Glitch_In_Time.ttf" />
 
-        <Font path="handwritten/Lovelt__.ttf" />
-        <Font path="handwritten/WankstabergBattles.ttf" />
-        <Font path="handwritten/SCRIPTIN.ttf" />
+        <Font path="fonts/Lovelt__.ttf" />
+        <Font path="fonts/WankstabergBattles.ttf" />
+        <Font path="fonts/SCRIPTIN.ttf" />
 
 
-        <Font path="handwritten/kevinwildfont.ttf" />
-        <Font path="handwritten/Mr_Fisherman_and_the_Shoemaker.ttf" />
+        <Font path="fonts/kevinwildfont.ttf" />
+        <Font path="fonts/Mr_Fisherman_and_the_Shoemaker.ttf" />
 
 
 

diff --git a/esposalles_small.xml → Data/Esposalles/esposalles_small.xml b/esposalles_small.xml → Data/Esposalles/esposalles_small.xml
@@ -387,24 +387,20 @@
     </Document>
 
     <dictonaries>
-        <Dictonary path="divina.txt" number_words="-1"/>
-        <Dictonary path="extra.txt" number_words="1"/>
-        <Dictonary path="numbers.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/divina.txt" number_words="-1"/>
+        <Dictonary path="Dictionaries/extra.txt" number_words="1"/>
+        <Dictonary path="Dictionaries/numbers.txt" number_words="1"/>
     </dictonaries>
     <fonts>
-        <Font path="handwritten/A_Glitch_In_Time.ttf" />
-
-        <Font path="handwritten/Lovelt__.ttf" />
-        <Font path="handwritten/WankstabergBattles.ttf" />
-        <Font path="handwritten/SCRIPTIN.ttf" />
-
-
-        <Font path="handwritten/kevinwildfont.ttf" />
-        <Font path="handwritten/Mr_Fisherman_and_the_Shoemaker.ttf" />
-
+        <Font path="fonts/A_Glitch_In_Time.ttf" />
 
+        <Font path="fonts/Lovelt__.ttf" />
+        <Font path="fonts/WankstabergBattles.ttf" />
+        <Font path="fonts/SCRIPTIN.ttf" />
 
 
+        <Font path="fonts/kevinwildfont.ttf" />
+        <Font path="fonts/Mr_Fisherman_and_the_Shoemaker.ttf" />
 
     </fonts>
 </Documents>
diff --git a/divina.txt → Dictionaries/divina.txt b/divina.txt → Dictionaries/divina.txt
diff --git a/extra.txt → Dictionaries/extra.txt b/extra.txt → Dictionaries/extra.txt
diff --git a/extra2.txt → Dictionaries/extra2.txt b/extra2.txt → Dictionaries/extra2.txt
diff --git a/extra3.txt → Dictionaries/extra3.txt b/extra3.txt → Dictionaries/extra3.txt
diff --git a/numbers.txt → Dictionaries/numbers.txt b/numbers.txt → Dictionaries/numbers.txt
diff --git a/README.md b/README.md
@@ -19,13 +19,20 @@ in `<repo>/Data` you can find some generated files for two different datasets (E
 
 ### Document structure
 There are several XML files that describe the document structure. These files are also used in the experiments proposed in the research.
+
 ### Text data
 In the text files there are the data used to write text during the production process.
+
 ### Fonts
 Before starting the generation process, you need to download the fonts used in the experiments from `http://www.dafont.com/`.
-Run the script `python download_font.py' which creates the directory `handwritten`. Here you can find some fonts downloaded from `http://www.dafont.com/` and used to generate the synthetic data for the experiments.
+Run the script `python download_font.py` which creates the directory `fonts`. Here you can find some fonts downloaded from `http://www.dafont.com/` and used to generate the synthetic data for the experiments.
 
 # Generation process
+
+## Pre-requisites
+- Python 3
+- `pip install -r py-requirements.txt`
+
 ## Esposalles
 It is possible to emulate the generation of documents for the dataset Esposalles.
 
@@ -46,7 +53,7 @@ Try to modify the script to generate more pages.
 If you need more instructions to define the document structure, please contact us and we will be glad to help you.
 
 ## Brandenburg
-It is possible to emulate the generation of documents for the dataset Brandenburg. Running the script `python test_generate_brandenburg.py` it will be possible to generate documents following the Brandenburg model (`branden2.xml`). It will build the synthetic dataset (only text over a transparent background)  into the local directory `GENERATED/Brandenburg/test`.
+It is possible to emulate the generation of documents for the Brandenburg dataset. Running the script `python generate_brandenburg.py` generates documents following the Brandenburg model (`branden2.xml`). It builds the synthetic dataset (only text over a transparent background) in the local directory `GENERATED/Brandenburg/test`.
 
 # Citing DocEmul
 

diff --git a/branden_test.py b/branden_test.py
@@ -5,7 +5,7 @@
 
 
 
-def create_bf(dir='/home/scstech/PycharmProjects/synthetic_handwritten/CNN_MODELS/test_set/F2-1'):
+def create_bf(dir='/home/scstech/PycharmProjects/synthetic_fonts/CNN_MODELS/test_set/F2-1'):
     files = [os.path.join(dir,f) for f in os.listdir(dir) if f.split('.')[-1] == 'jpg']
     np.random.shuffle(files)
     print len(files)
@@ -16,7 +16,7 @@ def create_bf(dir='/home/scstech/PycharmProjects/synthetic_handwritten/CNN_MODEL
 
 
 
-def run_realbackground(dir, path_background,num=9, size=None, greyscale=False,type='TXT',model='branden.xml'):
+def run_realbackground(dir, path_background,num=9, size=None, greyscale=False,type='TXT',model='Data/Brandenburg/branden.xml'):
     RealBackGound.dirname = path_background
     realSampler = RealBackGound.load_examples()
 
@@ -26,7 +26,7 @@ def run_realbackground(dir, path_background,num=9, size=None, greyscale=False,ty
 #solidSampler = SolidBackGroundback(colors=[255])
 #realSampler = RealBackGound.load_examples()
 
-def run(dir, num=5, size=None, sampler=None, greyscale=False,type='TXT',model='branden2.xml',seed=2):
+def run(dir, num=5, size=None, sampler=None, greyscale=False,type='TXT',model='Data/Brandenburg/branden2.xml',seed=2):
     generate(dir, num=num, size=size,sampler=sampler, greyscale=greyscale,type=type,model=model,seed=seed)
 
 

diff --git a/docemul/__init__.py b/docemul/__init__.py
@@ -1 +1 @@
-from extract_background import create_background_dataset
+from .extract_background import create_background_dataset
diff --git a/docemul/__init__.pyc b/docemul/__init__.pyc
diff --git a/docemul/augment.py b/docemul/augment.py
@@ -1,20 +1,22 @@
-import PIL,Image
-from skimage.color import rgb2grey
+import PIL
+from PIL import Image
+from skimage.color import rgb2gray
 import os
 import csv
-from scipy.misc import imread,imsave
+from imageio import imread, imsave
+from skimage.transform import resize as imresize
 import numpy as np
+
 def augment_img(img, rotate=3, rotate_time=2, noise=2):
     imgs = [img]
 
-    from scipy.misc import imrotate
     for _ in range(rotate_time):
 
         rotation = np.random.random()*rotate*2 - rotate
-        print rotation
+        print(rotation)
         M = 255
-
-        im = PIL.Image.fromarray(np.uint8(img))
+        img = (img * 255).astype(np.uint8)
+        im = PIL.Image.fromarray(img)
         # converted to have an alpha layer
         im2 = im.convert('RGBA')
         # rotated image
@@ -27,7 +29,8 @@ def augment_img(img, rotate=3, rotate_time=2, noise=2):
         out = out.convert(im.mode)
 
         im = np.asarray(out).copy()
-        im = rgb2grey(im)
+        if len(im.shape) != 2:
+            im = rgb2gray(im)
         part = tuple(map(int,0.05 * np.array(im.shape[:2])))
 
         top = im[:part[0],:]
@@ -64,33 +67,36 @@ def augment_img(img, rotate=3, rotate_time=2, noise=2):
 def generate_noise(img, rand_no_noise=0.9, max_rand_distr=0.03, step=16):
 
     mod = np.zeros_like(img)
-    for w in range(0, img.shape[0], img.shape[0] / step):
+    for w in range(0, img.shape[0], img.shape[0] // step):
 
-        for h in range(0, img.shape[1], img.shape[1] / step):
+        for h in range(0, img.shape[1], img.shape[1] // step):
             if np.random.random() <= rand_no_noise:
                 v = 0
 
             else:
                 v = np.random.random() * max_rand_distr
 
-            m = mod[w:w + img.shape[0] / (step / 2), h:h + img.shape[1] / (step / 2)]
+            m = mod[w:w + img.shape[0] // (step // 2), h:h + img.shape[1] // (step // 2)]
             noise = (np.random.rand(m.shape[0], m.shape[1]) < v).astype(np.uint8)
 
-            mod[w:w + img.shape[0] / (step / 2), h:h + img.shape[1] / (step / 2)] += noise
+            mod[w:w + img.shape[0] // (step // 2), h:h + img.shape[1] // (step // 2)] += noise
 
     return (mod>0).astype(np.uint8)
 
 def data_augment(fcsv, dir_target,f_output = 'gt_augment.csv',resize=(450,190),rotate=2, rotate_time=1, noise=1):
-    os.makedirs(dir_target)
+    os.makedirs(dir_target, exist_ok=True)
     img_dir = os.path.join(dir_target, 'imgs')
-    os.makedirs(img_dir)
+    os.makedirs(img_dir, exist_ok=True)
     f_csv_o = os.path.join(dir_target, f_output)
 
 
     with open(fcsv, 'r') as csv_in:
         with open(f_csv_o, 'w') as csv_out:
             writer = csv.writer(csv_out, delimiter=' ')
-            for f, r in csv.reader(csv_in, delimiter=' '):
+            for row in csv.reader(csv_in, delimiter=' '):
+                if not row:
+                    continue
+                f, r = row
                 if os.path.isfile(f):
                     img = imread(f)
                     if resize:
@@ -107,8 +113,8 @@ def data_augment(fcsv, dir_target,f_output = 'gt_augment.csv',resize=(450,190),r
                     for j, ii in enumerate(imgs):
                         im_name = os.path.join(img_dir, fname+'_'+str(j)+'.'+ext)
                         imsave(im_name, ii)
-                        print im_name
+                        print(im_name)
                         writer.writerow([im_name, r])
                 else:
-                    print 'file non valid:', f
+                    print('file non valid:', f)
 
diff --git a/docemul/augment.pyc b/docemul/augment.pyc
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from extract_background import create_background_dataset
		from .extract_background import create_background_dataset