diff --git a/Makefile b/Makefile
index 614787a..46ee9c2 100644
--- a/Makefile
+++ b/Makefile
@@ -13,11 +13,11 @@ init: $(ORIGINALFOLDER)
 	cp $(ORIGINALFOLDER)/training2nd/clk.*.bz2 $(TRAIN)
 	cp $(ORIGINALFOLDER)/training3rd/imp.*.bz2 $(TRAIN)
 	cp $(ORIGINALFOLDER)/training3rd/clk.*.bz2 $(TRAIN)
-	bzip2 -d $(TRAIN)/*
+	pbzip2 -d $(TRAIN)/*
 	mkdir -p $(TEST)
 	cp $(ORIGINALFOLDER)/testing2nd/* $(TEST)
 	cp $(ORIGINALFOLDER)/testing3rd/* $(TEST)
-	bzip2 -d $(TEST)/*
+	pbzip2 -d $(TEST)/*
 	mkdir $(BASE)/all	
 
 clk: $(TRAIN)
diff --git a/README.md b/README.md
index aa66e3a..ba9606d 100644
--- a/README.md
+++ b/README.md
@@ -3,19 +3,28 @@ make-ipinyou-data
 
 This project is to formalise the iPinYou RTB data into a standard format for further researches.
 
+**Run this code on Linux or WSL to avoid unexpected errors.**
+
 ### Step 0
-The raw data of iPinYou (`ipinyou.contest.dataset.zip`) can be downloaded from [UCL website](http://bunwell.cs.ucl.ac.uk/ipinyou.contest.dataset.zip).
+The raw data of iPinYou (`ipinyou.contest.dataset.zip`) can be downloaded from [Kaggle](https://www.kaggle.com/datasets/lastsummer/ipinyou).
 
 Unzip it and get the folder `ipinyou.contest.dataset`.
 
+The Makefile decompresses the data with `pbzip2` (a parallel implementation of bzip2), so install it before running `make`.
+```
+# for example on Ubuntu
+sudo apt-get update
+sudo apt-get install pbzip2
+```
+
 ### Step 1
 Update the soft link for the folder `ipinyou.contest.dataset` in `original-data`. 
 ```
-weinan@ZHANG:~/Project/make-ipinyou-data/original-data$ ln -sfn ~/Data/ipinyou.contest.dataset ipinyou.contest.dataset
+make-ipinyou-data/original-data$ ln -sfn ~/Data/ipinyou.contest.dataset ipinyou.contest.dataset
 ```
 Under `make-ipinyou-data/original-data/ipinyou.contest.dataset` there should be the original dataset files like this:
 ```
-weinan@ZHANG:~/Project/make-ipinyou-data/original-data/ipinyou.contest.dataset$ ls
+make-ipinyou-data/original-data/ipinyou.contest.dataset$ ls
 algo.submission.demo.tar.bz2  README         testing2nd   training3rd
 city.cn.txt                   region.cn.txt  testing3rd   user.profile.tags.cn.txt
 city.en.txt                   region.en.txt  training1st  user.profile.tags.en.txt
@@ -28,7 +37,7 @@ Under `make-ipinyou-data` folder, just run `make all`.
 
 After the program finished, the total size of the folder will be 14G. The files under `make-ipinyou-data` should be like this:
 ```
-weinan@ZHANG:~/Project/make-ipinyou-data$ ls
+make-ipinyou-data$ ls
 1458  2261  2997  3386  3476  LICENSE   mkyzxdata.sh   python     schema.txt
 2259  2821  3358  3427  all   Makefile  original-data  README.md
 ```
@@ -37,7 +46,7 @@ Normally, we only do experiment for each campaign (e.g. `1458`). `all` is just t
 ### Use of the data
 We use campaign 1458 as example here.
 ```
-weinan@ZHANG:~/Project/make-ipinyou-data/1458$ ls
+make-ipinyou-data/1458$ ls
 featindex.txt  test.log.txt  test.yzx.txt  train.log.txt  train.yzx.txt
 ```
 * `train.log.txt` and `test.log.txt` are the formalised string data for each row (record) in train and test. The first column is whether the user click the ad or not. The 14th column is the winning price for this auction.
@@ -45,4 +54,5 @@ featindex.txt  test.log.txt  test.yzx.txt  train.log.txt  train.yzx.txt
 * `train.yzx.txt` and `test.yzx.txt` are the mapped vector data for `train.log.txt` and `test.log.txt`. The format is y:click, z:wining_price, and x:features. Such data is in the standard form as introduced in [iPinYou Benchmarking](http://arxiv.org/abs/1407.7073).
 
 
-For any questions, please report the issues or contact [Weinan Zhang](http://www0.cs.ucl.ac.uk/staff/w.zhang/).
+For any questions, please open an issue or contact [Weinan Zhang](http://www0.cs.ucl.ac.uk/staff/w.zhang/) or [frinkleko](https://github.com/frinkleko).
+
diff --git a/mkyzxdata.sh b/mkyzxdata.sh
index 777f511..aa129a8 100644
--- a/mkyzxdata.sh
+++ b/mkyzxdata.sh
@@ -2,6 +2,6 @@ advertisers="1458 2261 2997 3386 3476 2259 2821 3358 3427"
 
 for advertiser in $advertisers; do
     echo $advertiser
-    python python/mkyzx.py $advertiser/train.log.txt $advertiser/test.log.txt $advertiser/train.yzx.txt $advertiser/test.yzx.txt $advertiser/featindex.txt
+    python3 python/mkyzx.py $advertiser/train.log.txt $advertiser/test.log.txt $advertiser/train.yzx.txt $advertiser/test.yzx.txt $advertiser/featindex.txt
 done
 
diff --git a/python/formalizeua.py b/python/formalizeua.py
index 24311c4..fc1ac3a 100755
--- a/python/formalizeua.py
+++ b/python/formalizeua.py
@@ -1,9 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import sys
 import os
 
 if len(sys.argv) < 2:
-    print 'Usage: input'
+    print('Usage: input')
     exit(-1)
 
 
diff --git a/python/mkdata.py b/python/mkdata.py
index c827445..9f1a196 100755
--- a/python/mkdata.py
+++ b/python/mkdata.py
@@ -1,9 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import sys
 from datetime import date
 
 if len(sys.argv) < 3:
-    print 'Usage: schema clickfiles'
+    print('Usage: schema clickfiles')
     exit(-1)
 
 schema = [ s.strip() for s in open(sys.argv[1]).read().split() ]
diff --git a/python/mktest.py b/python/mktest.py
index c6afa62..b50f39f 100755
--- a/python/mktest.py
+++ b/python/mktest.py
@@ -1,9 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import sys
 from datetime import date
 
 if len(sys.argv) < 2:
-    print 'Usage: schema '
+    print('Usage: schema ')
     exit(-1)
 
 schema = [ s.strip() for s in open(sys.argv[1]).read().split() ]
diff --git a/python/mkyzx.py b/python/mkyzx.py
index caac4d7..c65d75b 100755
--- a/python/mkyzx.py
+++ b/python/mkyzx.py
@@ -1,9 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import sys
 import operator
 
 if len(sys.argv) < 5:
-    print 'Usage: train.log.txt test.log.txt train.lr.txt test.lr.txt featindex.txt'
+    print('Usage: train.log.txt test.log.txt train.lr.txt test.lr.txt featindex.txt')
     exit(-1)
 
 oses = ["windows", "ios", "mac", "android", "linux"]
@@ -89,15 +89,15 @@ def getTags(content):
             featindex[feat] = maxindex
             maxindex += 1
 
-print 'feature size: ' + str(maxindex)
-featvalue = sorted(featindex.iteritems(), key=operator.itemgetter(1))
+print('feature size: ' + str(maxindex))
+featvalue = sorted(featindex.items(), key=operator.itemgetter(1))
 fo = open(sys.argv[5], 'w')
 for fv in featvalue:
     fo.write(fv[0] + '\t' + str(fv[1]) + '\n')
 fo.close()
 
 # indexing train
-print 'indexing ' + sys.argv[1]
+print('indexing ' + sys.argv[1])
 fi = open(sys.argv[1], 'r')
 fo = open(sys.argv[3], 'w')
 
@@ -138,7 +138,7 @@ def getTags(content):
 fo.close()
 
 # indexing test
-print 'indexing ' + sys.argv[2]
+print('indexing ' + sys.argv[2])
 fi = open(sys.argv[2], 'r')
 fo = open(sys.argv[4], 'w')
 
@@ -154,8 +154,8 @@ def getTags(content):
     for f in f1s: # every direct first order feature
         col = namecol[f]
         if col >= len(s):
-            print 'col: ' + str(col)
-            print line
+            print('col: ' + str(col))
+            print(line)
         content = s[col]
         feat = str(col) + ':' + content
         if feat not in featindex:
diff --git a/python/splitadvertisers.py b/python/splitadvertisers.py
index 43bf20a..ce40f7e 100755
--- a/python/splitadvertisers.py
+++ b/python/splitadvertisers.py
@@ -1,9 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import sys
 import os
 
 if len(sys.argv) < 5:
-    print 'Usage: ipinyou.folder 25 train.log.txt test.log.txt'
+    print('Usage: ipinyou.folder 25 train.log.txt test.log.txt')
     # python splitadvertisers.py ../ 25 ../all/train.log.txt ../all/test.log.txt
     exit(-1)