wnzhang · frinkleko · Jan 15, 2023 · Jan 15, 2023 · Jan 15, 2023 · Jan 15, 2023
diff --git a/Makefile b/Makefile
@@ -13,11 +13,11 @@ init: $(ORIGINALFOLDER)
 	cp $(ORIGINALFOLDER)/training2nd/clk.*.bz2 $(TRAIN)
 	cp $(ORIGINALFOLDER)/training3rd/imp.*.bz2 $(TRAIN)
 	cp $(ORIGINALFOLDER)/training3rd/clk.*.bz2 $(TRAIN)
-	bzip2 -d $(TRAIN)/*
+	pbzip2 -d $(TRAIN)/*
 	mkdir -p $(TEST)
 	cp $(ORIGINALFOLDER)/testing2nd/* $(TEST)
 	cp $(ORIGINALFOLDER)/testing3rd/* $(TEST)
-	bzip2 -d $(TEST)/*
+	pbzip2 -d $(TEST)/*
 	mkdir $(BASE)/all	
 
 clk: $(TRAIN)

diff --git a/README.md b/README.md
@@ -3,19 +3,28 @@ make-ipinyou-data
 
 This project is to formalise the iPinYou RTB data into a standard format for further researches.
 
+**Run this code on Linux or WSL to avoid unexpected errors.**
+
 ### Step 0
-The raw data of iPinYou (`ipinyou.contest.dataset.zip`) can be downloaded from [UCL website](http://bunwell.cs.ucl.ac.uk/ipinyou.contest.dataset.zip).
+The raw data of iPinYou (`ipinyou.contest.dataset.zip`) can be downloaded from [Kaggle](https://www.kaggle.com/datasets/lastsummer/ipinyou).
 
 Unzip it and get the folder `ipinyou.contest.dataset`.
 
+The Makefile decompresses the data with `pbzip2` (a parallel, faster alternative to `bzip2`), so install it before running `make all`.
+```
+# for example on Ubuntu
+sudo apt-get update
+sudo apt-get install pbzip2
+```
+
 ### Step 1
 Update the soft link for the folder `ipinyou.contest.dataset` in `original-data`. 
 ```
-weinan@ZHANG:~/Project/make-ipinyou-data/original-data$ ln -sfn ~/Data/ipinyou.contest.dataset ipinyou.contest.dataset
+make-ipinyou-data/original-data$ ln -sfn ~/Data/ipinyou.contest.dataset ipinyou.contest.dataset
 ```
 Under `make-ipinyou-data/original-data/ipinyou.contest.dataset` there should be the original dataset files like this:
 ```
-weinan@ZHANG:~/Project/make-ipinyou-data/original-data/ipinyou.contest.dataset$ ls
+make-ipinyou-data/original-data/ipinyou.contest.dataset$ ls
 algo.submission.demo.tar.bz2  README         testing2nd   training3rd
 city.cn.txt                   region.cn.txt  testing3rd   user.profile.tags.cn.txt
 city.en.txt                   region.en.txt  training1st  user.profile.tags.en.txt
@@ -28,7 +37,7 @@ Under `make-ipinyou-data` folder, just run `make all`.
 
 After the program finished, the total size of the folder will be 14G. The files under `make-ipinyou-data` should be like this:
 ```
-weinan@ZHANG:~/Project/make-ipinyou-data$ ls
+make-ipinyou-data$ ls
 1458  2261  2997  3386  3476  LICENSE   mkyzxdata.sh   python     schema.txt
 2259  2821  3358  3427  all   Makefile  original-data  README.md
 ```
@@ -37,12 +46,13 @@ Normally, we only do experiment for each campaign (e.g. `1458`). `all` is just t
 ### Use of the data
 We use campaign 1458 as example here.
 ```
-weinan@ZHANG:~/Project/make-ipinyou-data/1458$ ls
+make-ipinyou-data/1458$ ls
 featindex.txt  test.log.txt  test.yzx.txt  train.log.txt  train.yzx.txt
 ```
 * `train.log.txt` and `test.log.txt` are the formalised string data for each row (record) in train and test. The first column is whether the user click the ad or not. The 14th column is the winning price for this auction.
 * `featindex.txt`maps the features to their indexes. For example, `8:115.45.195.*	29` means that the 8th column in `train.log.txt` with the string `115.45.195.*` maps to feature index `29`.
 * `train.yzx.txt` and `test.yzx.txt` are the mapped vector data for `train.log.txt` and `test.log.txt`. The format is y:click, z:wining_price, and x:features. Such data is in the standard form as introduced in [iPinYou Benchmarking](http://arxiv.org/abs/1407.7073).
 
 
-For any questions, please report the issues or contact [Weinan Zhang](http://www0.cs.ucl.ac.uk/staff/w.zhang/).
+For any questions, please open an issue or contact [Weinan Zhang](http://www0.cs.ucl.ac.uk/staff/w.zhang/) or [frinkleko](https://github.com/frinkleko).
+
diff --git a/mkyzxdata.sh b/mkyzxdata.sh
@@ -2,6 +2,6 @@ advertisers="1458 2261 2997 3386 3476 2259 2821 3358 3427"
 
 for advertiser in $advertisers; do
     echo $advertiser
-    python python/mkyzx.py $advertiser/train.log.txt $advertiser/test.log.txt $advertiser/train.yzx.txt $advertiser/test.yzx.txt $advertiser/featindex.txt
+    python3 python/mkyzx.py $advertiser/train.log.txt $advertiser/test.log.txt $advertiser/train.yzx.txt $advertiser/test.yzx.txt $advertiser/featindex.txt
 done
 
diff --git a/python/formalizeua.py b/python/formalizeua.py
@@ -1,9 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import sys
 import os
 
 if len(sys.argv) < 2:
-    print 'Usage: input'
+    print('Usage: input')
     exit(-1)
 
 

diff --git a/python/mkdata.py b/python/mkdata.py
@@ -1,9 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import sys
 from datetime import date
 
 if len(sys.argv) < 3:
-    print 'Usage: schema clickfiles'
+    print('Usage: schema clickfiles')
     exit(-1)
 
 schema = [ s.strip() for s in open(sys.argv[1]).read().split() ]

diff --git a/python/mktest.py b/python/mktest.py
@@ -1,9 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import sys
 from datetime import date
 
 if len(sys.argv) < 2:
-    print 'Usage: schema '
+    print('Usage: schema ')
     exit(-1)
 
 schema = [ s.strip() for s in open(sys.argv[1]).read().split() ]

diff --git a/python/mkyzx.py b/python/mkyzx.py
@@ -1,9 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import sys
 import operator
 
 if len(sys.argv) < 5:
-    print 'Usage: train.log.txt test.log.txt train.lr.txt test.lr.txt featindex.txt'
+    print('Usage: train.log.txt test.log.txt train.lr.txt test.lr.txt featindex.txt')
     exit(-1)
 
 oses = ["windows", "ios", "mac", "android", "linux"]
@@ -89,15 +89,15 @@ def getTags(content):
             featindex[feat] = maxindex
             maxindex += 1
 
-print 'feature size: ' + str(maxindex)
-featvalue = sorted(featindex.iteritems(), key=operator.itemgetter(1))
+print('feature size: ' + str(maxindex))
+featvalue = sorted(featindex.items(), key=operator.itemgetter(1))
 fo = open(sys.argv[5], 'w')
 for fv in featvalue:
     fo.write(fv[0] + '\t' + str(fv[1]) + '\n')
 fo.close()
 
 # indexing train
-print 'indexing ' + sys.argv[1]
+print('indexing ' + sys.argv[1])
 fi = open(sys.argv[1], 'r')
 fo = open(sys.argv[3], 'w')
 
@@ -138,7 +138,7 @@ def getTags(content):
 fo.close()
 
 # indexing test
-print 'indexing ' + sys.argv[2]
+print('indexing ' + sys.argv[2])
 fi = open(sys.argv[2], 'r')
 fo = open(sys.argv[4], 'w')
 
@@ -154,8 +154,8 @@ def getTags(content):
     for f in f1s: # every direct first order feature
         col = namecol[f]
         if col >= len(s):
-            print 'col: ' + str(col)
-            print line
+            print('col: ' + str(col))
+            print(line)
         content = s[col]
         feat = str(col) + ':' + content
         if feat not in featindex:

diff --git a/python/splitadvertisers.py b/python/splitadvertisers.py
@@ -1,9 +1,9 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 import sys
 import os
 
 if len(sys.argv) < 5:
-    print 'Usage: ipinyou.folder 25 train.log.txt test.log.txt'
+    print('Usage: ipinyou.folder 25 train.log.txt test.log.txt')
     # python splitadvertisers.py ../ 25 ../all/train.log.txt ../all/test.log.txt
     exit(-1)