diff --git a/Makefile b/Makefile index 614787a..46ee9c2 100644 --- a/Makefile +++ b/Makefile @@ -13,11 +13,11 @@ init: $(ORIGINALFOLDER) cp $(ORIGINALFOLDER)/training2nd/clk.*.bz2 $(TRAIN) cp $(ORIGINALFOLDER)/training3rd/imp.*.bz2 $(TRAIN) cp $(ORIGINALFOLDER)/training3rd/clk.*.bz2 $(TRAIN) - bzip2 -d $(TRAIN)/* + pbzip2 -d $(TRAIN)/* mkdir -p $(TEST) cp $(ORIGINALFOLDER)/testing2nd/* $(TEST) cp $(ORIGINALFOLDER)/testing3rd/* $(TEST) - bzip2 -d $(TEST)/* + pbzip2 -d $(TEST)/* mkdir $(BASE)/all clk: $(TRAIN) diff --git a/README.md b/README.md index aa66e3a..ba9606d 100644 --- a/README.md +++ b/README.md @@ -3,19 +3,28 @@ make-ipinyou-data This project is to formalise the iPinYou RTB data into a standard format for further researches. +**Run this code on Linux or WSL to avoid unexpected errors.** + ### Step 0 -The raw data of iPinYou (`ipinyou.contest.dataset.zip`) can be downloaded from [UCL website](http://bunwell.cs.ucl.ac.uk/ipinyou.contest.dataset.zip). +The raw data of iPinYou (`ipinyou.contest.dataset.zip`) can be downloaded from [Kaggle](https://www.kaggle.com/datasets/lastsummer/ipinyou). Unzip it and get the folder `ipinyou.contest.dataset`. +The Makefile decompresses the data with `pbzip2` (a parallel, faster `bzip2`), so install `pbzip2` before running it. +``` +# for example on Ubuntu +sudo apt-get update +sudo apt-get install pbzip2 +``` + ### Step 1 Update the soft link for the folder `ipinyou.contest.dataset` in `original-data`.
``` -weinan@ZHANG:~/Project/make-ipinyou-data/original-data$ ln -sfn ~/Data/ipinyou.contest.dataset ipinyou.contest.dataset +make-ipinyou-data/original-data$ ln -sfn ~/Data/ipinyou.contest.dataset ipinyou.contest.dataset ``` Under `make-ipinyou-data/original-data/ipinyou.contest.dataset` there should be the original dataset files like this: ``` -weinan@ZHANG:~/Project/make-ipinyou-data/original-data/ipinyou.contest.dataset$ ls +make-ipinyou-data/original-data/ipinyou.contest.dataset$ ls algo.submission.demo.tar.bz2 README testing2nd training3rd city.cn.txt region.cn.txt testing3rd user.profile.tags.cn.txt city.en.txt region.en.txt training1st user.profile.tags.en.txt @@ -28,7 +37,7 @@ Under `make-ipinyou-data` folder, just run `make all`. After the program finished, the total size of the folder will be 14G. The files under `make-ipinyou-data` should be like this: ``` -weinan@ZHANG:~/Project/make-ipinyou-data$ ls +make-ipinyou-data$ ls 1458 2261 2997 3386 3476 LICENSE mkyzxdata.sh python schema.txt 2259 2821 3358 3427 all Makefile original-data README.md ``` @@ -37,7 +46,7 @@ Normally, we only do experiment for each campaign (e.g. `1458`). `all` is just t ### Use of the data We use campaign 1458 as example here. ``` -weinan@ZHANG:~/Project/make-ipinyou-data/1458$ ls +make-ipinyou-data/1458$ ls featindex.txt test.log.txt test.yzx.txt train.log.txt train.yzx.txt ``` * `train.log.txt` and `test.log.txt` are the formalised string data for each row (record) in train and test. The first column is whether the user click the ad or not. The 14th column is the winning price for this auction. @@ -45,4 +54,5 @@ featindex.txt test.log.txt test.yzx.txt train.log.txt train.yzx.txt * `train.yzx.txt` and `test.yzx.txt` are the mapped vector data for `train.log.txt` and `test.log.txt`. The format is y:click, z:wining_price, and x:features. Such data is in the standard form as introduced in [iPinYou Benchmarking](http://arxiv.org/abs/1407.7073). 
-For any questions, please report the issues or contact [Weinan Zhang](http://www0.cs.ucl.ac.uk/staff/w.zhang/). +For any questions, please open an issue or contact [Weinan Zhang](http://www0.cs.ucl.ac.uk/staff/w.zhang/) or [frinkleko](https://github.com/frinkleko). + diff --git a/mkyzxdata.sh b/mkyzxdata.sh index 777f511..aa129a8 100644 --- a/mkyzxdata.sh +++ b/mkyzxdata.sh @@ -2,6 +2,6 @@ advertisers="1458 2261 2997 3386 3476 2259 2821 3358 3427" for advertiser in $advertisers; do echo $advertiser - python python/mkyzx.py $advertiser/train.log.txt $advertiser/test.log.txt $advertiser/train.yzx.txt $advertiser/test.yzx.txt $advertiser/featindex.txt + python3 python/mkyzx.py $advertiser/train.log.txt $advertiser/test.log.txt $advertiser/train.yzx.txt $advertiser/test.yzx.txt $advertiser/featindex.txt done diff --git a/python/formalizeua.py b/python/formalizeua.py index 24311c4..fc1ac3a 100755 --- a/python/formalizeua.py +++ b/python/formalizeua.py @@ -1,9 +1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import os if len(sys.argv) < 2: - print 'Usage: input' + print('Usage: input') exit(-1) diff --git a/python/mkdata.py b/python/mkdata.py index c827445..9f1a196 100755 --- a/python/mkdata.py +++ b/python/mkdata.py @@ -1,9 +1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys from datetime import date if len(sys.argv) < 3: - print 'Usage: schema clickfiles' + print('Usage: schema clickfiles') exit(-1) schema = [ s.strip() for s in open(sys.argv[1]).read().split() ] diff --git a/python/mktest.py b/python/mktest.py index c6afa62..b50f39f 100755 --- a/python/mktest.py +++ b/python/mktest.py @@ -1,9 +1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys from datetime import date if len(sys.argv) < 2: - print 'Usage: schema ' + print('Usage: schema ') exit(-1) schema = [ s.strip() for s in open(sys.argv[1]).read().split() ] diff --git a/python/mkyzx.py b/python/mkyzx.py index caac4d7..c65d75b 100755 --- a/python/mkyzx.py +++ b/python/mkyzx.py @@ -1,9
+1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import operator if len(sys.argv) < 5: - print 'Usage: train.log.txt test.log.txt train.lr.txt test.lr.txt featindex.txt' + print('Usage: train.log.txt test.log.txt train.lr.txt test.lr.txt featindex.txt') exit(-1) oses = ["windows", "ios", "mac", "android", "linux"] @@ -89,15 +89,15 @@ def getTags(content): featindex[feat] = maxindex maxindex += 1 -print 'feature size: ' + str(maxindex) -featvalue = sorted(featindex.iteritems(), key=operator.itemgetter(1)) +print('feature size: ' + str(maxindex)) +featvalue = sorted(featindex.items(), key=operator.itemgetter(1)) fo = open(sys.argv[5], 'w') for fv in featvalue: fo.write(fv[0] + '\t' + str(fv[1]) + '\n') fo.close() # indexing train -print 'indexing ' + sys.argv[1] +print('indexing ' + sys.argv[1]) fi = open(sys.argv[1], 'r') fo = open(sys.argv[3], 'w') @@ -138,7 +138,7 @@ def getTags(content): fo.close() # indexing test -print 'indexing ' + sys.argv[2] +print('indexing ' + sys.argv[2]) fi = open(sys.argv[2], 'r') fo = open(sys.argv[4], 'w') @@ -154,8 +154,8 @@ def getTags(content): for f in f1s: # every direct first order feature col = namecol[f] if col >= len(s): - print 'col: ' + str(col) - print line + print('col: ' + str(col)) + print(line) content = s[col] feat = str(col) + ':' + content if feat not in featindex: diff --git a/python/splitadvertisers.py b/python/splitadvertisers.py index 43bf20a..ce40f7e 100755 --- a/python/splitadvertisers.py +++ b/python/splitadvertisers.py @@ -1,9 +1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import os if len(sys.argv) < 5: - print 'Usage: ipinyou.folder 25 train.log.txt test.log.txt' + print('Usage: ipinyou.folder 25 train.log.txt test.log.txt') # python splitadvertisers.py ../ 25 ../all/train.log.txt ../all/test.log.txt exit(-1)