From 0ee8b67e46c96bdfde0a1df3e530798a62935ab9 Mon Sep 17 00:00:00 2001 From: frinkleko Date: Sun, 15 Jan 2023 14:39:40 +0800 Subject: [PATCH 1/4] fix: modified for python3 --- python/formalizeua.py | 4 ++-- python/mkdata.py | 4 ++-- python/mktest.py | 4 ++-- python/mkyzx.py | 14 +++++++------- python/splitadvertisers.py | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/formalizeua.py b/python/formalizeua.py index 24311c4..fc1ac3a 100755 --- a/python/formalizeua.py +++ b/python/formalizeua.py @@ -1,9 +1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import os if len(sys.argv) < 2: - print 'Usage: input' + print('Usage: input') exit(-1) diff --git a/python/mkdata.py b/python/mkdata.py index c827445..9f1a196 100755 --- a/python/mkdata.py +++ b/python/mkdata.py @@ -1,9 +1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys from datetime import date if len(sys.argv) < 3: - print 'Usage: schema clickfiles' + print('Usage: schema clickfiles') exit(-1) schema = [ s.strip() for s in open(sys.argv[1]).read().split() ] diff --git a/python/mktest.py b/python/mktest.py index c6afa62..b50f39f 100755 --- a/python/mktest.py +++ b/python/mktest.py @@ -1,9 +1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys from datetime import date if len(sys.argv) < 2: - print 'Usage: schema ' + print('Usage: schema ') exit(-1) schema = [ s.strip() for s in open(sys.argv[1]).read().split() ] diff --git a/python/mkyzx.py b/python/mkyzx.py index caac4d7..a2d6112 100755 --- a/python/mkyzx.py +++ b/python/mkyzx.py @@ -1,9 +1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import operator if len(sys.argv) < 5: - print 'Usage: train.log.txt test.log.txt train.lr.txt test.lr.txt featindex.txt' + print('Usage: train.log.txt test.log.txt train.lr.txt test.lr.txt featindex.txt') exit(-1) oses = ["windows", "ios", "mac", "android", "linux"] @@ -89,7 +89,7 @@ def getTags(content): featindex[feat] = maxindex maxindex += 1 -print 'feature size: ' + 
str(maxindex) +print('feature size: ' + str(maxindex)) featvalue = sorted(featindex.iteritems(), key=operator.itemgetter(1)) fo = open(sys.argv[5], 'w') for fv in featvalue: @@ -97,7 +97,7 @@ def getTags(content): fo.close() # indexing train -print 'indexing ' + sys.argv[1] +print('indexing ' + sys.argv[1]) fi = open(sys.argv[1], 'r') fo = open(sys.argv[3], 'w') @@ -138,7 +138,7 @@ def getTags(content): fo.close() # indexing test -print 'indexing ' + sys.argv[2] +print('indexing ' + sys.argv[2]) fi = open(sys.argv[2], 'r') fo = open(sys.argv[4], 'w') @@ -154,8 +154,8 @@ def getTags(content): for f in f1s: # every direct first order feature col = namecol[f] if col >= len(s): - print 'col: ' + str(col) - print line + print('col: ' + str(col)) + print(line) content = s[col] feat = str(col) + ':' + content if feat not in featindex: diff --git a/python/splitadvertisers.py b/python/splitadvertisers.py index 43bf20a..ce40f7e 100755 --- a/python/splitadvertisers.py +++ b/python/splitadvertisers.py @@ -1,9 +1,9 @@ -#!/usr/bin/python +#!/usr/bin/python3 import sys import os if len(sys.argv) < 5: - print 'Usage: ipinyou.folder 25 train.log.txt test.log.txt' + print('Usage: ipinyou.folder 25 train.log.txt test.log.txt') # python splitadvertisers.py ../ 25 ../all/train.log.txt ../all/test.log.txt exit(-1) From be26638e446639422c1576abf6e9263265283d64 Mon Sep 17 00:00:00 2001 From: frinkleko Date: Sun, 15 Jan 2023 14:48:32 +0800 Subject: [PATCH 2/4] fix: replace iteritems func and python in .sh --- README.md | 15 +++++++++------ mkyzxdata.sh | 2 +- python/mkyzx.py | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index aa66e3a..b2bc319 100644 --- a/README.md +++ b/README.md @@ -3,19 +3,21 @@ make-ipinyou-data This project is to formalise the iPinYou RTB data into a standard format for further researches. 
+**You should run these codes on Linux or WSL for unexpected errors.** + ### Step 0 -The raw data of iPinYou (`ipinyou.contest.dataset.zip`) can be downloaded from [UCL website](http://bunwell.cs.ucl.ac.uk/ipinyou.contest.dataset.zip). +The raw data of iPinYou (`ipinyou.contest.dataset.zip`) can be downloaded from [Kaggle](https://www.kaggle.com/datasets/lastsummer/ipinyou). Unzip it and get the folder `ipinyou.contest.dataset`. ### Step 1 Update the soft link for the folder `ipinyou.contest.dataset` in `original-data`. ``` -weinan@ZHANG:~/Project/make-ipinyou-data/original-data$ ln -sfn ~/Data/ipinyou.contest.dataset ipinyou.contest.dataset +make-ipinyou-data/original-data$ ln -sfn ~/Data/ipinyou.contest.dataset ipinyou.contest.dataset ``` Under `make-ipinyou-data/original-data/ipinyou.contest.dataset` there should be the original dataset files like this: ``` -weinan@ZHANG:~/Project/make-ipinyou-data/original-data/ipinyou.contest.dataset$ ls +make-ipinyou-data/original-data/ipinyou.contest.dataset$ ls algo.submission.demo.tar.bz2 README testing2nd training3rd city.cn.txt region.cn.txt testing3rd user.profile.tags.cn.txt city.en.txt region.en.txt training1st user.profile.tags.en.txt @@ -28,7 +30,7 @@ Under `make-ipinyou-data` folder, just run `make all`. After the program finished, the total size of the folder will be 14G. The files under `make-ipinyou-data` should be like this: ``` -weinan@ZHANG:~/Project/make-ipinyou-data$ ls +make-ipinyou-data$ ls 1458 2261 2997 3386 3476 LICENSE mkyzxdata.sh python schema.txt 2259 2821 3358 3427 all Makefile original-data README.md ``` @@ -37,7 +39,7 @@ Normally, we only do experiment for each campaign (e.g. `1458`). `all` is just t ### Use of the data We use campaign 1458 as example here. 
``` -weinan@ZHANG:~/Project/make-ipinyou-data/1458$ ls +make-ipinyou-data/1458$ ls featindex.txt test.log.txt test.yzx.txt train.log.txt train.yzx.txt ``` * `train.log.txt` and `test.log.txt` are the formalised string data for each row (record) in train and test. The first column is whether the user click the ad or not. The 14th column is the winning price for this auction. @@ -45,4 +47,5 @@ featindex.txt test.log.txt test.yzx.txt train.log.txt train.yzx.txt * `train.yzx.txt` and `test.yzx.txt` are the mapped vector data for `train.log.txt` and `test.log.txt`. The format is y:click, z:wining_price, and x:features. Such data is in the standard form as introduced in [iPinYou Benchmarking](http://arxiv.org/abs/1407.7073). -For any questions, please report the issues or contact [Weinan Zhang](http://www0.cs.ucl.ac.uk/staff/w.zhang/). +For any questions, please report the issues or contact [Weinan Zhang](http://www0.cs.ucl.ac.uk/staff/w.zhang/) or [frinkleko](https://github.com/frinkleko) + diff --git a/mkyzxdata.sh b/mkyzxdata.sh index 777f511..aa129a8 100644 --- a/mkyzxdata.sh +++ b/mkyzxdata.sh @@ -2,6 +2,6 @@ advertisers="1458 2261 2997 3386 3476 2259 2821 3358 3427" for advertiser in $advertisers; do echo $advertiser - python python/mkyzx.py $advertiser/train.log.txt $advertiser/test.log.txt $advertiser/train.yzx.txt $advertiser/test.yzx.txt $advertiser/featindex.txt + python3 python/mkyzx.py $advertiser/train.log.txt $advertiser/test.log.txt $advertiser/train.yzx.txt $advertiser/test.yzx.txt $advertiser/featindex.txt done diff --git a/python/mkyzx.py b/python/mkyzx.py index a2d6112..c65d75b 100755 --- a/python/mkyzx.py +++ b/python/mkyzx.py @@ -90,7 +90,7 @@ def getTags(content): maxindex += 1 print('feature size: ' + str(maxindex)) -featvalue = sorted(featindex.iteritems(), key=operator.itemgetter(1)) +featvalue = sorted(featindex.items(), key=operator.itemgetter(1)) fo = open(sys.argv[5], 'w') for fv in featvalue: fo.write(fv[0] + '\t' + str(fv[1]) + '\n') From 
a41782aeda43e72d2ffe8748420f7fb5cea84311 Mon Sep 17 00:00:00 2001 From: frinkleko Date: Sun, 15 Jan 2023 15:09:23 +0800 Subject: [PATCH 3/4] fix: wrong description in readme --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b2bc319..43a38f8 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,15 @@ make-ipinyou-data This project is to formalise the iPinYou RTB data into a standard format for further researches. -**You should run these codes on Linux or WSL for unexpected errors.** +**You should run these codes on Linux or WSL for preventing unexpected errors.** ### Step 0 The raw data of iPinYou (`ipinyou.contest.dataset.zip`) can be downloaded from [Kaggle](https://www.kaggle.com/datasets/lastsummer/ipinyou). Unzip it and get the folder `ipinyou.contest.dataset`. +To speed up the process of + ### Step 1 Update the soft link for the folder `ipinyou.contest.dataset` in `original-data`. ``` From dee6bee41021649c3c409664af3817b2ed51b3df Mon Sep 17 00:00:00 2001 From: frinkleko Date: Sun, 15 Jan 2023 15:50:43 +0800 Subject: [PATCH 4/4] feat: use pbzip2 --- Makefile | 4 ++-- README.md | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 614787a..46ee9c2 100644 --- a/Makefile +++ b/Makefile @@ -13,11 +13,11 @@ init: $(ORIGINALFOLDER) cp $(ORIGINALFOLDER)/training2nd/clk.*.bz2 $(TRAIN) cp $(ORIGINALFOLDER)/training3rd/imp.*.bz2 $(TRAIN) cp $(ORIGINALFOLDER)/training3rd/clk.*.bz2 $(TRAIN) - bzip2 -d $(TRAIN)/* + pbzip2 -d $(TRAIN)/* mkdir -p $(TEST) cp $(ORIGINALFOLDER)/testing2nd/* $(TEST) cp $(ORIGINALFOLDER)/testing3rd/* $(TEST) - bzip2 -d $(TEST)/* + pbzip2 -d $(TEST)/* mkdir $(BASE)/all clk: $(TRAIN) diff --git a/README.md b/README.md index 43a38f8..ba9606d 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,12 @@ The raw data of iPinYou (`ipinyou.contest.dataset.zip`) can be downloaded from [ Unzip it and get the folder `ipinyou.contest.dataset`. 
-To speed up the process of +To speed up decompression, the Makefile uses `pbzip2` (a parallel bzip2) instead of `bzip2`; install it before running `make all`. +``` +# for example on Ubuntu +sudo apt-get update +sudo apt-get install pbzip2 +``` ### Step 1 Update the soft link for the folder `ipinyou.contest.dataset` in `original-data`.