From fc4d5f3d09cf1055aaaabbc3db06907c473369ca Mon Sep 17 00:00:00 2001 From: camera-2018 <2907618001@qq.com> Date: Tue, 25 Apr 2023 20:03:35 +0800 Subject: [PATCH 1/2] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E5=87=A0=E5=A4=84?= =?UTF-8?q?=E9=93=BE=E6=8E=A5=E5=A5=97=E9=93=BE=E6=8E=A5=E7=9A=84=E9=94=99?= =?UTF-8?q?=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/ch02/ch2.1/ch2.1.3/PinSage.md | 2 +- docs/ch02/ch2.2/ch2.2.2/FM.md | 2 +- .../1.1.5 FM.md" | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/ch02/ch2.1/ch2.1.3/PinSage.md b/docs/ch02/ch2.1/ch2.1.3/PinSage.md index 73d827112..bd35612cf 100644 --- a/docs/ch02/ch2.1/ch2.1.3/PinSage.md +++ b/docs/ch02/ch2.1/ch2.1.3/PinSage.md @@ -128,7 +128,7 @@ PinSage在训练时采用的是 Margin Hinge Loss 损失函数,主要的思想 - easy 负样本:这里对于mini-batch内的所有pair(训练样本对)会共享500负样本,这500个样本从batch之外的所有节点中随机采样得到。这么做可以减少在每个mini-batch中因计算所有节点的embedding所需的时间,文中指出这和为每个item采样一定数量负样本无差异。 - hard 负样本:这里使用hard 负样本的原因是根据实际场景的问题出发,模型需要从20亿的物品item集合中识别出最相似的1000个,即模型需要从2百万 item 中识别出最相似的那一个 item。也就是说模型的区分能力不够细致,为了解决这个问题,加入了一些hard样本。对于hard 负样本,应该是与 q 相似 以及和 i 不相似的物品,具体地的生成方式是将图上的节点计算相对节点 q 的个性化PageRank分值,根据分值的排序随机从2000~5000的位置选取节点作为负样本。 -负样本的构建是召回模型的中关键的内容,在各家公司的工作都予以体现,具体的大家可以参考 Facebook 发表的[《Embedding-based Retrieval in Facebook Search》]([https://arxiv.org/pdf/2006.11632v1.pdf](https://links.jianshu.com/go?to=https%3A%2F%2Farxiv.org%2Fpdf%2F2006.11632v1.pdf)) +负样本的构建是召回模型的中关键的内容,在各家公司的工作都予以体现,具体的大家可以参考 Facebook 发表的[《Embedding-based Retrieval in Facebook Search》](https://arxiv.org/pdf/2006.11632v1.pdf) **渐进式训练(Curriculum training)** diff --git a/docs/ch02/ch2.2/ch2.2.2/FM.md b/docs/ch02/ch2.2/ch2.2.2/FM.md index 44a713ff3..1141af5eb 100644 --- a/docs/ch02/ch2.2/ch2.2.2/FM.md +++ b/docs/ch02/ch2.2/ch2.2.2/FM.md @@ -143,5 +143,5 @@ class FM(Layer): **参考资料** * [FM:推荐算法中的瑞士军刀](https://zhuanlan.zhihu.com/p/343174108) * [FM算法解析](https://zhuanlan.zhihu.com/p/37963267) -* 
[FM论文原文]([https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)) +* [FM论文原文](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) * [AI上推荐 之 FM和FFM](https://blog.csdn.net/wuzhongqiang/article/details/108719417) \ No newline at end of file diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.5 FM.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.5 FM.md" index 99abc85e2..34e20ad80 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.5 FM.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.5 FM.md" @@ -317,6 +317,6 @@ sparse_feats: 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11 * [FM算法解析](https://zhuanlan.zhihu.com/p/37963267) * [推荐系统遇上深度学习(一)--FM模型理论和实践](https://www.jianshu.com/p/152ae633fb00) -* [FM论文原文]([https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)) +* [FM论文原文](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) * [FM算法原理分析与实践](https://www.csuldw.com/2019/02/08/2019-02-08-fm-algorithm-theory/) * [AI上推荐 之 FM和FFM](https://blog.csdn.net/wuzhongqiang/article/details/108719417) \ No newline at end of file From 43151d0cdf930ce0cb2d04115cb2f6e00bd0d0c2 Mon Sep 17 00:00:00 2001 From: camera-2018 <2907618001@qq.com> Date: Tue, 25 Apr 2023 20:05:27 +0800 Subject: [PATCH 2/2] 
=?UTF-8?q?chore:=20ryluo-oss=20=E4=B8=8B=E9=93=BE?= =?UTF-8?q?=E6=8E=A5=E4=BD=BF=E7=94=A8https?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/README.md | 10 +- docs/ch01/ch1.1.md | 32 +- docs/ch01/ch1.2.md | 10 +- docs/ch01/ch1.3.md | 8 +- docs/ch02/ch2.1/ch2.1.1/Swing.md | 6 +- docs/ch02/ch2.1/ch2.1.1/itemcf.md | 10 +- docs/ch02/ch2.1/ch2.1.1/usercf.md | 6 +- docs/ch02/ch2.1/ch2.1.2/Airbnb.md | 22 +- docs/ch02/ch2.1/ch2.1.2/word2vec.md | 24 +- docs/ch02/ch2.1/ch2.1.3/EGES.md | 16 +- docs/ch02/ch2.1/ch2.1.5/TDM.md | 8 +- docs/ch02/ch2.2/ch2.2.2/DCN.md | 6 +- docs/ch02/ch2.2/ch2.2.2/PNN.md | 6 +- docs/ch02/ch2.2/ch2.2.3/AFM.md | 6 +- docs/ch02/ch2.2/ch2.2.3/DeepFM.md | 22 +- docs/ch02/ch2.2/ch2.2.3/NFM.md | 8 +- docs/ch02/ch2.2/ch2.2.3/WideNDeep.md | 6 +- docs/ch02/ch2.2/ch2.2.4/DIEN.md | 8 +- docs/ch02/ch2.2/ch2.2.4/DIN.md | 4 +- ...350\267\257\345\217\254\345\233\236.ipynb" | 4172 ++++----- ...345\236\213\350\236\215\345\220\210.ipynb" | 5304 +++++------ ...346\215\256\345\210\206\346\236\220.ipynb" | 7762 ++++++++--------- ...345\276\201\345\267\245\347\250\213.ipynb" | 3536 ++++---- ...30\347\220\206\350\247\243+Baseline.ipynb" | 1310 +-- docs/ch03/ch3.1/markdown/ch3.1.1.md | 2 +- docs/ch03/ch3.1/markdown/ch3.1.2.md | 84 +- docs/ch03/ch3.1/markdown/ch3.1.3.md | 4 +- docs/ch03/ch3.1/markdown/ch3.1.4.md | 4 +- docs/ch03/ch3.1/markdown/ch3.1.5.md | 4 +- docs/ch03/ch3.2/3.2.1.3.md | 8 +- docs/ch03/ch3.2/3.2.1.4.md | 6 +- docs/ch03/ch3.2/3.2.1.5.md | 16 +- docs/ch03/ch3.2/3.2.2.3.md | 2 +- docs/ch03/ch3.2/3.2.3.md | 2 +- docs/ch03/ch3.2/3.2.4.3.md | 8 +- docs/ch03/ch3.2/3.2.8.1.md | 22 +- docs/ch03/ch3.2/3.2.8.2.md | 8 +- docs/ch03/ch3.2/3.2.8.3.md | 14 +- docs/ch03/ch3.2/3.2.md | 4 +- ...73\350\276\221\345\233\236\345\275\222.md" | 4 +- ...36\347\273\217\347\275\221\347\273\234.md" | 34 +- ...30\345\214\226\347\256\227\346\263\225.md" | 6 +- ...55\345\273\272\345\237\272\347\241\200.md" | 10 +- .../1.0.6 
Word2vec.md" | 24 +- .../1.1.1 \346\246\202\350\277\260.md" | 8 +- ...220\214\350\277\207\346\273\244-UserCF.md" | 6 +- ...220\214\350\277\207\346\273\244-ItemCF.md" | 10 +- .../readme.md" | 4 +- .../1.2.1 NeuralCF.md" | 4 +- .../1.2.10 DIEN.md" | 8 +- .../1.2.2 DeepCrossing.md" | 8 +- .../1.2.3 PNN.md" | 6 +- .../1.2.4 Wide&Deep.md" | 6 +- .../1.2.5 DeepFM.md" | 22 +- .../1.2.6 NFM.md" | 8 +- .../1.2.7 DCN.md" | 6 +- .../1.2.8 AFM.md" | 6 +- .../1.2.9 DIN.md" | 4 +- ...30\347\220\206\350\247\243+Baseline.ipynb" | 1310 +-- ...346\215\256\345\210\206\346\236\220.ipynb" | 7762 ++++++++--------- ...350\267\257\345\217\254\345\233\236.ipynb" | 4172 ++++----- ...345\276\201\345\267\245\347\250\213.ipynb" | 3536 ++++---- ...345\236\213\350\236\215\345\220\210.ipynb" | 5304 +++++------ ...2\230\347\220\206\350\247\243+Baseline.md" | 2 +- ...60\346\215\256\345\210\206\346\236\220.md" | 84 +- ...32\350\267\257\345\217\254\345\233\236.md" | 4 +- ...71\345\276\201\345\267\245\347\250\213.md" | 4 +- ...41\345\236\213\350\236\215\345\220\210.md" | 4 +- .../2.2.1.3 Redis\345\237\272\347\241\200.md" | 8 +- ...54\345\217\226\345\256\236\346\210\230.md" | 6 +- ...51\346\226\231\347\224\273\345\203\217.md" | 16 +- ...16\347\253\257\344\272\244\344\272\222.md" | 2 +- ...13\347\232\204\346\236\204\345\273\272.md" | 2 +- .../2.2.5.1 DSSM\345\217\254\345\233\236.md" | 8 +- .../readme.md" | 2 +- readme.md | 10 +- 76 files changed, 22455 insertions(+), 22455 deletions(-) diff --git a/docs/README.md b/docs/README.md index 121c7fa59..61a4992eb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -18,7 +18,7 @@ 为了方便学习和交流,**我们建立了FunRec学习社区(微信群+知识星球)**,微信群方便大家平时日常交流和讨论,知识星球方便沉淀内容。由于我们的内容面向的人群主要是学生,所以**知识星球永久免费**,感兴趣的可以加入星球讨论(加入星球的同学先看置定的必读帖)!**FunRec学习社区内部会不定期分享(FunRec社区中爱分享的同学)技术总结、个人管理等内容,[跟技术相关的分享内容都放在了B站](https://space.bilibili.com/431850986/channel/collectiondetail?sid=339597)上面**。由于微信群的二维码只有7天内有效,所以直接加下面这个微信,备注:**Fun-Rec**,会被拉到Fun-Rec交流群,如果觉得微信群比较吵建议直接加知识星球!。
-image-20220408193745249 +image-20220408193745249
## 内容导航 @@ -134,15 +134,15 @@ [2.1 竞赛实践(天池入门赛-新闻推荐)](https://tianchi.aliyun.com/competition/entrance/531842/forum)
- image-20211213165802957 - image-20211213165847593 + image-20211213165802957 + image-20211213165847593
**2.2 新闻推荐系统实践前端展示和后端逻辑(项目没有任何商用价值仅供入门者学习)**
- image-20211205142026937 - Fun-Rec新闻推荐系统 + image-20211205142026937 + Fun-Rec新闻推荐系统
diff --git a/docs/ch01/ch1.1.md b/docs/ch01/ch1.1.md index e0d956542..6e198a0d0 100644 --- a/docs/ch01/ch1.1.md +++ b/docs/ch01/ch1.1.md @@ -46,45 +46,45 @@ - **电商首页推荐(淘宝、京东、拼多多)**
-image-20220421190313917 -image-20220421191138469 -image-20220421191441104 +image-20220421190313917 +image-20220421191138469 +image-20220421191441104
- **视频推荐(抖音、快手、B站、爱奇艺)**
-image-20220421190629410 -image-20220421191849577 -image-20220421192047973 -image-20220421192209412 +image-20220421190629410 +image-20220421191849577 +image-20220421192047973 +image-20220421192209412
- **饮食推荐(美团、饿了么、叮咚买菜)**
-image-20220421192623380 -image-20220421192717773 -image-20220421192749794 +image-20220421192623380 +image-20220421192717773 +image-20220421192749794
- **音乐电台(网易云音乐、QQ音乐、喜马拉雅)**
-image-20220421193139183 -image-20220421193447933 -image-20220421193325921 +image-20220421193139183 +image-20220421193447933 +image-20220421193325921
- **资讯、阅读(头条、知乎、豆瓣)**
-image-20220421193856262 -image-20220421193923283 -image-20220421194244083 +image-20220421193856262 +image-20220421193923283 +image-20220421194244083
diff --git a/docs/ch01/ch1.2.md b/docs/ch01/ch1.2.md index 228e470e3..e4dbf2404 100644 --- a/docs/ch01/ch1.2.md +++ b/docs/ch01/ch1.2.md @@ -30,12 +30,12 @@
-在这里插入图片描述 +在这里插入图片描述
上面是网飞的原图,我搬运了更加容易理解的线条梳理后的结构:
-在这里插入图片描述 +在这里插入图片描述
整个数据部分其实是一整个链路,主要是三块,分别是客户端及服务器实时数据处理、流处理平台准实时数据处理和大数据平台离线数据处理这三个部分。 @@ -62,7 +62,7 @@ 这里我们可以看出离线层的任务是最接近学校中我们处理数据、训练模型这种任务的,不同可能就是需要面临更大规模的数据。离线任务一般会按照天或者更久运行,比如每天晚上定期更新这一天的数据,然后重新训练模型,第二天上线新模型。
-在这里插入图片描述 +在这里插入图片描述
#### 离线层优势和不足 @@ -83,7 +83,7 @@ 近线层的发展得益于最近几年大数据技术的发展,很多流处理框架的提出大大促进了近线层的进步。如今Flink、Storm等工具一统天下。
-在这里插入图片描述 +在这里插入图片描述
### 在线层 @@ -106,7 +106,7 @@ 所以一个通用的算法架构,设计思想就是对数据层层建模,层层筛选,帮助用户从海量数据中找出其真正感兴趣的部分。
-在这里插入图片描述 +在这里插入图片描述
- 召回 diff --git a/docs/ch01/ch1.3.md b/docs/ch01/ch1.3.md index a37d0e35b..a8cda17a6 100644 --- a/docs/ch01/ch1.3.md +++ b/docs/ch01/ch1.3.md @@ -7,7 +7,7 @@ 首先我们从推荐系统架构出发,一种分法是将整个推荐系统架构分为召回、粗排、精排、重排、混排等模块。它的分解方法是从一份数据如何从生产出来,到线上服务完整顺序的一个流程。因为在不同环节,我们一般会考虑不同的算法,所以这种角度出发我们来研究推荐系统主流的算法技术栈。
-在这里插入图片描述 +在这里插入图片描述
为了帮助新手在后文方便理解,首先简单介绍这些模块的功能主要是: @@ -22,7 +22,7 @@ 首先是推荐系统的物料库,这部分内容里,算法主要体现在如何绘制一个用户画像和商品画像。这个环节是推荐系统架构的基础设施,一般可能新用户/商品进来,或者每周定期会重新一次整个物料库,计算其中信息,为用户打上标签,计算统计信息,为商品做内容理解等内容。其中用户画像是大家比较容易理解的,比如用户年龄、爱好通常APP会通过注册界面收集这些信息。而商品画像形式就非常多了,比如淘宝主要推荐商品,抖音主要是短视频,所以大家的物料形式比较多,内容、质量差异也比较大,所以内容画像各家的做法也不同,当前比较主流的都会涉及到一个多模态信息内容理解。下面我贴了一个微信看一看的内容画像框架,然后我们来介绍下在这一块主要使用的算法技术。
-在这里插入图片描述 +在这里插入图片描述
一般推荐系统会加入多模态的一个内容理解。我们用短视频形式举个例子,假设用户拍摄了一条短视频,上传到了平台,从推荐角度看,首先我们有的信息是这条短视频的作者、长度、作者为它选择的标签、时间戳这些信息。但是这对于推荐来说是远远不够的,首先作者打上的标签不一定准确反映作品,原因可能是我们模型的语义空间可能和作者/现实世界不一致。其次我们需要更多维度的特征,比如有些用户喜欢看小姐姐跳舞,那我希望能够判断一条视频中是否有小姐姐,这就涉及到封面图的基于CV的内容抽取或者整个视频的抽取;再比如作品的标题一般能够反映主题信息,除了很多平台常用的用“#”加上一个标签以外,我们也希望能够通过标题抽取出基于NLP的信息。还有更多的维度可以考虑:封面图多维度的多媒体特征体系,包括人脸识别,人脸embedding,标签,一二级分类,视频embedding表示,水印,OCR识别,清晰度,低俗色情,敏感信息等多种维度。 @@ -58,7 +58,7 @@ 推荐系统的召回阶段可以理解为根据用户的历史行为数据,为用户在海量的信息中粗选一批待推荐的内容,挑选出一个小的候选集的过程。粗排用到的很多技术与召回重合,所以放在一起讲,粗排也不是必需的环节,它的功能对召回的结果进行个粗略的排序,在保证一定精准的前提下,进一步减少往后传送的物品数量,这就是粗排的作用。
-在这里插入图片描述 +在这里插入图片描述
召回模块面对几百上千万的推荐池物料规模,候选集十分庞大。由于后续有排序模块作为保障,故不需要十分准确,但必须保证不要遗漏和低延迟。目前主要通过多路召回来实现,一方面各路可以并行计算,另一方面取长补短。可以看到各类同类竞品的系统虽然细节上多少存在差异,但不约而同的采取了多路召回的架构,这类设计考虑如下几点问题: @@ -112,7 +112,7 @@ 排序模型是推荐系统中涵盖的研究方向最多,有非常多的子领域值得研究探索,这也是推荐系统中技术含量最高的部分,毕竟它是直接面对用户,产生的结果对用户影响最大的一层。目前精排层深度学习已经一统天下了,这是王喆老师《深度学习推荐算法》书中的精排层模型演化线路。具体来看分为DNN、Wide&Deep两大块,实际深入还有序列建模,以及没有提到的多任务建模都是工业界非常常用的,所以我们接下来具体谈论其中每一块的技术栈。
-在这里插入图片描述 +在这里插入图片描述
#### 特征交叉模型 diff --git a/docs/ch02/ch2.1/ch2.1.1/Swing.md b/docs/ch02/ch2.1/ch2.1.1/Swing.md index 9ca39f2a9..ac321df11 100644 --- a/docs/ch02/ch2.1/ch2.1.1/Swing.md +++ b/docs/ch02/ch2.1/ch2.1.1/Swing.md @@ -3,7 +3,7 @@ 大规模推荐系统需要实时对用户行为做出海量预测,为了保证这种实时性,大规模的推荐系统通常严重依赖于预先计算好的产品索引。产品索引的功能为:给定种子产品返回排序后的候选相关产品列表。
-在这里插入图片描述 +在这里插入图片描述
相关性产品索引主要包含两部分:替代性产品和互补性产品。例如图中的不同种类的衬衫构成了替代关系,而衬衫和风衣裤子等构成了互补关系。用户通常希望在完成购买行为之前尽可能看更多的衬衫,而用户购买过衬衫之后更希望看到与之搭配的单品而不是其他衬衫了。 @@ -23,7 +23,7 @@ Swing 通过利用 User-Item-User 路径中所包含的信息,考虑 User-Item - 什么是内部子结构? 以经典的啤酒尿布故事为例,张三同时购买了啤酒和尿布,这可能是一种巧合。但两个甚至多个顾客都同时购买了啤酒尿布,这就证明啤酒和尿布具有相关关系。这样共同购买啤酒和尿布的用户越多,啤酒和尿布的相关度就会越高。
- 在这里插入图片描述 + 在这里插入图片描述
图中的红色四边形就是一种Swing子结构,这种子结构可以作为给王五推荐尿布的依据。 @@ -256,7 +256,7 @@ Swing 通过利用 User-Item-User 路径中所包含的信息,考虑 User-Item 由于类别直接的种类差异,每个类别的相关类数量存在差异,因此采用最大相对落点来作为划分阈值。
- 在这里插入图片描述 + 在这里插入图片描述
例如图(a)中T恤的相关类选择前八个,图(b)中手机的相关类选择前三个。 diff --git a/docs/ch02/ch2.1/ch2.1.1/itemcf.md b/docs/ch02/ch2.1/ch2.1.1/itemcf.md index aa800f377..92c69d21c 100644 --- a/docs/ch02/ch2.1/ch2.1.1/itemcf.md +++ b/docs/ch02/ch2.1/ch2.1.1/itemcf.md @@ -9,13 +9,13 @@ 举例来说,如果用户 1 喜欢物品 A ,而物品 A 和 C 非常相似,则可以将物品 C 推荐给用户1。ItemCF算法并不利用物品的内容属性计算物品之间的相似度, 主要通过分析用户的行为记录计算物品之间的相似度, 该算法认为, 物品 A 和物品 C 具有很大的相似度是因为喜欢物品 A 的用户极可能喜欢物品 C。 -![图片](http://ryluo.oss-cn-chengdu.aliyuncs.com/JavagdvaYX0HSW4PdssV.png!thumbnail) +![图片](https://ryluo.oss-cn-chengdu.aliyuncs.com/JavagdvaYX0HSW4PdssV.png!thumbnail) ## 计算过程 基于物品的协同过滤算法和基于用户的协同过滤算法很像, 所以我们这里直接还是拿上面 Alice 的那个例子来看。 -![图片](http://ryluo.oss-cn-chengdu.aliyuncs.com/JavaE306yXB4mGmjIxbn.png!thumbnail) +![图片](https://ryluo.oss-cn-chengdu.aliyuncs.com/JavaE306yXB4mGmjIxbn.png!thumbnail) 如果想知道 Alice 对物品5打多少分, 基于物品的协同过滤算法会这么做: @@ -41,7 +41,7 @@ 2. 基于 `sklearn` 计算物品之间的皮尔逊相关系数: -图片 +图片 3. 根据皮尔逊相关系数, 可以找到与物品5最相似的2个物品是 item1 和 item4, 下面基于上面的公式计算最终得分: @@ -196,7 +196,7 @@ $$ 比如下面这个例子: -![图片](http://ryluo.oss-cn-chengdu.aliyuncs.com/JavaxxhHm3BAtMfsy2AV.png!thumbnail) +![图片](https://ryluo.oss-cn-chengdu.aliyuncs.com/JavaxxhHm3BAtMfsy2AV.png!thumbnail) + 左边矩阵中,$A, B, C, D$ 表示的是物品。 + 可以看出,$D $ 是一件热门物品,其与 $A、B、C$ 的相似度比较大。因此,推荐系统更可能将 $D$ 推荐给用过 $A、B、C$ 的用户。 @@ -242,7 +242,7 @@ $$ > > 举例来说明,如下图(`X,Y,Z` 表示物品,`d,e,f`表示用户): > -> ![图片](http://ryluo.oss-cn-chengdu.aliyuncs.com/JavaWKvITKBhYOkfXrzs.png!thumbnail) +> ![图片](https://ryluo.oss-cn-chengdu.aliyuncs.com/JavaWKvITKBhYOkfXrzs.png!thumbnail) > > + 如果使用余弦相似度进行计算,用户 d 和 e 之间较为相似。但是实际上,用户 d 和 f 之间应该更加相似。只不过由于 d 倾向于打高分,e 倾向于打低分导致二者之间的余弦相似度更高。 > + 这种情况下,可以考虑使用皮尔逊相关系数计算用户之间的相似性关系。 diff --git a/docs/ch02/ch2.1/ch2.1.1/usercf.md b/docs/ch02/ch2.1/ch2.1.1/usercf.md index 2a34353e3..72a3c8617 100644 --- a/docs/ch02/ch2.1/ch2.1.1/usercf.md +++ b/docs/ch02/ch2.1/ch2.1.1/usercf.md @@ -99,13 +99,13 @@ + 例如,我们要对用户 $A$ 进行物品推荐,可以先找到和他有相似兴趣的其他用户。 + 然后,将共同兴趣用户喜欢的,但用户 $A$ 未交互过的物品推荐给 $A$。 -image-20210629232540289 
+image-20210629232540289 ## 计算过程 以下图为例,给用户推荐物品的过程可以形象化为一个猜测用户对物品进行打分的任务,表格里面是5个用户对于5件物品的一个打分情况,就可以理解为用户对物品的喜欢程度。 -![image-20210629232622758](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20210629232622758.png) +![image-20210629232622758](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20210629232622758.png) UserCF算法的两个步骤: @@ -164,7 +164,7 @@ UserCF算法的两个步骤: + 基于 sklearn 计算所有用户之间的皮尔逊相关系数。可以看出,与 Alice 相似度最高的用户为用户1和用户2。 - 图片 + 图片 2. **根据相似度用户计算 Alice对物品5的最终得分** 用户1对物品5的评分是3, 用户2对物品5的打分是5, 那么根据上面的计算公式, 可以计算出 Alice 对物品5的最终得分是 diff --git a/docs/ch02/ch2.1/ch2.1.2/Airbnb.md b/docs/ch02/ch2.1/ch2.1.2/Airbnb.md index 65ce7bc37..efa24a285 100644 --- a/docs/ch02/ch2.1/ch2.1.2/Airbnb.md +++ b/docs/ch02/ch2.1/ch2.1.2/Airbnb.md @@ -16,11 +16,11 @@ - 当用户在查看某一个房源时,接下来的有两种方式继续搜索: - 返回搜索结果页,继续查看其他搜索结果。 -img +img - 在当前房源的详情页下,「相似房源」板块(你可能还喜欢)所推荐的房源。 -img +img - Airbnb 平台 99% 的房源预订来自于搜索排序和相似房源推荐。 # Embedding 方法 @@ -59,7 +59,7 @@ $$ - Airbnb 将最终预定的房源,始终作为滑窗的上下文,即全局上下文。如下图: - 如图,对于当前滑动窗口的 central listing,实线箭头表示context listings,虚线(指向booked listing)表示 global context listing。 -img +img - booked listing 作为全局正样本,故优化的目标函数更新为: @@ -103,12 +103,12 @@ $$ - 理论上,同一区域的房源相似性应该更高,不同区域房源相似性更低。 - Airbnb 利用 k-means 聚类,将加利福尼亚州的房源聚成100个集群,来验证类似位置的房源是否聚集在一起。 -img +img - 评估不同类型、价格区间的房源之间的相似性。 - 简而言之,我们希望类型相同、价格区间一致的房源它们之间的相似度更高。 -img +img - 评估房源的隐式特征 - Airbnb 在训练房源(listing)的 Embedding时,并没有用到房源的图像信息。 @@ -117,7 +117,7 @@ $$ - 大致原理就是,利用训练好的 Embedding 进行 K 近邻相似度检索。 - 如下,与查询房源在 Embedding 相似性高的其他房源,它们之间的外观风格也很相似。 -img +img ## User-type & Listing-type Embedding @@ -169,7 +169,7 @@ Airbnb 除了挖掘 Listing 的短期兴趣特征表示外,还对 User 和 Lis - 所有的属性,都基于一定的规则进行了分桶(buckets)。例如21岁,被分桶到 20-30 岁的区间。 - 对于首次预定的用户,他的属性为 buckets 的前5行,因为预定之前没有历史预定相关的信息。 -img +img 看到过前面那个简单的例子后,现在可以看一个原文的 Listing-type 的例子: @@ -233,7 +233,7 @@ Type Embedding 的学习同样是基于 Skip-Gram 模型,但是有两点需要 \end{aligned} $$ -img +img # 实验部分 @@ -276,13 +276,13 @@ Airbnb 的搜索排名的大致流程为: - 表中的 Embedding Features 包含了8种类型,前6种类型的特征计算方式相同。 -img +img **① 基于 Listing Embedding Features 
的特征构建** - Airbnb 保留了用户过去两周6种不同类型的历史行为,如下图: -img +img - 对于每个行为,还要将其按照 market (地域)进行划分。以 $ H_c $ 为例: @@ -312,7 +312,7 @@ Airbnb 的搜索排名的大致流程为: 为了验证上述特征的构建是否有效,Airbnb 还做了特征重要性排序,如下表: -img +img **(3)模型** 特征构建完成后,开始对模型进行训练。 diff --git a/docs/ch02/ch2.1/ch2.1.2/word2vec.md b/docs/ch02/ch2.1/ch2.1.2/word2vec.md index 2ed19ec8b..2883242cf 100644 --- a/docs/ch02/ch2.1/ch2.1.2/word2vec.md +++ b/docs/ch02/ch2.1/ch2.1.2/word2vec.md @@ -56,7 +56,7 @@ one-hot向量的维度是词汇表的大小(如:500,000) 如果我们可以使用某种方法为每个单词构建一个合适的dense vector,如下图,那么通过点积等数学计算就可以获得单词之间的某种联系
-在这里插入图片描述 +在这里插入图片描述
# Word2vec @@ -71,7 +71,7 @@ one-hot向量的维度是词汇表的大小(如:500,000) 我们先引入上下文context的概念:当单词 w 出现在文本中时,其**上下文context**是出现在w附近的一组单词(在固定大小的窗口内),如下图
-在这里插入图片描述 +在这里插入图片描述
这些上下文单词context words决定了banking的意义 @@ -97,13 +97,13 @@ Word2vec包含两个模型,**Skip-gram与CBOW**。下面,我们先讲**Skip- 下图展示了以“into”为中心词,窗口大小为2的情况下它的上下文词。以及相对应的$P(o|c)$
-在这里插入图片描述 +在这里插入图片描述
我们滑动窗口,再以banking为中心词
- +
那么,如果我们在整个语料库上不断地滑动窗口,我们可以得到所有位置的$P(o|c)$,我们希望在所有位置上**最大化单词o在单词c周围出现了这一事实**,由极大似然法,可得: @@ -115,13 +115,13 @@ $$ 此式还可以依图3写为:
-在这里插入图片描述 +在这里插入图片描述
加log,加负号,缩放大小可得:
-在这里插入图片描述 +在这里插入图片描述
上式即为**skip-gram的损失函数**,最小化损失函数,就可以得到合适的词向量 @@ -141,7 +141,7 @@ $$ 又P(o|c)是一个概率,所以我们在整个语料库上使用**softmax**将点积的值映射到概率,如图6
-在这里插入图片描述 +在这里插入图片描述
注:注意到上图,中心词词向量为$v_{c}$,而上下文词词向量为$u_{o}$。也就是说每个词会对应两个词向量,**在词w做中心词时,使用$v_{w}$作为词向量,而在它做上下文词时,使用$u_{w}$作为词向量**。这样做的原因是为了求导等操作时计算上的简便。当整个模型训练完成后,我们既可以使用$v_{w}$作为词w的词向量,也可以使用$u_{w}$作为词w的词向量,亦或是将二者平均。在下一部分的模型结构中,我们将更清楚地看到两个词向量究竟在模型的哪个位置。 @@ -153,7 +153,7 @@ $$ ## Word2vec模型结构
-在这里插入图片描述 +在这里插入图片描述
如图八所示,这是一个输入为1 X V维的one-hot向量(V为整个词汇表的长度,这个向量只有一个1值,其余为0值表示一个词),单隐藏层(**隐藏层的维度为N,这里是一个超参数,这个参数由我们定义,也就是词向量的维度**),输出为1 X V维的softmax层的模型。 @@ -175,13 +175,13 @@ $W^{I}$为V X N的参数矩阵,$W^{O}$为N X V的参数矩阵。 如上文所述,Skip-gram为给定中心词,预测周围的词,即求P(o|c),如下图所示:
-在这里插入图片描述 +在这里插入图片描述
而CBOW为给定周围的词,预测中心词,即求P(c|o),如下图所示:
-在这里插入图片描述 +在这里插入图片描述
@@ -194,7 +194,7 @@ $W^{I}$为V X N的参数矩阵,$W^{O}$为N X V的参数矩阵。 我们再看一眼,通过softmax得到的$P(o|c)$,如图:
-在这里插入图片描述 +在这里插入图片描述
@@ -209,7 +209,7 @@ $W^{I}$为V X N的参数矩阵,$W^{O}$为N X V的参数矩阵。 我们首先给出负采样的损失函数:
-在这里插入图片描述 +在这里插入图片描述
diff --git a/docs/ch02/ch2.1/ch2.1.3/EGES.md b/docs/ch02/ch2.1/ch2.1.3/EGES.md index 366da3477..3d1479c23 100644 --- a/docs/ch02/ch2.1/ch2.1.3/EGES.md +++ b/docs/ch02/ch2.1/ch2.1.3/EGES.md @@ -33,7 +33,7 @@ 在介绍三个模型之前,我们首先需要构建好item-item图。由于基于CF的方法仅考虑物品之间的共现,忽略了行为的序列信息(即序列中相邻的物品之间的语义信息),因此item-item图的构建方式如下图所示。
- +
首先根据用户的session行为序列构建网络结构,即序列中相邻两个item之间在存在边,并且是有向带权图。物品图边上的权重为所有用户行为序列中两个 item 共现的次数,最终构造出来简单的有向有权图。 @@ -53,7 +53,7 @@ 对于图嵌入模型,第一步先进行随机游走得到物品序列;第二部通过skip-gram为图上节点生成embedding。那么对于随机游走的思想:如何利用随机游走在图中生成的序列?不同于DeepWalk中的随机游走,本文的采样策略使用的是带权游走策略,不同权重的游走到的概率不同,(其本质上就是node2vec),传统的node2vec方法可以直接支持有向带权图。因此在给定图的邻接矩阵M后(表示节点之间的边权重),随机游走中每次转移的概率为:
- +
其中$M_{ij}$为边$e_{ij}$上的权重,$N_{+}(v_i)$表示节点$v_i$所有邻居节点集合,并且随机游走的转移概率的对每个节点所有邻接边权重的归一化结果。在随即游走之后,每个item得到一个序列,如下图所示: @@ -65,19 +65,19 @@ 然后类似于word2vec,为每个item学习embedding,于是优化目标如下:
- +
其中,w 为窗口大小。考虑独立性假设的话,上面的式子可以进一步化简:
- +
这样看起来就很直观了,在已知物品 i 时,最大化序列中(上下文)其他物品 j 的条件概率。为了近似计算,采样了Negative sampling,上面的优化目标可以化简得到如下式子:
- +
其中$N(v_i)'$表示负样本集合,负采样个数越多,结果越好。 @@ -101,7 +101,7 @@ 针对上述问题,作者提出了weight pooling方法来聚合不同类型的 side information。具体地,EGES 与 GES 的区别在聚合不同类型 side information计算不同的权重,根据权重聚合 side information 得到商品的embedding,如下图所示:
- +
其中 $a_i$ 表示每个side information 用于计算权重的参数向量,最终通过下面的公式得到商品的embedding: @@ -117,13 +117,13 @@ 以上就是这三个模型主要的区别,下面是EGES的伪代码。
- +
其中**WeightedSkipGram**函数为带权重的SkipGram算法。
- +
diff --git a/docs/ch02/ch2.1/ch2.1.5/TDM.md b/docs/ch02/ch2.1/ch2.1.5/TDM.md index 209928f3b..51fffdd14 100644 --- a/docs/ch02/ch2.1/ch2.1.5/TDM.md +++ b/docs/ch02/ch2.1/ch2.1.5/TDM.md @@ -9,7 +9,7 @@ **树结构**
-image-20210308142624189 +image-20210308142624189
如上图,树中的每一个叶子节点对应一个商品item,非叶子结点表示的是item的集合**(这里的树不限于二叉树)**。这种层次化结构体现了粒度从粗到细的item架构。 @@ -17,7 +17,7 @@ **整体结构**
-image-20210308142624189 +image-20210308142624189
# 算法详解 @@ -34,7 +34,7 @@ 2. 对兴趣进行建模
- image-20210308142624189 + image-20210308142624189
​如上图,用户对叶子层item6感兴趣,可以认为它的兴趣是1,同层别的候选节点的兴趣为0,顺着着绿色线路上去的节点都标记为1,路线上的同层别的候选节点都标记为0。这样的操作就可以根据1和0构建用于每一层的正负样本。 @@ -44,7 +44,7 @@ 3. 训练过程
- image-20210308142624189 + image-20210308142624189
整体联合训练的方式如下: diff --git a/docs/ch02/ch2.2/ch2.2.2/DCN.md b/docs/ch02/ch2.2/ch2.2.2/DCN.md index 41c2e97d3..f9721752e 100644 --- a/docs/ch02/ch2.2/ch2.2.2/DCN.md +++ b/docs/ch02/ch2.2/ch2.2.2/DCN.md @@ -6,7 +6,7 @@ Wide&Deep模型的提出不仅综合了“记忆能力”和“泛化能力” 这个模型的结构是这个样子的:
- +
这个模型的结构也是比较简洁的, 从下到上依次为:Embedding和Stacking层, Cross网络层与Deep网络层并列, 以及最后的输出层。下面也是一一为大家剖析。 @@ -34,7 +34,7 @@ $$ $$ 可以看到, 交叉层的二阶部分非常类似PNN提到的外积操作, 在此基础上增加了外积操作的权重向量$w_l$, 以及原输入向量$x_l$和偏置向量$b_l$。 交叉层的可视化如下: -
+
可以看到, 每一层增加了一个$n$维的权重向量$w_l$(n表示输入向量维度), 并且在每一层均保留了输入向量, 因此输入和输出之间的变化不会特别明显。关于这一层, 原论文里面有个具体的证明推导Cross Network为啥有效, 不过比较复杂,这里我拿一个式子简单的解释下上面这个公式的伟大之处: @@ -139,7 +139,7 @@ def DCN(linear_feature_columns, dnn_feature_columns): 下面是一个通过keras画的模型结构图,为了更好的显示,类别特征都只是选择了一小部分,画图的代码也在github中。 -
image-20210308143101261 +
image-20210308143101261
## 思考 diff --git a/docs/ch02/ch2.2/ch2.2.2/PNN.md b/docs/ch02/ch2.2/ch2.2.2/PNN.md index 31feda2b0..393f3fea9 100644 --- a/docs/ch02/ch2.2/ch2.2.2/PNN.md +++ b/docs/ch02/ch2.2/ch2.2.2/PNN.md @@ -10,13 +10,13 @@ PNN模型其实是对IPNN和OPNN的总称,两者分别对应的是不同的Pro PNN模型的整体架构如下图所示: -
image-20210308142624189
+
image-20210308142624189
一共分为五层,其中除了Product Layer别的layer都是比较常规的处理方法,均可以从前面的章节进一步了解。模型中最重要的部分就是通过Product层对embedding特征进行交叉组合,也就是上图中红框所显示的部分。 Product层主要有线性部分和非线性部分组成,分别用$l_z$和$l_p$来表示, -
image-20210308143101261 +
image-20210308143101261
1. 线性模块,一阶特征(未经过显示特征交叉处理),对应论文中的$l_z=(l_z^1,l_z^2, ..., l_z^{D_1})$ @@ -230,7 +230,7 @@ class ProductLayer(Layer): 下面是一个通过keras画的模型结构图,为了更好的显示,类别特征都只是选择了一小部分,画图的代码也在github中。 -
image-20210308143101261 +
image-20210308143101261
## 思考题 diff --git a/docs/ch02/ch2.2/ch2.2.3/AFM.md b/docs/ch02/ch2.2/ch2.2.3/AFM.md index 82164853d..1de9803ac 100644 --- a/docs/ch02/ch2.2/ch2.2.3/AFM.md +++ b/docs/ch02/ch2.2/ch2.2.3/AFM.md @@ -9,7 +9,7 @@ $$ ## AFM模型原理
-image-20210131092744905 +image-20210131092744905
上图表示的就是AFM交叉特征部分的模型结构(非交叉部分与FM是一样的,图中并没有给出)。AFM最核心的两个点分别是Pair-wise Interaction Layer和Attention-based Pooling。前者将输入的非零特征的隐向量两两计算element-wise product(哈达玛积,两个向量对应元素相乘,得到的还是一个向量),假如输入的特征中的非零向量的数量为m,那么经过Pair-wise Interaction Layer之后输出的就是$\frac{m(m-1)}{2}$个向量,再将前面得到的交叉特征向量组输入到Attention-based Pooling,该pooling层会先计算出每个特征组合的自适应权重(通过Attention Net进行计算),通过加权求和的方式将向量组压缩成一个向量,由于最终需要输出的是一个数值,所以还需要将前一步得到的向量通过另外一个向量将其映射成一个值,得到最终的基于注意力加权的二阶交叉特征的输出。(对于这部分如果不是很清楚,可以先看下面对两个核心层的介绍) @@ -109,13 +109,13 @@ def AFM(linear_feature_columns, dnn_feature_columns): 关于每一块的细节,这里就不解释了,在我们给出的GitHub代码中,我们已经加了非常详细的注释,大家看那个应该很容易看明白, 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。
-image-20210307200304199 +image-20210307200304199
下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。
-image-20210307200304199 +image-20210307200304199
## 思考 diff --git a/docs/ch02/ch2.2/ch2.2.3/DeepFM.md b/docs/ch02/ch2.2/ch2.2.3/DeepFM.md index 93d532fab..c03efaf7a 100644 --- a/docs/ch02/ch2.2/ch2.2.3/DeepFM.md +++ b/docs/ch02/ch2.2/ch2.2.3/DeepFM.md @@ -7,17 +7,17 @@ - **DNN局限** 当我们使用DNN网络解决推荐问题的时候存在网络参数过于庞大的问题,这是因为在进行特征处理的时候我们需要使用one-hot编码来处理离散特征,这会导致输入的维度猛增。这里借用AI大会的一张图片:
- +
这样庞大的参数量也是不实际的。为了解决DNN参数量过大的局限性,可以采用非常经典的Field思想,将OneHot特征转换为Dense Vector
- +
此时通过增加全连接层就可以实现高阶的特征组合,如下图所示:
- +
但是仍然缺少低阶的特征组合,于是增加FM来表示低阶的特征组合。 @@ -25,7 +25,7 @@ 结合FM和DNN其实有两种方式,可以并行结合也可以串行结合。这两种方式各有几种代表模型。在DeepFM之前有FNN,虽然在影响力上可能并不如DeepFM,但是了解FNN的思想对我们理解DeepFM的特点和优点是很有帮助的。
- +
FNN是使用预训练好的FM模块,得到隐向量,然后把隐向量作为DNN的输入,但是经过实验进一步发现,在Embedding layer和hidden layer1之间增加一个product层(如上图所示)可以提高模型的表现,所以提出了PNN,使用product layer替换FM预训练层。 @@ -33,7 +33,7 @@ FNN是使用预训练好的FM模块,得到隐向量,然后把隐向量作为 - **Wide&Deep** FNN和PNN模型仍然有一个比较明显的尚未解决的缺点:对于低阶组合特征学习到的比较少,这一点主要是由于FM和DNN的串行方式导致的,也就是虽然FM学到了低阶特征组合,但是DNN的全连接结构导致低阶特征并不能在DNN的输出端较好的表现。看来我们已经找到问题了,将串行方式改进为并行方式能比较好的解决这个问题。于是Google提出了Wide&Deep模型(将前几章),但是如果深入探究Wide&Deep的构成方式,虽然将整个模型的结构调整为了并行结构,在实际的使用中Wide Module中的部分需要较为精巧的特征工程,换句话说人工处理对于模型的效果具有比较大的影响(这一点可以在Wide&Deep模型部分得到验证)。
-image-20200910214310877 +image-20200910214310877
如上图所示,该模型仍然存在问题:**在output Units阶段直接将低阶和高阶特征进行组合,很容易让模型最终偏向学习到低阶或者高阶的特征,而不能做到很好的结合。** @@ -41,7 +41,7 @@ FNN和PNN模型仍然有一个比较明显的尚未解决的缺点:对于低 ## 模型的结构与原理
-image-20210225180556628 +image-20210225180556628
前面的Field和Embedding处理是和前面的方法是相同的,如上图中的绿色部分;DeepFM将Wide部分替换为了FM layer如上图中的蓝色部分 @@ -58,12 +58,12 @@ $$ \hat{y}_{FM}(x) = w_0+\sum_{i=1}^N w_ix_i + \sum_{i=1}^N \sum_{j=i+1}^N v_i^T v_j x_ix_j $$
-image-20210225181340313 +image-20210225181340313
### Deep Deep架构图
-image-20210225181010107 +image-20210225181010107
Deep Module是为了学习高阶的特征组合,在上图中使用用全连接的方式将Dense Embedding输入到Hidden Layer,这里面Dense Embeddings就是为了解决DNN中的参数爆炸问题,这也是推荐模型中常用的处理方法。 @@ -130,13 +130,13 @@ def DeepFM(linear_feature_columns, dnn_feature_columns): 关于每一块的细节,这里就不解释了,在我们给出的GitHub代码中,我们已经加了非常详细的注释,大家看那个应该很容易看明白, 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。
-image-20210228161135777 +image-20210228161135777
下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。
-image-20210225180556628 +image-20210225180556628
## 思考 @@ -144,7 +144,7 @@ def DeepFM(linear_feature_columns, dnn_feature_columns): 2. 对于下图所示,根据你的理解Sparse Feature中的不同颜色节点分别表示什么意思
-image-20210225180556628 +image-20210225180556628
diff --git a/docs/ch02/ch2.2/ch2.2.3/NFM.md b/docs/ch02/ch2.2/ch2.2.3/NFM.md index 0ccd2caa3..e7945ff00 100644 --- a/docs/ch02/ch2.2/ch2.2.3/NFM.md +++ b/docs/ch02/ch2.2/ch2.2.3/NFM.md @@ -10,11 +10,11 @@ $$ 我们对比FM, 就会发现变化的是第三项,前两项还是原来的, 因为我们说FM的一个问题,就是只能到二阶交叉, 且是线性模型, 这是他本身的一个局限性, 而如果想突破这个局限性, 就需要从他的公式本身下点功夫, 于是乎,作者在这里改进的思路就是**用一个表达能力更强的函数来替代原FM中二阶隐向量内积的部分**。
- +
而这个表达能力更强的函数呢, 我们很容易就可以想到神经网络来充当,因为神经网络理论上可以拟合任何复杂能力的函数, 所以作者真的就把这个$f(x)$换成了一个神经网络,当然不是一个简单的DNN, 而是依然底层考虑了交叉,然后高层使用的DNN网络, 这个也就是我们最终的NFM网络了:
- +
这个结构,如果前面看过了PNN的伙伴会发现,这个结构和PNN非常像,只不过那里是一个product_layer, 而这里换成了Bi-Interaction Pooling了, 这个也是NFM的核心结构了。这里注意, 这个结构中,忽略了一阶部分,只可视化出来了$f(x)$, 我们还是下面从底层一点点的对这个网络进行剖析。 @@ -130,11 +130,11 @@ def NFM(linear_feature_columns, dnn_feature_columns): 有了上面的解释,这个模型的宏观层面相信就很容易理解了。关于这每一块的细节,这里就不解释了,在我们给出的GitHub代码中,我们已经加了非常详细的注释,大家看那个应该很容易看明白, 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播。(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。
-NFM_aaaa +NFM_aaaa
下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。
-NFM_aaaa +NFM_aaaa
## 思考题 diff --git a/docs/ch02/ch2.2/ch2.2.3/WideNDeep.md b/docs/ch02/ch2.2/ch2.2.3/WideNDeep.md index a42f0872f..504ef9cb4 100644 --- a/docs/ch02/ch2.2/ch2.2.3/WideNDeep.md +++ b/docs/ch02/ch2.2/ch2.2.3/WideNDeep.md @@ -12,7 +12,7 @@ Wide&Deep模型就是围绕记忆性和泛化性进行讨论的,模型能够 ## 模型结构及原理
-image-20200910214310877 +image-20200910214310877
其实wide&deep模型本身的结构是非常简单的,对于有点机器学习基础和深度学习基础的人来说都非常的容易看懂,但是如何根据自己的场景去选择那些特征放在Wide部分,哪些特征放在Deep部分就需要理解这篇论文提出者当时对于设计该模型不同结构时的意图了,所以这也是用好这个模型的一个前提。 @@ -88,13 +88,13 @@ def WideNDeep(linear_feature_columns, dnn_feature_columns): 关于每一块的细节,这里就不解释了,在我们给出的GitHub代码中,我们已经加了非常详细的注释,大家看那个应该很容易看明白, 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播。(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。
-image-20210228160557072 +image-20210228160557072
下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。
-image-20210228160557072 +image-20210228160557072
## 思考 diff --git a/docs/ch02/ch2.2/ch2.2.4/DIEN.md b/docs/ch02/ch2.2/ch2.2.4/DIEN.md index 37a21e713..9be1b388d 100644 --- a/docs/ch02/ch2.2/ch2.2.4/DIEN.md +++ b/docs/ch02/ch2.2/ch2.2.4/DIEN.md @@ -6,7 +6,7 @@ DIN模型考虑了用户兴趣,并且强调用户兴趣是多样的,该模 ## DIEN模型原理
-image-20210218155901144 +image-20210218155901144
模型的输入可以分成两大部分,一部分是用户的行为序列(这部分会通过兴趣提取层及兴趣演化层转换成与用户当前兴趣相关的embedding),另一部分就是除了用户行为以外的其他所有特征,如Target id, Coontext Feature, UserProfile Feature,这些特征都转化成embedding的类型然后concat在一起(形成一个大的embedding)作为非行为相关的特征(这里可能也会存在一些非id类特征,应该可以直接进行concat)。最后DNN输入的部分由行为序列embedding和非行为特征embedding(多个特征concat到一起之后形成的一个大的向量)组成,将两者concat之后输入到DNN中。 @@ -23,13 +23,13 @@ DIN模型考虑了用户兴趣,并且强调用户兴趣是多样的,该模 首先需要明确的就是辅助损失是计算哪两个量的损失。计算的是用户每个时刻的兴趣表示(GRU每个时刻输出的隐藏状态形成的序列)与用户当前时刻实际点击的物品表示(输入的embedding序列)之间的损失,相当于是行为序列中的第t+1个物品与用户第t时刻的兴趣表示之间的损失**(为什么这里用户第t时刻的兴趣与第t+1时刻的真实点击做损失呢?我的理解是,只有知道了用户第t+1真实点击的商品,才能更好的确定用户第t时刻的兴趣)。**
-image-20210218163742638 +image-20210218163742638
当然,如果只计算用户点击物品与其点击前一次的兴趣之间的损失,只能认为是正样本之间的损失,那么用户第t时刻的兴趣其实还有很多其他的未点击的商品,这些未点击的商品就是负样本,负样本一般通过从用户点击序列中采样得到,这样一来辅助损失中就包含了用户某个时刻下的兴趣及与该时刻兴趣相关的正负物品。所以最终的损失函数表示如下。
-image-20210218162447125 +image-20210218162447125
其中$h_t^i$表示的是用户$i$第$t$时刻的隐藏状态,可以表示用户第$t$时刻的兴趣向量,$e_b^i,\hat{e_b^i}$分别表示的是正负样本,$e_b^i[t+1]$表示的是用户$i$第$t+1$时刻点击的物品向量。 @@ -56,7 +56,7 @@ $$ 由于用户的兴趣是多样的,但是用户的每一种兴趣都有自己的发展过程,即使兴趣发生漂移我们可以只考虑用户与target item(广告或者商品)相关的兴趣演化过程,这样就不用考虑用户多样化的兴趣的问题了,而如何只获取与target item相关的信息,作者使用了与DIN模型中提取与target item相同的方法,来计算用户历史兴趣与target item之间的相似度,即这里也使用了DIN中介绍的局部激活单元(就是下图中的Attention模块)。
-image-20210218180755462 +image-20210218180755462
当得到了用户历史兴趣序列及兴趣序列与target item之间的相关性(注意力分数)之后,就需要再次对注意力序列进行建模得到用户注意力的演化过程,进一步表示用户最终的兴趣向量。此时的序列数据等同于有了一个序列及序列中每个向量的注意力权重,下面就是考虑如何使用这个注意力权重来一起优化序列建模的结果了。作者提出了三种注意力结合的GRU模型快: diff --git a/docs/ch02/ch2.2/ch2.2.4/DIN.md b/docs/ch02/ch2.2/ch2.2.4/DIN.md index f99a9c5fa..dc104de6f 100644 --- a/docs/ch02/ch2.2/ch2.2.4/DIN.md +++ b/docs/ch02/ch2.2/ch2.2.4/DIN.md @@ -158,13 +158,13 @@ def DIN(feature_columns, behavior_feature_list, behavior_seq_feature_list): 关于每一块的细节,这里就不解释了,在我们给出的GitHub代码中,我们已经加了非常详细的注释,大家看那个应该很容易看明白, 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播。(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。
-DIN_aaaa +DIN_aaaa
下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。
-DIN_aaaa +DIN_aaaa
## 思考 diff --git "a/docs/ch03/ch3.1/jupyter/\345\244\232\350\267\257\345\217\254\345\233\236.ipynb" "b/docs/ch03/ch3.1/jupyter/\345\244\232\350\267\257\345\217\254\345\233\236.ipynb" index 3a4bccd4e..08bc05222 100644 --- "a/docs/ch03/ch3.1/jupyter/\345\244\232\350\267\257\345\217\254\345\233\236.ipynb" +++ "b/docs/ch03/ch3.1/jupyter/\345\244\232\350\267\257\345\217\254\345\233\236.ipynb" @@ -1,2107 +1,2107 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 多路召回\n", - "\n", - "所谓的“多路召回”策略,就是指采用不同的策略、特征或简单模型,分别召回一部分候选集,然后把候选集混合在一起供后续排序模型使用,可以明显的看出,“多路召回策略”是在“计算速度”和“召回率”之间进行权衡的结果。其中,各种简单策略保证候选集的快速召回,从不同角度设计的策略保证召回率接近理想的状态,不至于损伤排序效果。如下图是多路召回的一个示意图,在多路召回中,每个策略之间毫不相关,所以一般可以写并发多线程同时进行,这样可以更加高效。\n", - "\n", - "\"image-20201119132726873\"\n", - "\n", - "上图只是一个多路召回的例子,也就是说可以使用多种不同的策略来获取用户排序的候选商品集合,而具体使用哪些召回策略其实是与业务强相关的 ,针对不同的任务就会有对于该业务真实场景下需要考虑的召回规则。例如新闻推荐,召回规则可以是“热门新闻”、“作者召回”、“关键词召回”、“主题召回“、”协同过滤召回“等等。 \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 导包" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:29.834662Z", - "start_time": "2020-11-16T11:26:27.811511Z" - } - }, - "outputs": [], - "source": [ - "import pandas as pd \n", - "import numpy as np\n", - "from tqdm import tqdm \n", - "from collections import defaultdict \n", - "import os, math, warnings, math, pickle\n", - "from tqdm import tqdm\n", - "import faiss\n", - "import collections\n", - "import random\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "from sklearn.preprocessing import LabelEncoder\n", - "from datetime import datetime\n", - "from deepctr.feature_column import SparseFeat, VarLenSparseFeat\n", - "from sklearn.preprocessing import LabelEncoder\n", - "from tensorflow.python.keras import backend as K\n", - "from tensorflow.python.keras.models import Model\n", - "from tensorflow.python.keras.preprocessing.sequence import pad_sequences\n", - 
"\n", - "from deepmatch.models import *\n", - "from deepmatch.utils import sampledsoftmaxloss\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:31.831215Z", - "start_time": "2020-11-16T11:26:31.826939Z" - } - }, - "outputs": [], - "source": [ - "data_path = './data_raw/'\n", - "save_path = './temp_results/'\n", - "# 做召回评估的一个标志, 如果不进行评估就是直接使用全量数据进行召回\n", - "metric_recall = False" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取数据\n", - "在一般的rs比赛中读取数据部分主要分为三种模式, 不同的模式对应的不同的数据集:\n", - "1. debug模式: 这个的目的是帮助我们基于数据先搭建一个简易的baseline并跑通, 保证写的baseline代码没有什么问题。 由于推荐比赛的数据往往非常巨大, 如果一上来直接采用全部的数据进行分析,搭建baseline框架, 往往会带来时间和设备上的损耗, **所以这时候我们往往需要从海量数据的训练集中随机抽取一部分样本来进行调试(train_click_log_sample)**, 先跑通一个baseline。\n", - "2. 线下验证模式: 这个的目的是帮助我们在线下基于已有的训练集数据, 来选择好合适的模型和一些超参数。 **所以我们这一块只需要加载整个训练集(train_click_log)**, 然后把整个训练集再分成训练集和验证集。 训练集是模型的训练数据, 验证集部分帮助我们调整模型的参数和其他的一些超参数。\n", - "3. 
线上模式: 我们用debug模式搭建起一个推荐系统比赛的baseline, 用线下验证模式选择好了模型和一些超参数, 这一部分就是真正的对于给定的测试集进行预测, 提交到线上, **所以这一块使用的训练数据集是全量的数据集(train_click_log+test_click_log)**\n", - "\n", - "下面就分别对这三种不同的数据读取模式先建立不同的代导入函数, 方便后面针对不同的模式下导入数据。" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:34.476240Z", - "start_time": "2020-11-16T11:26:34.467352Z" - } - }, - "outputs": [], - "source": [ - "# debug模式: 从训练集中划出一部分数据来调试代码\n", - "def get_all_click_sample(data_path, sample_nums=10000):\n", - " \"\"\"\n", - " 训练集中采样一部分数据调试\n", - " data_path: 原数据的存储路径\n", - " sample_nums: 采样数目(这里由于机器的内存限制,可以采样用户做)\n", - " \"\"\"\n", - " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " all_user_ids = all_click.user_id.unique()\n", - "\n", - " sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False) \n", - " all_click = all_click[all_click['user_id'].isin(sample_user_ids)]\n", - " \n", - " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", - " return all_click\n", - "\n", - "# 读取点击数据,这里分成线上和线下,如果是为了获取线上提交结果应该讲测试集中的点击数据合并到总的数据中\n", - "# 如果是为了线下验证模型的有效性或者特征的有效性,可以只使用训练集\n", - "def get_all_click_df(data_path='./data_raw/', offline=True):\n", - " if offline:\n", - " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " else:\n", - " trn_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", - "\n", - " all_click = trn_click.append(tst_click)\n", - " \n", - " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", - " return all_click" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:35.168738Z", - "start_time": "2020-11-16T11:26:35.163210Z" - } - }, - "outputs": [], - "source": [ - "# 读取文章的基本属性\n", - "def get_item_info_df(data_path):\n", - " item_info_df = 
pd.read_csv(data_path + 'articles.csv')\n", - " \n", - " # 为了方便与训练集中的click_article_id拼接,需要把article_id修改成click_article_id\n", - " item_info_df = item_info_df.rename(columns={'article_id': 'click_article_id'})\n", - " \n", - " return item_info_df" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:36.152958Z", - "start_time": "2020-11-16T11:26:36.146324Z" - } - }, - "outputs": [], - "source": [ - "# 读取文章的Embedding数据\n", - "def get_item_emb_dict(data_path):\n", - " item_emb_df = pd.read_csv(data_path + 'articles_emb.csv')\n", - " \n", - " item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]\n", - " item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])\n", - " # 进行归一化\n", - " item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)\n", - "\n", - " item_emb_dict = dict(zip(item_emb_df['article_id'], item_emb_np))\n", - " pickle.dump(item_emb_dict, open(save_path + 'item_content_emb.pkl', 'wb'))\n", - " \n", - " return item_emb_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:37.333536Z", - "start_time": "2020-11-16T11:26:37.329545Z" - } - }, - "outputs": [], - "source": [ - "max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:42.163494Z", - "start_time": "2020-11-16T11:26:38.018094Z" - } - }, - "outputs": [], - "source": [ - "# 采样数据\n", - "# all_click_df = get_all_click_sample(data_path)\n", - "\n", - "# 全量训练集\n", - "all_click_df = get_all_click_df(offline=False)\n", - "\n", - "# 对时间戳进行归一化,用于在关联规则的时候计算权重\n", - "all_click_df['click_timestamp'] = all_click_df[['click_timestamp']].apply(max_min_scaler)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-11-16T11:26:44.343500Z", - "start_time": "2020-11-16T11:26:44.113891Z" - } - }, - "outputs": [], - "source": [ - "item_info_df = get_item_info_df(data_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:24.295343Z", - "start_time": "2020-11-16T11:26:44.398007Z" - } - }, - "outputs": [], - "source": [ - "item_emb_dict = get_item_emb_dict(data_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 工具函数" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 获取用户-文章-时间函数\n", - "这个在基于关联规则的用户协同过滤的时候会用到" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:33.791656Z", - "start_time": "2020-11-16T11:27:33.784305Z" - } - }, - "outputs": [], - "source": [ - "# 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", - "def get_user_item_time(click_df):\n", - " \n", - " click_df = click_df.sort_values('click_timestamp')\n", - " \n", - " def make_item_time_pair(df):\n", - " return list(zip(df['click_article_id'], df['click_timestamp']))\n", - " \n", - " user_item_time_df = click_df.groupby('user_id')['click_article_id', 'click_timestamp'].apply(lambda x: make_item_time_pair(x))\\\n", - " .reset_index().rename(columns={0: 'item_time_list'})\n", - " user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))\n", - " \n", - " return user_item_time_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 获取文章-用户-时间函数\n", - "这个在基于关联规则的文章协同过滤的时候会用到" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:38.327581Z", - "start_time": "2020-11-16T11:27:38.321059Z" - } - }, - "outputs": [], - "source": [ - "# 根据时间获取商品被点击的用户序列 {item1: [(user1, time1), (user2, time2)...]...}\n", - "# 这里的时间是用户点击当前商品的时间,好像没有直接的关系。\n", - "def 
get_item_user_time_dict(click_df):\n", - " def make_user_time_pair(df):\n", - " return list(zip(df['user_id'], df['click_timestamp']))\n", - " \n", - " click_df = click_df.sort_values('click_timestamp')\n", - " item_user_time_df = click_df.groupby('click_article_id')['user_id', 'click_timestamp'].apply(lambda x: make_user_time_pair(x))\\\n", - " .reset_index().rename(columns={0: 'user_time_list'})\n", - " \n", - " item_user_time_dict = dict(zip(item_user_time_df['click_article_id'], item_user_time_df['user_time_list']))\n", - " return item_user_time_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 获取历史和最后一次点击\n", - "这个在评估召回结果, 特征工程和制作标签转成监督学习测试集的时候回用到" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:50.894683Z", - "start_time": "2020-11-16T11:27:50.888002Z" - } - }, - "outputs": [], - "source": [ - "# 获取当前数据的历史点击和最后一次点击\n", - "def get_hist_and_last_click(all_click):\n", - " \n", - " all_click = all_click.sort_values(by=['user_id', 'click_timestamp'])\n", - " click_last_df = all_click.groupby('user_id').tail(1)\n", - "\n", - " # 如果用户只有一个点击,hist为空了,会导致训练的时候这个用户不可见,此时默认泄露一下\n", - " def hist_func(user_df):\n", - " if len(user_df) == 1:\n", - " return user_df\n", - " else:\n", - " return user_df[:-1]\n", - "\n", - " click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)\n", - "\n", - " return click_hist_df, click_last_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 获取文章属性特征" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:55.893810Z", - "start_time": "2020-11-16T11:27:55.887623Z" - } - }, - "outputs": [], - "source": [ - "# 获取文章id对应的基本属性,保存成字典的形式,方便后面召回阶段,冷启动阶段直接使用\n", - "def get_item_info_dict(item_info_df):\n", - " max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))\n", - " 
item_info_df['created_at_ts'] = item_info_df[['created_at_ts']].apply(max_min_scaler)\n", - " \n", - " item_type_dict = dict(zip(item_info_df['click_article_id'], item_info_df['category_id']))\n", - " item_words_dict = dict(zip(item_info_df['click_article_id'], item_info_df['words_count']))\n", - " item_created_time_dict = dict(zip(item_info_df['click_article_id'], item_info_df['created_at_ts']))\n", - " \n", - " return item_type_dict, item_words_dict, item_created_time_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T06:42:38.730939Z", - "start_time": "2020-11-13T06:42:38.728461Z" - } - }, - "source": [ - "### 获取用户历史点击的文章信息" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:59.650781Z", - "start_time": "2020-11-16T11:27:59.640572Z" - } - }, - "outputs": [], - "source": [ - "def get_user_hist_item_info_dict(all_click):\n", - " \n", - " # 获取user_id对应的用户历史点击文章类型的集合字典\n", - " user_hist_item_typs = all_click.groupby('user_id')['category_id'].agg(set).reset_index()\n", - " user_hist_item_typs_dict = dict(zip(user_hist_item_typs['user_id'], user_hist_item_typs['category_id']))\n", - " \n", - " # 获取user_id对应的用户点击文章的集合\n", - " user_hist_item_ids_dict = all_click.groupby('user_id')['click_article_id'].agg(set).reset_index()\n", - " user_hist_item_ids_dict = dict(zip(user_hist_item_ids_dict['user_id'], user_hist_item_ids_dict['click_article_id']))\n", - " \n", - " # 获取user_id对应的用户历史点击的文章的平均字数字典\n", - " user_hist_item_words = all_click.groupby('user_id')['words_count'].agg('mean').reset_index()\n", - " user_hist_item_words_dict = dict(zip(user_hist_item_words['user_id'], user_hist_item_words['words_count']))\n", - " \n", - " # 获取user_id对应的用户最后一次点击的文章的创建时间\n", - " all_click_ = all_click.sort_values('click_timestamp')\n", - " user_last_item_created_time = all_click_.groupby('user_id')['created_at_ts'].apply(lambda x: 
x.iloc[-1]).reset_index()\n", - " \n", - " max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))\n", - " user_last_item_created_time['created_at_ts'] = user_last_item_created_time[['created_at_ts']].apply(max_min_scaler)\n", - " \n", - " user_last_item_created_time_dict = dict(zip(user_last_item_created_time['user_id'], \\\n", - " user_last_item_created_time['created_at_ts']))\n", - " \n", - " return user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 获取点击次数最多的topk个文章" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:28:04.761105Z", - "start_time": "2020-11-16T11:28:04.756419Z" - } - }, - "outputs": [], - "source": [ - "# 获取近期点击最多的文章\n", - "def get_item_topk_click(click_df, k):\n", - " topk_click = click_df['click_article_id'].value_counts().index[:k]\n", - " return topk_click" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 定义多路召回字典" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:28:08.321506Z", - "start_time": "2020-11-16T11:28:07.623281Z" - } - }, - "outputs": [], - "source": [ - "# 获取文章的属性信息,保存成字典的形式方便查询\n", - "item_type_dict, item_words_dict, item_created_time_dict = get_item_info_dict(item_info_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:28:13.791569Z", - "start_time": "2020-11-16T11:28:13.786522Z" - } - }, - "outputs": [], - "source": [ - "# 定义一个多路召回的字典,将各路召回的结果都保存在这个字典当中\n", - "user_multi_recall_dict = {'itemcf_sim_itemcf_recall': {},\n", - " 'embedding_sim_item_recall': {},\n", - " 'youtubednn_recall': {},\n", - " 'youtubednn_usercf_recall': {}, \n", - " 'cold_start_recall': {}}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T05:41:12.710754Z", - "start_time": "2020-11-16T05:40:57.842614Z" - } - }, - "outputs": [], - "source": [ - "# 提取最后一次点击作为召回评估,如果不需要做召回评估直接使用全量的训练集进行召回(线下验证模型)\n", - "# 如果不是召回评估,直接使用全量数据进行召回,不用将最后一次提取出来\n", - "trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 召回效果评估函数\n", - "做完了召回有时候也需要对当前的召回方法或者参数进行调整以达到更好的召回效果,因为召回的结果决定了最终排序的上限,下面也会提供一个召回评估的方法" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T05:41:18.579118Z", - "start_time": "2020-11-16T05:41:18.571887Z" - } - }, - "outputs": [], - "source": [ - "# 依次评估召回的前10, 20, 30, 40, 50个文章中的击中率\n", - "def metrics_recall(user_recall_items_dict, trn_last_click_df, topk=5):\n", - " last_click_item_dict = dict(zip(trn_last_click_df['user_id'], trn_last_click_df['click_article_id']))\n", - " user_num = len(user_recall_items_dict)\n", - " \n", - " for k in range(10, topk+1, 10):\n", - " hit_num = 0\n", - " for user, item_list in user_recall_items_dict.items():\n", - " # 获取前k个召回的结果\n", - " tmp_recall_items = [x[0] for x in user_recall_items_dict[user][:k]]\n", - " if last_click_item_dict[user] in set(tmp_recall_items):\n", - " hit_num += 1\n", - " \n", - " hit_rate = round(hit_num * 1.0 / user_num, 5)\n", - " print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 计算相似性矩阵\n", - "\n", - "这一部分主要是通过协同过滤以及向量检索得到相似性矩阵,相似性矩阵主要分为user2user和item2item,下面依次获取基于itemcf的item2item的相似性矩阵," - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### itemcf i2i_sim\n", - "\n", - "借鉴KDD2020的去偏商品推荐,在计算item2item相似性矩阵时,使用关联规则,使得计算的文章的相似性还考虑到了:\n", - "1. 用户点击的时间权重\n", - "2. 用户点击的顺序权重\n", - "3. 
文章创建的时间权重" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:30:51.872262Z", - "start_time": "2020-11-16T11:30:51.860099Z" - } - }, - "outputs": [], - "source": [ - "def itemcf_sim(df, item_created_time_dict):\n", - " \"\"\"\n", - " 文章与文章之间的相似性矩阵计算\n", - " :param df: 数据表\n", - " :item_created_time_dict: 文章创建时间的字典\n", - " return : 文章与文章的相似性矩阵\n", - " \n", - " 思路: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习) + 关联规则\n", - " \"\"\"\n", - " \n", - " user_item_time_dict = get_user_item_time(df)\n", - " \n", - " # 计算物品相似度\n", - " i2i_sim = {}\n", - " item_cnt = defaultdict(int)\n", - " for user, item_time_list in tqdm(user_item_time_dict.items()):\n", - " # 在基于商品的协同过滤优化的时候可以考虑时间因素\n", - " for loc1, (i, i_click_time) in enumerate(item_time_list):\n", - " item_cnt[i] += 1\n", - " i2i_sim.setdefault(i, {})\n", - " for loc2, (j, j_click_time) in enumerate(item_time_list):\n", - " if(i == j):\n", - " continue\n", - " \n", - " # 考虑文章的正向顺序点击和反向顺序点击 \n", - " loc_alpha = 1.0 if loc2 > loc1 else 0.7\n", - " # 位置信息权重,其中的参数可以调节\n", - " loc_weight = loc_alpha * (0.9 ** (np.abs(loc2 - loc1) - 1))\n", - " # 点击时间权重,其中的参数可以调节\n", - " click_time_weight = np.exp(0.7 ** np.abs(i_click_time - j_click_time))\n", - " # 两篇文章创建时间的权重,其中的参数可以调节\n", - " created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))\n", - " i2i_sim[i].setdefault(j, 0)\n", - " # 考虑多种因素的权重计算最终的文章之间的相似度\n", - " i2i_sim[i][j] += loc_weight * click_time_weight * created_time_weight / math.log(len(item_time_list) + 1)\n", - " \n", - " i2i_sim_ = i2i_sim.copy()\n", - " for i, related_items in i2i_sim.items():\n", - " for j, wij in related_items.items():\n", - " i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])\n", - " \n", - " # 将得到的相似性矩阵保存到本地\n", - " pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))\n", - " \n", - " return i2i_sim_" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - 
"metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:47:09.937002Z", - "start_time": "2020-11-16T11:30:57.394334Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [14:20<00:00, 290.38it/s]\n" - ] - } - ], - "source": [ - "i2i_sim = itemcf_sim(all_click_df, item_created_time_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### usercf u2u_sim\n", - "\n", - "在计算用户之间的相似度的时候,也可以使用一些简单的关联规则,比如用户活跃度权重,这里将用户的点击次数作为用户活跃度的指标" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T09:11:14.951940Z", - "start_time": "2020-11-16T09:11:14.945654Z" - } - }, - "outputs": [], - "source": [ - "def get_user_activate_degree_dict(all_click_df):\n", - " all_click_df_ = all_click_df.groupby('user_id')['click_article_id'].count().reset_index()\n", - " \n", - " # 用户活跃度归一化\n", - " mm = MinMaxScaler()\n", - " all_click_df_['click_article_id'] = mm.fit_transform(all_click_df_[['click_article_id']])\n", - " user_activate_degree_dict = dict(zip(all_click_df_['user_id'], all_click_df_['click_article_id']))\n", - " \n", - " return user_activate_degree_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T09:11:19.879276Z", - "start_time": "2020-11-16T09:11:19.868808Z" - } - }, - "outputs": [], - "source": [ - "def usercf_sim(all_click_df, user_activate_degree_dict):\n", - " \"\"\"\n", - " 用户相似性矩阵计算\n", - " :param all_click_df: 数据表\n", - " :param user_activate_degree_dict: 用户活跃度的字典\n", - " return 用户相似性矩阵\n", - " \n", - " 思路: 基于用户的协同过滤(详细请参考上一期推荐系统基础的组队学习) + 关联规则\n", - " \"\"\"\n", - " item_user_time_dict = get_item_user_time_dict(all_click_df)\n", - " \n", - " u2u_sim = {}\n", - " user_cnt = defaultdict(int)\n", - " for item, user_time_list in tqdm(item_user_time_dict.items()):\n", - " for u, click_time in user_time_list:\n", - " user_cnt[u] += 
1\n", - " u2u_sim.setdefault(u, {})\n", - " for v, click_time in user_time_list:\n", - " u2u_sim[u].setdefault(v, 0)\n", - " if u == v:\n", - " continue\n", - " # 用户平均活跃度作为活跃度的权重,这里的式子也可以改善\n", - " activate_weight = 100 * 0.5 * (user_activate_degree_dict[u] + user_activate_degree_dict[v]) \n", - " u2u_sim[u][v] += activate_weight / math.log(len(user_time_list) + 1)\n", - " \n", - " u2u_sim_ = u2u_sim.copy()\n", - " for u, related_users in u2u_sim.items():\n", - " for v, wij in related_users.items():\n", - " u2u_sim_[u][v] = wij / math.sqrt(user_cnt[u] * user_cnt[v])\n", - " \n", - " # 将得到的相似性矩阵保存到本地\n", - " pickle.dump(u2u_sim_, open(save_path + 'usercf_u2u_sim.pkl', 'wb'))\n", - "\n", - " return u2u_sim_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T06:59:46.701572Z", - "start_time": "2020-11-16T06:59:26.852246Z" - } - }, - "outputs": [], - "source": [ - "# 由于usercf计算时候太耗费内存了,这里就不直接运行了\n", - "# 如果是采样的话,是可以运行的\n", - "user_activate_degree_dict = get_user_activate_degree_dict(all_click_df)\n", - "u2u_sim = usercf_sim(all_click_df, user_activate_degree_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### item embedding sim\n", - "\n", - "使用Embedding计算item之间的相似度是为了后续冷启动的时候可以获取未出现在点击数据中的文章,后面有对冷启动专门的介绍,这里简单的说一下faiss。\n", - "\n", - "aiss是Facebook的AI团队开源的一套用于做聚类或者相似性搜索的软件库,底层是用C++实现。Faiss因为超级优越的性能,被广泛应用于推荐相关的业务当中.\n", - "\n", - "faiss工具包一般使用在推荐系统中的向量召回部分。在做向量召回的时候要么是u2u,u2i或者i2i,这里的u和i指的是user和item.我们知道在实际的场景中user和item的数量都是海量的,我们最容易想到的基于向量相似度的召回就是使用两层循环遍历user列表或者item列表计算两个向量的相似度,但是这样做在面对海量数据是不切实际的,faiss就是用来加速计算某个查询向量最相似的topk个索引向量。\n", - "\n", - "**faiss查询的原理:**\n", - "\n", - "faiss使用了PCA和PQ(Product quantization乘积量化)两种技术进行向量压缩和编码,当然还使用了其他的技术进行优化,但是PCA和PQ是其中最核心部分。\n", - "\n", - "1. 
PCA降维算法细节参考下面这个链接进行学习 \n", - "[主成分分析(PCA)原理总结](https://www.cnblogs.com/pinard/p/6239403.html) \n", - "\n", - "2. PQ编码的细节下面这个链接进行学习 \n", - "[实例理解product quantization算法](http://www.fabwrite.com/productquantization)\n", - "\n", - "**faiss使用**\n", - "\n", - "[faiss官方教程](https://github.com/facebookresearch/faiss/wiki/Getting-started)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T09:11:28.631803Z", - "start_time": "2020-11-16T09:11:28.619926Z" - } - }, - "outputs": [], - "source": [ - "# 向量检索相似度计算\n", - "# topk指的是每个item, faiss搜索后返回最相似的topk个item\n", - "def embdding_sim(click_df, item_emb_df, save_path, topk):\n", - " \"\"\"\n", - " 基于内容的文章embedding相似性矩阵计算\n", - " :param click_df: 数据表\n", - " :param item_emb_df: 文章的embedding\n", - " :param save_path: 保存路径\n", - " :patam topk: 找最相似的topk篇\n", - " return 文章相似性矩阵\n", - " \n", - " 思路: 对于每一篇文章, 基于embedding的相似性返回topk个与其最相似的文章, 只不过由于文章数量太多,这里用了faiss进行加速\n", - " \"\"\"\n", - " \n", - " # 文章索引与文章id的字典映射\n", - " item_idx_2_rawid_dict = dict(zip(item_emb_df.index, item_emb_df['article_id']))\n", - " \n", - " item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]\n", - " item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols].values, dtype=np.float32)\n", - " # 向量进行单位化\n", - " item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)\n", - " \n", - " # 建立faiss索引\n", - " item_index = faiss.IndexFlatIP(item_emb_np.shape[1])\n", - " item_index.add(item_emb_np)\n", - " # 相似度查询,给每个索引位置上的向量返回topk个item以及相似度\n", - " sim, idx = item_index.search(item_emb_np, topk) # 返回的是列表\n", - " \n", - " # 将向量检索的结果保存成原始id的对应关系\n", - " item_sim_dict = collections.defaultdict(dict)\n", - " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(range(len(item_emb_np)), sim, idx)):\n", - " target_raw_id = item_idx_2_rawid_dict[target_idx]\n", - " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", - " for rele_idx, sim_value in zip(rele_idx_list[1:], 
sim_value_list[1:]): \n", - " rele_raw_id = item_idx_2_rawid_dict[rele_idx]\n", - " item_sim_dict[target_raw_id][rele_raw_id] = item_sim_dict.get(target_raw_id, {}).get(rele_raw_id, 0) + sim_value\n", - " \n", - " # 保存i2i相似度矩阵\n", - " pickle.dump(item_sim_dict, open(save_path + 'emb_i2i_sim.pkl', 'wb')) \n", - " \n", - " return item_sim_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T09:32:35.926116Z", - "start_time": "2020-11-16T09:11:44.586967Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "364047it [00:23, 15292.14it/s]\n" - ] - } - ], - "source": [ - "item_emb_df = pd.read_csv(data_path + '/articles_emb.csv')\n", - "emb_i2i_sim = embdding_sim(all_click_df, item_emb_df, save_path, topk=10) # topk可以自行设置" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 召回\n", - "这个就是我们开篇提到的那个问题, 面的36万篇文章, 20多万用户的推荐, 我们又有哪些策略来缩减问题的规模? 我们就可以再召回阶段筛选出用户对于点击文章的候选集合, 从而降低问题的规模。召回常用的策略:\n", - "* Youtube DNN 召回\n", - "* 基于文章的召回\n", - " * 文章的协同过滤\n", - " * 基于文章embedding的召回\n", - "* 基于用户的召回\n", - " * 用户的协同过滤\n", - " * 用户embedding\n", - "\n", - "上面的各种召回方式一部分在基于用户已经看得文章的基础上去召回与这些文章相似的一些文章, 而这个相似性的计算方式不同, 就得到了不同的召回方式, 比如文章的协同过滤, 文章内容的embedding等。还有一部分是根据用户的相似性进行推荐,对于某用户推荐与其相似的其他用户看过的文章,比如用户的协同过滤和用户embedding。 还有一种思路是类似矩阵分解的思路,先计算出用户和文章的embedding之后,就可以直接算用户和文章的相似度, 根据这个相似度进行推荐, 比如YouTube DNN。 我们下面详细来看一下每一个召回方法:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### YoutubeDNN召回\n", - "**(这一步是直接获取用户召回的候选文章列表)**\n", - "\n", - "[论文下载地址](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)\n", - "\n", - "**Youtubednn召回架构**\n", - "\n", - "![image-20201111160516562](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201111160516562.png)\n", - "\n", - "\n", - "\n", - "关于YoutubeDNN原理和应用推荐看王喆的两篇博客:\n", - "\n", - "1. 
[重读Youtube深度学习推荐系统论文,字字珠玑,惊为神文](https://zhuanlan.zhihu.com/p/52169807)\n", - "2. [YouTube深度学习推荐系统的十大工程问题](https://zhuanlan.zhihu.com/p/52504407)\n", - "\n", - "\n", - "**参考文献:**\n", - "1. https://zhuanlan.zhihu.com/p/52169807 (YouTubeDNN原理)\n", - "2. https://zhuanlan.zhihu.com/p/26306795 (Word2Vec知乎众赞文章) --- word2vec放到排序中的w2v的介绍部分\n" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:13:11.058766Z", - "start_time": "2020-11-16T10:13:11.041084Z" - } - }, - "outputs": [], - "source": [ - "# 获取双塔召回时的训练验证数据\n", - "# negsample指的是通过滑窗构建样本的时候,负样本的数量\n", - "def gen_data_set(data, negsample=0):\n", - " data.sort_values(\"click_timestamp\", inplace=True)\n", - " item_ids = data['click_article_id'].unique()\n", - "\n", - " train_set = []\n", - " test_set = []\n", - " for reviewerID, hist in tqdm(data.groupby('user_id')):\n", - " pos_list = hist['click_article_id'].tolist()\n", - " \n", - " if negsample > 0:\n", - " candidate_set = list(set(item_ids) - set(pos_list)) # 用户没看过的文章里面选择负样本\n", - " neg_list = np.random.choice(candidate_set,size=len(pos_list)*negsample,replace=True) # 对于每个正样本,选择n个负样本\n", - " \n", - " # 长度只有一个的时候,需要把这条数据也放到训练集中,不然的话最终学到的embedding就会有缺失\n", - " if len(pos_list) == 1:\n", - " train_set.append((reviewerID, [pos_list[0]], pos_list[0],1,len(pos_list)))\n", - " test_set.append((reviewerID, [pos_list[0]], pos_list[0],1,len(pos_list)))\n", - " \n", - " # 滑窗构造正负样本\n", - " for i in range(1, len(pos_list)):\n", - " hist = pos_list[:i]\n", - " \n", - " if i != len(pos_list) - 1:\n", - " train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]))) # 正样本 [user_id, his_item, pos_item, label, len(his_item)]\n", - " for negi in range(negsample):\n", - " train_set.append((reviewerID, hist[::-1], neg_list[i*negsample+negi], 0,len(hist[::-1]))) # 负样本 [user_id, his_item, neg_item, label, len(his_item)]\n", - " else:\n", - " # 将最长的那一个序列长度作为测试数据\n", - " test_set.append((reviewerID, 
hist[::-1], pos_list[i],1,len(hist[::-1])))\n", - " \n", - " random.shuffle(train_set)\n", - " random.shuffle(test_set)\n", - " \n", - " return train_set, test_set\n", - "\n", - "# 将输入的数据进行padding,使得序列特征的长度都一致\n", - "def gen_model_input(train_set,user_profile,seq_max_len):\n", - "\n", - " train_uid = np.array([line[0] for line in train_set])\n", - " train_seq = [line[1] for line in train_set]\n", - " train_iid = np.array([line[2] for line in train_set])\n", - " train_label = np.array([line[3] for line in train_set])\n", - " train_hist_len = np.array([line[4] for line in train_set])\n", - "\n", - " train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)\n", - " train_model_input = {\"user_id\": train_uid, \"click_article_id\": train_iid, \"hist_article_id\": train_seq_pad,\n", - " \"hist_len\": train_hist_len}\n", - "\n", - " return train_model_input, train_label" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:13:18.124452Z", - "start_time": "2020-11-16T10:13:18.098284Z" - } - }, - "outputs": [], - "source": [ - "def youtubednn_u2i_dict(data, topk=20): \n", - " sparse_features = [\"click_article_id\", \"user_id\"]\n", - " SEQ_LEN = 30 # 用户点击序列的长度,短的填充,长的截断\n", - " \n", - " user_profile_ = data[[\"user_id\"]].drop_duplicates('user_id')\n", - " item_profile_ = data[[\"click_article_id\"]].drop_duplicates('click_article_id') \n", - " \n", - " # 类别编码\n", - " features = [\"click_article_id\", \"user_id\"]\n", - " feature_max_idx = {}\n", - " \n", - " for feature in features:\n", - " lbe = LabelEncoder()\n", - " data[feature] = lbe.fit_transform(data[feature])\n", - " feature_max_idx[feature] = data[feature].max() + 1\n", - " \n", - " # 提取user和item的画像,这里具体选择哪些特征还需要进一步的分析和考虑\n", - " user_profile = data[[\"user_id\"]].drop_duplicates('user_id')\n", - " item_profile = data[[\"click_article_id\"]].drop_duplicates('click_article_id') \n", - " \n", - " 
user_index_2_rawid = dict(zip(user_profile['user_id'], user_profile_['user_id']))\n", - " item_index_2_rawid = dict(zip(item_profile['click_article_id'], item_profile_['click_article_id']))\n", - " \n", - " # 划分训练和测试集\n", - " # 由于深度学习需要的数据量通常都是非常大的,所以为了保证召回的效果,往往会通过滑窗的形式扩充训练样本\n", - " train_set, test_set = gen_data_set(data, 0)\n", - " # 整理输入数据,具体的操作可以看上面的函数\n", - " train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)\n", - " test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)\n", - " \n", - " # 确定Embedding的维度\n", - " embedding_dim = 16\n", - " \n", - " # 将数据整理成模型可以直接输入的形式\n", - " user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),\n", - " VarLenSparseFeat(SparseFeat('hist_article_id', feature_max_idx['click_article_id'], embedding_dim,\n", - " embedding_name=\"click_article_id\"), SEQ_LEN, 'mean', 'hist_len'),]\n", - " item_feature_columns = [SparseFeat('click_article_id', feature_max_idx['click_article_id'], embedding_dim)]\n", - " \n", - " # 模型的定义 \n", - " # num_sampled: 负采样时的样本数量\n", - " model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, user_dnn_hidden_units=(64, embedding_dim))\n", - " # 模型编译\n", - " model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss) \n", - " \n", - " # 模型训练,这里可以定义验证集的比例,如果设置为0的话就是全量数据直接进行训练\n", - " history = model.fit(train_model_input, train_label, batch_size=256, epochs=1, verbose=1, validation_split=0.0)\n", - " \n", - " # 训练完模型之后,提取训练的Embedding,包括user端和item端\n", - " test_user_model_input = test_model_input\n", - " all_item_model_input = {\"click_article_id\": item_profile['click_article_id'].values}\n", - "\n", - " user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)\n", - " item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)\n", - " \n", - " # 保存当前的item_embedding 和 user_embedding 排序的时候可能能够用到,但是需要注意保存的时候需要和原始的id对应\n", - " user_embs = 
user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)\n", - " item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)\n", - " \n", - " # embedding保存之前归一化一下\n", - " user_embs = user_embs / np.linalg.norm(user_embs, axis=1, keepdims=True)\n", - " item_embs = item_embs / np.linalg.norm(item_embs, axis=1, keepdims=True)\n", - " \n", - " # 将Embedding转换成字典的形式方便查询\n", - " raw_user_id_emb_dict = {user_index_2_rawid[k]: \\\n", - " v for k, v in zip(user_profile['user_id'], user_embs)}\n", - " raw_item_id_emb_dict = {item_index_2_rawid[k]: \\\n", - " v for k, v in zip(item_profile['click_article_id'], item_embs)}\n", - " # 将Embedding保存到本地\n", - " pickle.dump(raw_user_id_emb_dict, open(save_path + 'user_youtube_emb.pkl', 'wb'))\n", - " pickle.dump(raw_item_id_emb_dict, open(save_path + 'item_youtube_emb.pkl', 'wb'))\n", - " \n", - " # faiss紧邻搜索,通过user_embedding 搜索与其相似性最高的topk个item\n", - " index = faiss.IndexFlatIP(embedding_dim)\n", - " # 上面已经进行了归一化,这里可以不进行归一化了\n", - "# faiss.normalize_L2(user_embs)\n", - "# faiss.normalize_L2(item_embs)\n", - " index.add(item_embs) # 将item向量构建索引\n", - " sim, idx = index.search(np.ascontiguousarray(user_embs), topk) # 通过user去查询最相似的topk个item\n", - " \n", - " user_recall_items_dict = collections.defaultdict(dict)\n", - " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(test_user_model_input['user_id'], sim, idx)):\n", - " target_raw_id = user_index_2_rawid[target_idx]\n", - " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", - " for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): \n", - " rele_raw_id = item_index_2_rawid[rele_idx]\n", - " user_recall_items_dict[target_raw_id][rele_raw_id] = user_recall_items_dict.get(target_raw_id, {})\\\n", - " .get(rele_raw_id, 0) + sim_value\n", - " \n", - " user_recall_items_dict = {k: sorted(v.items(), key=lambda x: x[1], reverse=True) for k, v in user_recall_items_dict.items()}\n", - " # 将召回的结果进行排序\n", - " \n", - " # 保存召回的结果\n", - " # 
这里是直接通过向量的方式得到了召回结果,相比于上面的召回方法,上面的只是得到了i2i及u2u的相似性矩阵,还需要进行协同过滤召回才能得到召回结果\n", - " # 可以直接对这个召回结果进行评估,为了方便可以统一写一个评估函数对所有的召回结果进行评估\n", - " pickle.dump(user_recall_items_dict, open(save_path + 'youtube_u2i_dict.pkl', 'wb'))\n", - " return user_recall_items_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:21:46.420014Z", - "start_time": "2020-11-16T10:13:35.351131Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [02:02<00:00, 2038.57it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:143: calling RandomNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n", - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:253: calling reduce_sum_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "keep_dims is deprecated, use keepdims instead\n", - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:253: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Deprecated in favor of operator or tf.math.divide.\n", - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1288: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for 
updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n", - "1149673/1149673 [==============================] - 216s 188us/sample - loss: 0.1326\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "250000it [00:32, 7720.75it/s]\n" - ] - } - ], - "source": [ - "# 由于这里需要做召回评估,所以讲训练集中的最后一次点击都提取了出来\n", - "if not metric_recall:\n", - " user_multi_recall_dict['youtubednn_recall'] = youtubednn_u2i_dict(all_click_df, topk=20)\n", - "else:\n", - " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", - " user_multi_recall_dict['youtubednn_recall'] = youtubednn_u2i_dict(trn_hist_click_df, topk=20)\n", - " # 召回效果评估\n", - " metrics_recall(user_multi_recall_dict['youtubednn_recall'], trn_last_click_df, topk=20)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### itemcf recall\n", - "\n", - "上面已经通过协同过滤,Embedding检索的方式得到了文章的相似度矩阵,下面使用协同过滤的思想,给用户召回与其历史文章相似的文章。\n", - "这里在召回的时候,也是用了关联规则的方式:\n", - "1. 考虑相似文章与历史点击文章顺序的权重(细节看代码)\n", - "2. 考虑文章创建时间的权重,也就是考虑相似文章与历史点击文章创建时间差的权重\n", - "3. 
考虑文章内容相似度权重(使用Embedding计算相似文章相似度,但是这里需要注意,在Embedding的时候并没有计算所有商品两两之间的相似度,所以相似的文章与历史点击文章不存在相似度,需要做特殊处理)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:48:40.580553Z", - "start_time": "2020-11-16T11:48:40.567130Z" - } - }, - "outputs": [], - "source": [ - "# 基于商品的召回i2i\n", - "def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim):\n", - " \"\"\"\n", - " 基于文章协同过滤的召回\n", - " :param user_id: 用户id\n", - " :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", - " :param i2i_sim: 字典,文章相似性矩阵\n", - " :param sim_item_topk: 整数, 选择与当前文章最相似的前k篇文章\n", - " :param recall_item_num: 整数, 最后的召回文章数量\n", - " :param item_topk_click: 列表,点击次数最多的文章列表,用户召回补全\n", - " :param emb_i2i_sim: 字典基于内容embedding算的文章相似矩阵\n", - " \n", - " return: 召回的文章列表 [(item1, score1), (item2, score2)...]\n", - " \"\"\"\n", - " # 获取用户历史交互的文章\n", - " user_hist_items = user_item_time_dict[user_id]\n", - " user_hist_items_ = {user_id for user_id, _ in user_hist_items}\n", - " \n", - " item_rank = {}\n", - " for loc, (i, click_time) in enumerate(user_hist_items):\n", - " for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:\n", - " if j in user_hist_items_:\n", - " continue\n", - " \n", - " # 文章创建时间差权重\n", - " created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))\n", - " # 相似文章和历史点击文章序列中历史文章所在的位置权重\n", - " loc_weight = (0.9 ** (len(user_hist_items) - loc))\n", - " \n", - " content_weight = 1.0\n", - " if emb_i2i_sim.get(i, {}).get(j, None) is not None:\n", - " content_weight += emb_i2i_sim[i][j]\n", - " if emb_i2i_sim.get(j, {}).get(i, None) is not None:\n", - " content_weight += emb_i2i_sim[j][i]\n", - " \n", - " item_rank.setdefault(j, 0)\n", - " item_rank[j] += created_time_weight * loc_weight * content_weight * wij\n", - 
" \n", - " # 不足10个,用热门商品补全\n", - " if len(item_rank) < recall_item_num:\n", - " for i, item in enumerate(item_topk_click):\n", - " if item in item_rank.items(): # 填充的item应该不在原来的列表中\n", - " continue\n", - " item_rank[item] = - i - 100 # 随便给个负数就行\n", - " if len(item_rank) == recall_item_num:\n", - " break\n", - " \n", - " item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]\n", - " \n", - " return item_rank" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### itemcf sim召回" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T14:41:23.433038Z", - "start_time": "2020-11-16T11:48:46.286350Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [2:51:13<00:00, 24.33it/s] \n" - ] - } - ], - "source": [ - "# 先进行itemcf召回, 为了召回评估,所以提取最后一次点击\n", - "\n", - "if metric_recall:\n", - " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", - "else:\n", - " trn_hist_click_df = all_click_df\n", - "\n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", - "\n", - "i2i_sim = pickle.load(open(save_path + 'itemcf_i2i_sim.pkl', 'rb'))\n", - "emb_i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl', 'rb'))\n", - "\n", - "sim_item_topk = 20\n", - "recall_item_num = 10\n", - "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", - "\n", - "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, \\\n", - " i2i_sim, sim_item_topk, recall_item_num, \\\n", - " item_topk_click, item_created_time_dict, emb_i2i_sim)\n", - "\n", - "user_multi_recall_dict['itemcf_sim_itemcf_recall'] = user_recall_items_dict\n", - 
"pickle.dump(user_multi_recall_dict['itemcf_sim_itemcf_recall'], open(save_path + 'itemcf_recall_dict.pkl', 'wb'))\n", - "\n", - "if metric_recall:\n", - " # 召回效果评估\n", - " metrics_recall(user_multi_recall_dict['itemcf_sim_itemcf_recall'], trn_last_click_df, topk=recall_item_num)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### embedding sim 召回" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T15:04:51.527795Z", - "start_time": "2020-11-16T14:59:03.907519Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [04:35<00:00, 905.85it/s] \n" - ] - } - ], - "source": [ - "# 这里是为了召回评估,所以提取最后一次点击\n", - "if metric_recall:\n", - " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", - "else:\n", - " trn_hist_click_df = all_click_df\n", - "\n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", - "i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl','rb'))\n", - "\n", - "sim_item_topk = 20\n", - "recall_item_num = 10\n", - "\n", - "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", - "\n", - "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, sim_item_topk, \n", - " recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim)\n", - " \n", - "user_multi_recall_dict['embedding_sim_item_recall'] = user_recall_items_dict\n", - "pickle.dump(user_multi_recall_dict['embedding_sim_item_recall'], open(save_path + 'embedding_sim_item_recall.pkl', 'wb'))\n", - "\n", - "if metric_recall:\n", - " # 召回效果评估\n", - " metrics_recall(user_multi_recall_dict['embedding_sim_item_recall'], trn_last_click_df, topk=recall_item_num)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### usercf召回\n", - "\n", - "基于用户协同过滤,核心思想是给用户推荐与其相似的用户历史点击文章,因为这里涉及到了相似用户的历史文章,这里仍然可以加上一些关联规则来给用户可能点击的文章进行加权,这里使用的关联规则主要是考虑相似用户的历史点击文章与被推荐用户历史点击商品的关系权重,而这里的关系就可以直接借鉴基于物品的协同过滤相似的做法,只不过这里是对被推荐物品关系的一个累加的过程,下面是使用的一些关系权重,及相关的代码:\n", - "\n", - "1. 计算被推荐用户历史点击文章与相似用户历史点击文章的相似度,文章创建时间差,相对位置的总和,作为各自的权重" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:09:32.293990Z", - "start_time": "2020-11-17T02:09:32.278678Z" - } - }, - "outputs": [], - "source": [ - "# 基于用户的召回 u2u2i\n", - "def user_based_recommend(user_id, user_item_time_dict, u2u_sim, sim_user_topk, recall_item_num, \n", - " item_topk_click, item_created_time_dict, emb_i2i_sim):\n", - " \"\"\"\n", - " 基于文章协同过滤的召回\n", - " :param user_id: 用户id\n", - " :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", - " :param u2u_sim: 字典,文章相似性矩阵\n", - " :param sim_user_topk: 整数, 选择与当前用户最相似的前k个用户\n", - " :param recall_item_num: 整数, 最后的召回文章数量\n", - " :param item_topk_click: 列表,点击次数最多的文章列表,用户召回补全\n", - " :param item_created_time_dict: 文章创建时间列表\n", - " :param emb_i2i_sim: 字典基于内容embedding算的文章相似矩阵\n", - " \n", - " return: 召回的文章列表 [(item1, score1), (item2, score2)...]\n", - " \"\"\"\n", - " # 历史交互\n", - " user_item_time_list = user_item_time_dict[user_id] # [(item1, time1), (item2, time2)..]\n", - " user_hist_items = set([i for i, t in user_item_time_list]) # 存在一个用户与某篇文章的多次交互, 这里得去重\n", - " \n", - " items_rank = {}\n", - " for sim_u, wuv in sorted(u2u_sim[user_id].items(), key=lambda x: x[1], reverse=True)[:sim_user_topk]:\n", - " for i, click_time in user_item_time_dict[sim_u]:\n", - " if i in user_hist_items:\n", - " continue\n", - " items_rank.setdefault(i, 0)\n", - " \n", - " loc_weight = 1.0\n", - " content_weight = 1.0\n", - " created_time_weight = 1.0\n", - " \n", - " 
# 当前文章与该用户看的历史文章进行一个权重交互\n", - " for loc, (j, click_time) in enumerate(user_item_time_list):\n", - " # 点击时的相对位置权重\n", - " loc_weight += 0.9 ** (len(user_item_time_list) - loc)\n", - " # 内容相似性权重\n", - " if emb_i2i_sim.get(i, {}).get(j, None) is not None:\n", - " content_weight += emb_i2i_sim[i][j]\n", - " if emb_i2i_sim.get(j, {}).get(i, None) is not None:\n", - " content_weight += emb_i2i_sim[j][i]\n", - " \n", - " # 创建时间差权重\n", - " created_time_weight += np.exp(0.8 * np.abs(item_created_time_dict[i] - item_created_time_dict[j]))\n", - " \n", - " items_rank[i] += loc_weight * content_weight * created_time_weight * wuv\n", - " \n", - " # 热度补全\n", - " if len(items_rank) < recall_item_num:\n", - " for i, item in enumerate(item_topk_click):\n", - " if item in items_rank.items(): # 填充的item应该不在原来的列表中\n", - " continue\n", - " items_rank[item] = - i - 100 # 随便给个复数就行\n", - " if len(items_rank) == recall_item_num:\n", - " break\n", - " \n", - " items_rank = sorted(items_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num] \n", - " \n", - " return items_rank" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### usercf sim召回" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:05:41.652501Z", - "start_time": "2020-11-16T07:05:40.953871Z" - } - }, - "outputs": [], - "source": [ - "# 这里是为了召回评估,所以提取最后一次点击\n", - "# 由于usercf中计算user之间的相似度的过程太费内存了,全量数据这里就没有跑,跑了一个采样之后的数据\n", - "if metric_recall:\n", - " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", - "else:\n", - " trn_hist_click_df = all_click_df\n", - " \n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", - "\n", - "u2u_sim = pickle.load(open(save_path + 'usercf_u2u_sim.pkl', 'rb'))\n", - "\n", - "sim_user_topk = 20\n", - "recall_item_num = 10\n", - "item_topk_click = get_item_topk_click(trn_hist_click_df, 
k=50)\n", - "\n", - "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = user_based_recommend(user, user_item_time_dict, u2u_sim, sim_user_topk, \\\n", - " recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim) \n", - "\n", - "pickle.dump(user_recall_items_dict, open(save_path + 'usercf_u2u2i_recall.pkl', 'wb'))\n", - "\n", - "if metric_recall:\n", - " # 召回效果评估\n", - " metrics_recall(user_recall_items_dict, trn_last_click_df, topk=recall_item_num)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T03:09:35.853516Z", - "start_time": "2020-11-16T03:09:35.737625Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### user embedding sim召回\n", - "\n", - "虽然没有直接跑usercf的计算用户之间的相似度,为了验证上述基于用户的协同过滤的代码,下面使用了YoutubeDNN过程中产生的user embedding来进行向量检索每个user最相似的topk个user,在使用这里得到的u2u的相似性矩阵,使用usercf进行召回,具体代码如下" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:09:46.807811Z", - "start_time": "2020-11-17T02:09:46.798033Z" - } - }, - "outputs": [], - "source": [ - "# 使用Embedding的方式获取u2u的相似性矩阵\n", - "# topk指的是每个user, faiss搜索后返回最相似的topk个user\n", - "def u2u_embdding_sim(click_df, user_emb_dict, save_path, topk):\n", - " \n", - " user_list = []\n", - " user_emb_list = []\n", - " for user_id, user_emb in user_emb_dict.items():\n", - " user_list.append(user_id)\n", - " user_emb_list.append(user_emb)\n", - " \n", - " user_index_2_rawid_dict = {k: v for k, v in zip(range(len(user_list)), user_list)} \n", - " \n", - " user_emb_np = np.array(user_emb_list, dtype=np.float32)\n", - " \n", - " # 建立faiss索引\n", - " user_index = faiss.IndexFlatIP(user_emb_np.shape[1])\n", - " user_index.add(user_emb_np)\n", - " # 相似度查询,给每个索引位置上的向量返回topk个item以及相似度\n", - " sim, idx = user_index.search(user_emb_np, topk) # 返回的是列表\n", - " \n", - " # 
将向量检索的结果保存成原始id的对应关系\n", - " user_sim_dict = collections.defaultdict(dict)\n", - " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(range(len(user_emb_np)), sim, idx)):\n", - " target_raw_id = user_index_2_rawid_dict[target_idx]\n", - " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", - " for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): \n", - " rele_raw_id = user_index_2_rawid_dict[rele_idx]\n", - " user_sim_dict[target_raw_id][rele_raw_id] = user_sim_dict.get(target_raw_id, {}).get(rele_raw_id, 0) + sim_value\n", - " \n", - " # 保存i2i相似度矩阵\n", - " pickle.dump(user_sim_dict, open(save_path + 'youtube_u2u_sim.pkl', 'wb')) \n", - " return user_sim_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:14:31.355905Z", - "start_time": "2020-11-17T02:09:53.236531Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "250000it [00:23, 10507.45it/s]\n" - ] - } - ], - "source": [ - "# 读取YoutubeDNN过程中产生的user embedding, 然后使用faiss计算用户之间的相似度\n", - "# 这里需要注意,这里得到的user embedding其实并不是很好,因为YoutubeDNN中使用的是用户点击序列来训练的user embedding,\n", - "# 如果序列普遍都比较短的话,其实效果并不是很好\n", - "user_emb_dict = pickle.load(open(save_path + 'user_youtube_emb.pkl', 'rb'))\n", - "u2u_sim = u2u_embdding_sim(all_click_df, user_emb_dict, save_path, topk=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "通过YoutubeDNN得到的user_embedding" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:49:40.755431Z", - "start_time": "2020-11-17T02:28:47.003514Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [19:43<00:00, 211.22it/s]\n" - ] - } - ], - "source": [ - "# 使用召回评估函数验证当前召回方式的效果\n", - "if metric_recall:\n", - " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", - "else:\n", - " 
trn_hist_click_df = all_click_df\n", - "\n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", - "u2u_sim = pickle.load(open(save_path + 'youtube_u2u_sim.pkl', 'rb'))\n", - "\n", - "sim_user_topk = 20\n", - "recall_item_num = 10\n", - "\n", - "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", - "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = user_based_recommend(user, user_item_time_dict, u2u_sim, sim_user_topk, \\\n", - " recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim)\n", - " \n", - "user_multi_recall_dict['youtubednn_usercf_recall'] = user_recall_items_dict\n", - "pickle.dump(user_multi_recall_dict['youtubednn_usercf_recall'], open(save_path + 'youtubednn_usercf_recall.pkl', 'wb'))\n", - "\n", - "if metric_recall:\n", - " # 召回效果评估\n", - " metrics_recall(user_multi_recall_dict['youtubednn_usercf_recall'], trn_last_click_df, topk=recall_item_num)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:07:44.326253Z", - "start_time": "2020-11-16T07:07:43.798931Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 冷启动问题" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**冷启动问题可以分成三类:文章冷启动,用户冷启动,系统冷启动。**\n", - "\n", - "- 文章冷启动:对于一个平台系统新加入的文章,该文章没有任何的交互记录,如何推荐给用户的问题。(对于我们场景可以认为是,日志数据中没有出现过的文章都可以认为是冷启动的文章)\n", - "- 用户冷启动:对于一个平台系统新来的用户,该用户还没有文章的交互信息,如何给该用户进行推荐。(对于我们场景就是,测试集中的用户是否在测试集对应的log数据中出现过,如果没有出现过,那么可以认为该用户是冷启动用户。但是有时候并没有这么严格,我们也可以自己设定某些指标来判别哪些用户是冷启动用户,比如通过使用时长,点击率,留存率等等)\n", - "- 系统冷启动:就是对于一个平台刚上线,还没有任何的相关历史数据,此时就是系统冷启动,其实也就是前面两种的一个综合。\n", - "\n", - "**当前场景下冷启动问题的分析:**\n", - "\n", - 
"对当前的数据进行分析会发现,日志中所有出现过的点击文章只有3w多个,而整个文章库中却有30多万,那么测试集中的用户最后一次点击是否会点击没有出现在日志中的文章呢?如果存在这种情况,说明用户点击的文章之前没有任何的交互信息,这也就是我们所说的文章冷启动。通过数据分析还可以发现,测试集用户只有一次点击的数据占得比例还不少,其实仅仅通过用户的一次点击就给用户推荐文章使用模型的方式也是比较难的,这里其实也可以考虑用户冷启动的问题,但是这里只给出物品冷启动的一些解决方案及代码,关于用户冷启动的话提一些可行性的做法。\n", - "\n", - "1. 文章冷启动(没有冷启动的探索问题) \n", - " 其实我们这里不是为了做文章的冷启动而做冷启动,而是猜测用户可能会点击一些没有在log数据中出现的文章,我们要做的就是如何从将近27万的文章中选择一些文章作为用户冷启动的文章,这里其实也可以看成是一种召回策略,我们这里就采用简单的比较好理解的基于规则的召回策略来获取用户可能点击的未出现在log数据中的文章。\n", - " 现在的问题变成了:如何给每个用户考虑从27万个商品中获取一小部分商品?随机选一些可能是一种方案。下面给出一些参考的方案。\n", - " 1. 首先基于Embedding召回一部分与用户历史相似的文章\n", - " 2. 从基于Embedding召回的文章中通过一些规则过滤掉一些文章,使得留下的文章用户更可能点击。我们这里的规则,可以是,留下那些与用户历史点击文章主题相同的文章,或者字数相差不大的文章。并且留下的文章尽量是与测试集用户最后一次点击时间更接近的文章,或者是当天的文章也行。\n", - "2. 用户冷启动 \n", - " 这里对测试集中的用户点击数据进行分析会发现,测试集中有百分之20的用户只有一次点击,那么这些点击特别少的用户的召回是不是可以单独做一些策略上的补充呢?或者是在排序后直接基于规则加上一些文章呢?这些都可以去尝试,这里没有提供具体的做法。\n", - " \n", - "**注意:** \n", - "\n", - "这里看似和基于embedding计算的item之间相似度然后做itemcf是一致的,但是现在我们的目的不一样,我们这里的目的是找到相似的向量,并且还没有出现在log日志中的商品,再加上一些其他的冷启动的策略,这里需要找回的数量会偏多一点,不然被筛选完之后可能都没有文章了" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T04:30:23.027164Z", - "start_time": "2020-11-17T04:23:09.960235Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [05:01<00:00, 828.60it/s] \n" - ] - } - ], - "source": [ - "# 先进行itemcf召回,这里不需要做召回评估,这里只是一种策略\n", - "trn_hist_click_df = all_click_df\n", - "\n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", - "i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl','rb'))\n", - "\n", - "sim_item_topk = 150\n", - "recall_item_num = 100 # 稍微召回多一点文章,便于后续的规则筛选\n", - "\n", - "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", - "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, 
i2i_sim, sim_item_topk, \n", - " recall_item_num, item_topk_click,item_created_time_dict, emb_i2i_sim)\n", - "pickle.dump(user_recall_items_dict, open(save_path + 'cold_start_items_raw_dict.pkl', 'wb'))" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:11:39.267581Z", - "start_time": "2020-11-17T06:11:39.252563Z" - } - }, - "outputs": [], - "source": [ - "# 基于规则进行文章过滤\n", - "# 保留文章主题与用户历史浏览主题相似的文章\n", - "# 保留文章字数与用户历史浏览文章字数相差不大的文章\n", - "# 保留最后一次点击当天的文章\n", - "# 按照相似度返回最终的结果\n", - "\n", - "def get_click_article_ids_set(all_click_df):\n", - " return set(all_click_df.click_article_id.values)\n", - "\n", - "def cold_start_items(user_recall_items_dict, user_hist_item_typs_dict, user_hist_item_words_dict, \\\n", - " user_last_item_created_time_dict, item_type_dict, item_words_dict, \n", - " item_created_time_dict, click_article_ids_set, recall_item_num):\n", - " \"\"\"\n", - " 冷启动的情况下召回一些文章\n", - " :param user_recall_items_dict: 基于内容embedding相似性召回来的很多文章, 字典, {user1: [(item1, item2), ..], }\n", - " :param user_hist_item_typs_dict: 字典, 用户点击的文章的主题映射\n", - " :param user_hist_item_words_dict: 字典, 用户点击的历史文章的字数映射\n", - " :param user_last_item_created_time_idct: 字典,用户点击的历史文章创建时间映射\n", - " :param item_tpye_idct: 字典,文章主题映射\n", - " :param item_words_dict: 字典,文章字数映射\n", - " :param item_created_time_dict: 字典, 文章创建时间映射\n", - " :param click_article_ids_set: 集合,用户点击过得文章, 也就是日志里面出现过的文章\n", - " :param recall_item_num: 召回文章的数量, 这个指的是没有出现在日志里面的文章数量\n", - " \"\"\"\n", - " \n", - " cold_start_user_items_dict = {}\n", - " for user, item_list in tqdm(user_recall_items_dict.items()):\n", - " cold_start_user_items_dict.setdefault(user, [])\n", - " for item, score in item_list:\n", - " # 获取历史文章信息\n", - " hist_item_type_set = user_hist_item_typs_dict[user]\n", - " hist_mean_words = user_hist_item_words_dict[user]\n", - " hist_last_item_created_time = user_last_item_created_time_dict[user]\n", - " 
hist_last_item_created_time = datetime.fromtimestamp(hist_last_item_created_time)\n", - " \n", - " # 获取当前召回文章的信息\n", - " curr_item_type = item_type_dict[item]\n", - " curr_item_words = item_words_dict[item]\n", - " curr_item_created_time = item_created_time_dict[item]\n", - " curr_item_created_time = datetime.fromtimestamp(curr_item_created_time)\n", - "\n", - " # 首先,文章不能出现在用户的历史点击中, 然后根据文章主题,文章单词数,文章创建时间进行筛选\n", - " if curr_item_type not in hist_item_type_set or \\\n", - " item in click_article_ids_set or \\\n", - " abs(curr_item_words - hist_mean_words) > 200 or \\\n", - " abs((curr_item_created_time - hist_last_item_created_time).days) > 90: \n", - " continue\n", - " \n", - " cold_start_user_items_dict[user].append((item, score)) # {user1: [(item1, score1), (item2, score2)..]...}\n", - " \n", - " # 需要控制一下冷启动召回的数量\n", - " cold_start_user_items_dict = {k: sorted(v, key=lambda x:x[1], reverse=True)[:recall_item_num] \\\n", - " for k, v in cold_start_user_items_dict.items()}\n", - " \n", - " pickle.dump(cold_start_user_items_dict, open(save_path + 'cold_start_user_items_dict.pkl', 'wb'))\n", - " \n", - " return cold_start_user_items_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:35:38.758278Z", - "start_time": "2020-11-17T06:31:40.164332Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [01:49<00:00, 2293.37it/s]\n" - ] - } - ], - "source": [ - "all_click_df_ = all_click_df.copy()\n", - "all_click_df_ = all_click_df_.merge(item_info_df, how='left', on='click_article_id')\n", - "user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict = get_user_hist_item_info_dict(all_click_df_)\n", - "click_article_ids_set = get_click_article_ids_set(all_click_df)\n", - "# 需要注意的是\n", - "# 这里使用了很多规则来筛选冷启动的文章,所以前面再召回的阶段就应该尽可能的多召回一些文章,否则很容易被删掉\n", - "cold_start_user_items_dict 
= cold_start_items(user_recall_items_dict, user_hist_item_typs_dict, user_hist_item_words_dict, \\\n", - " user_last_item_created_time_dict, item_type_dict, item_words_dict, \\\n", - " item_created_time_dict, click_article_ids_set, recall_item_num)\n", - "\n", - "user_multi_recall_dict['cold_start_recall'] = cold_start_user_items_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:13:33.099298Z", - "start_time": "2020-11-16T07:13:32.655036Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 多路召回合并\n", - "多路召回合并就是将前面所有的召回策略得到的用户文章列表合并起来,下面是对前面所有召回结果的汇总\n", - "1. 基于itemcf计算的item之间的相似度sim进行的召回 \n", - "2. 基于embedding搜索得到的item之间的相似度进行的召回\n", - "3. YoutubeDNN召回\n", - "4. YoutubeDNN得到的user之间的相似度进行的召回\n", - "5. 基于冷启动策略的召回\n", - "\n", - "**注意:** \n", - "在做召回评估的时候就会发现有些召回的效果不错有些召回的效果很差,所以对每一路召回的结果,我们可以认为的定义一些权重,来做最终的相似度融合" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T07:02:16.033971Z", - "start_time": "2020-11-17T07:02:16.019819Z" - } - }, - "outputs": [], - "source": [ - "def combine_recall_results(user_multi_recall_dict, weight_dict=None, topk=25):\n", - " final_recall_items_dict = {}\n", - " \n", - " # 对每一种召回结果按照用户进行归一化,方便后面多种召回结果,相同用户的物品之间权重相加\n", - " def norm_user_recall_items_sim(sorted_item_list):\n", - " # 如果冷启动中没有文章或者只有一篇文章,直接返回,出现这种情况的原因可能是冷启动召回的文章数量太少了,\n", - " # 基于规则筛选之后就没有文章了, 这里还可以做一些其他的策略性的筛选\n", - " if len(sorted_item_list) < 2:\n", - " return sorted_item_list\n", - " \n", - " min_sim = sorted_item_list[-1][1]\n", - " max_sim = sorted_item_list[0][1]\n", - " \n", - " norm_sorted_item_list = []\n", - " for item, score in sorted_item_list:\n", - " if max_sim > 0:\n", - " norm_score = 1.0 * (score - min_sim) / (max_sim - min_sim) if max_sim > min_sim else 1.0\n", - " else:\n", - " norm_score = 0.0\n", - " norm_sorted_item_list.append((item, 
norm_score))\n", - " \n", - " return norm_sorted_item_list\n", - " \n", - " print('多路召回合并...')\n", - " for method, user_recall_items in tqdm(user_multi_recall_dict.items()):\n", - " print(method + '...')\n", - " # 在计算最终召回结果的时候,也可以为每一种召回结果设置一个权重\n", - " if weight_dict == None:\n", - " recall_method_weight = 1\n", - " else:\n", - " recall_method_weight = weight_dict[method]\n", - " \n", - " for user_id, sorted_item_list in user_recall_items.items(): # 进行归一化\n", - " user_recall_items[user_id] = norm_user_recall_items_sim(sorted_item_list)\n", - " \n", - " for user_id, sorted_item_list in user_recall_items.items():\n", - " # print('user_id')\n", - " final_recall_items_dict.setdefault(user_id, {})\n", - " for item, score in sorted_item_list:\n", - " final_recall_items_dict[user_id].setdefault(item, 0)\n", - " final_recall_items_dict[user_id][item] += recall_method_weight * score \n", - " \n", - " final_recall_items_dict_rank = {}\n", - " # 多路召回时也可以控制最终的召回数量\n", - " for user, recall_item_dict in final_recall_items_dict.items():\n", - " final_recall_items_dict_rank[user] = sorted(recall_item_dict.items(), key=lambda x: x[1], reverse=True)[:topk]\n", - "\n", - " # 将多路召回后的最终结果字典保存到本地\n", - " pickle.dump(final_recall_items_dict_rank, open(os.path.join(save_path, 'final_recall_items_dict.pkl'),'wb'))\n", - "\n", - " return final_recall_items_dict_rank" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T07:02:21.078455Z", - "start_time": "2020-11-17T07:02:21.074060Z" - } - }, - "outputs": [], - "source": [ - "# 这里直接对多路召回的权重给了一个相同的值,其实可以根据前面召回的情况来调整参数的值\n", - "weight_dict = {'itemcf_sim_itemcf_recall': 1.0,\n", - " 'embedding_sim_item_recall': 1.0,\n", - " 'youtubednn_recall': 1.0,\n", - " 'youtubednn_usercf_recall': 1.0, \n", - " 'cold_start_recall': 1.0}" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T07:04:35.747924Z", - 
"start_time": "2020-11-17T07:02:26.889573Z" - } - }, - "outputs": [ + "cells": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/5 [00:00\n", + "\n", + "上图只是一个多路召回的例子,也就是说可以使用多种不同的策略来获取用户排序的候选商品集合,而具体使用哪些召回策略其实是与业务强相关的 ,针对不同的任务就会有对于该业务真实场景下需要考虑的召回规则。例如新闻推荐,召回规则可以是“热门新闻”、“作者召回”、“关键词召回”、“主题召回“、”协同过滤召回“等等。 \n", + "\n" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "多路召回合并...\n", - "itemcf_sim_itemcf_recall...\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 导包" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - " 20%|██ | 1/5 [00:08<00:34, 8.66s/it]" - ] + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:29.834662Z", + "start_time": "2020-11-16T11:26:27.811511Z" + } + }, + "outputs": [], + "source": [ + "import pandas as pd \n", + "import numpy as np\n", + "from tqdm import tqdm \n", + "from collections import defaultdict \n", + "import os, math, warnings, math, pickle\n", + "from tqdm import tqdm\n", + "import faiss\n", + "import collections\n", + "import random\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from datetime import datetime\n", + "from deepctr.feature_column import SparseFeat, VarLenSparseFeat\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from tensorflow.python.keras import backend as K\n", + "from tensorflow.python.keras.models import Model\n", + "from tensorflow.python.keras.preprocessing.sequence import pad_sequences\n", + "\n", + "from deepmatch.models import *\n", + "from deepmatch.utils import sampledsoftmaxloss\n", + "warnings.filterwarnings('ignore')" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "embedding_sim_item_recall...\n" - ] + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:31.831215Z", + "start_time": 
"2020-11-16T11:26:31.826939Z" + } + }, + "outputs": [], + "source": [ + "data_path = './data_raw/'\n", + "save_path = './temp_results/'\n", + "# 做召回评估的一个标志, 如果不进行评估就是直接使用全量数据进行召回\n", + "metric_recall = False" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - " 40%|████ | 2/5 [00:16<00:24, 8.29s/it]" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取数据\n", + "在一般的rs比赛中读取数据部分主要分为三种模式, 不同的模式对应的不同的数据集:\n", + "1. debug模式: 这个的目的是帮助我们基于数据先搭建一个简易的baseline并跑通, 保证写的baseline代码没有什么问题。 由于推荐比赛的数据往往非常巨大, 如果一上来直接采用全部的数据进行分析,搭建baseline框架, 往往会带来时间和设备上的损耗, **所以这时候我们往往需要从海量数据的训练集中随机抽取一部分样本来进行调试(train_click_log_sample)**, 先跑通一个baseline。\n", + "2. 线下验证模式: 这个的目的是帮助我们在线下基于已有的训练集数据, 来选择好合适的模型和一些超参数。 **所以我们这一块只需要加载整个训练集(train_click_log)**, 然后把整个训练集再分成训练集和验证集。 训练集是模型的训练数据, 验证集部分帮助我们调整模型的参数和其他的一些超参数。\n", + "3. 线上模式: 我们用debug模式搭建起一个推荐系统比赛的baseline, 用线下验证模式选择好了模型和一些超参数, 这一部分就是真正的对于给定的测试集进行预测, 提交到线上, **所以这一块使用的训练数据集是全量的数据集(train_click_log+test_click_log)**\n", + "\n", + "下面就分别对这三种不同的数据读取模式先建立不同的代导入函数, 方便后面针对不同的模式下导入数据。" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "youtubednn_recall...\n", - "youtubednn_usercf_recall...\n" - ] + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:34.476240Z", + "start_time": "2020-11-16T11:26:34.467352Z" + } + }, + "outputs": [], + "source": [ + "# debug模式: 从训练集中划出一部分数据来调试代码\n", + "def get_all_click_sample(data_path, sample_nums=10000):\n", + " \"\"\"\n", + " 训练集中采样一部分数据调试\n", + " data_path: 原数据的存储路径\n", + " sample_nums: 采样数目(这里由于机器的内存限制,可以采样用户做)\n", + " \"\"\"\n", + " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " all_user_ids = all_click.user_id.unique()\n", + "\n", + " sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False) \n", + " all_click = all_click[all_click['user_id'].isin(sample_user_ids)]\n", + " \n", + " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 
'click_timestamp']))\n", + " return all_click\n", + "\n", + "# 读取点击数据,这里分成线上和线下,如果是为了获取线上提交结果应该讲测试集中的点击数据合并到总的数据中\n", + "# 如果是为了线下验证模型的有效性或者特征的有效性,可以只使用训练集\n", + "def get_all_click_df(data_path='./data_raw/', offline=True):\n", + " if offline:\n", + " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " else:\n", + " trn_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", + "\n", + " all_click = trn_click.append(tst_click)\n", + " \n", + " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", + " return all_click" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - " 80%|████████ | 4/5 [00:23<00:06, 6.98s/it]" - ] + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:35.168738Z", + "start_time": "2020-11-16T11:26:35.163210Z" + } + }, + "outputs": [], + "source": [ + "# 读取文章的基本属性\n", + "def get_item_info_df(data_path):\n", + " item_info_df = pd.read_csv(data_path + 'articles.csv')\n", + " \n", + " # 为了方便与训练集中的click_article_id拼接,需要把article_id修改成click_article_id\n", + " item_info_df = item_info_df.rename(columns={'article_id': 'click_article_id'})\n", + " \n", + " return item_info_df" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "cold_start_recall...\n" - ] + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:36.152958Z", + "start_time": "2020-11-16T11:26:36.146324Z" + } + }, + "outputs": [], + "source": [ + "# 读取文章的Embedding数据\n", + "def get_item_emb_dict(data_path):\n", + " item_emb_df = pd.read_csv(data_path + 'articles_emb.csv')\n", + " \n", + " item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]\n", + " item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])\n", + " # 进行归一化\n", + " item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, 
keepdims=True)\n", + "\n", + " item_emb_dict = dict(zip(item_emb_df['article_id'], item_emb_np))\n", + " pickle.dump(item_emb_dict, open(save_path + 'item_content_emb.pkl', 'wb'))\n", + " \n", + " return item_emb_dict" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:42<00:00, 8.40s/it]\n" - ] - } - ], - "source": [ - "# 最终合并之后每个用户召回150个商品进行排序\n", - "final_recall_items_dict_rank = combine_recall_results(user_multi_recall_dict, weight_dict, topk=150)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 总结\n", - "\n", - "上述实现了如下召回策略:\n", - "\n", - "1. 基于关联规则的itemcf\n", - "2. 基于关联规则的usercf\n", - "3. youtubednn召回\n", - "4. 冷启动召回\n", - "\n", - "对于上述实现的召回策略其实都不是最优的结果,我们只是做了个简单的尝试,其中还有很多地方可以优化,包括已经实现的这些召回策略的参数或者新加一些,修改一些关联规则都可以。当然还可以尝试更多的召回策略,比如对新闻进行热度召回等等。\n", - "\n", - "\n", - "\n", - "**关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", - "\n", - "![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - }, - "latex_envs": { - "LaTeX_envs_menu_present": true, - "autoclose": false, - "autocomplete": true, - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 1, - "hotkeys": { - 
"equation": "Ctrl-E", - "itemize": "Ctrl-I" - }, - "labels_anchors": false, - "latex_user_defs": false, - "report_style_numbering": false, - "user_envs_cfg": false - }, - "nbTranslate": { - "displayLangs": [ - "*" - ], - "hotkey": "alt-t", - "langInMainMenu": true, - "sourceLang": "en", - "targetLang": "fr", - "useGoogleTranslate": true - }, - "tianchi_metadata": { - "competitions": [], - "datasets": [ + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:37.333536Z", + "start_time": "2020-11-16T11:26:37.329545Z" + } + }, + "outputs": [], + "source": [ + "max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:42.163494Z", + "start_time": "2020-11-16T11:26:38.018094Z" + } + }, + "outputs": [], + "source": [ + "# 采样数据\n", + "# all_click_df = get_all_click_sample(data_path)\n", + "\n", + "# 全量训练集\n", + "all_click_df = get_all_click_df(offline=False)\n", + "\n", + "# 对时间戳进行归一化,用于在关联规则的时候计算权重\n", + "all_click_df['click_timestamp'] = all_click_df[['click_timestamp']].apply(max_min_scaler)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:44.343500Z", + "start_time": "2020-11-16T11:26:44.113891Z" + } + }, + "outputs": [], + "source": [ + "item_info_df = get_item_info_df(data_path)" + ] + }, { - "id": "83580", - "title": "零基础入门推荐系统 - 新闻推荐" + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:24.295343Z", + "start_time": "2020-11-16T11:26:44.398007Z" + } + }, + "outputs": [], + "source": [ + "item_emb_dict = get_item_emb_dict(data_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 工具函数" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取用户-文章-时间函数\n", + "这个在基于关联规则的用户协同过滤的时候会用到" + 
] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:33.791656Z", + "start_time": "2020-11-16T11:27:33.784305Z" + } + }, + "outputs": [], + "source": [ + "# 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", + "def get_user_item_time(click_df):\n", + " \n", + " click_df = click_df.sort_values('click_timestamp')\n", + " \n", + " def make_item_time_pair(df):\n", + " return list(zip(df['click_article_id'], df['click_timestamp']))\n", + " \n", + " user_item_time_df = click_df.groupby('user_id')['click_article_id', 'click_timestamp'].apply(lambda x: make_item_time_pair(x))\\\n", + " .reset_index().rename(columns={0: 'item_time_list'})\n", + " user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))\n", + " \n", + " return user_item_time_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取文章-用户-时间函数\n", + "这个在基于关联规则的文章协同过滤的时候会用到" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:38.327581Z", + "start_time": "2020-11-16T11:27:38.321059Z" + } + }, + "outputs": [], + "source": [ + "# 根据时间获取商品被点击的用户序列 {item1: [(user1, time1), (user2, time2)...]...}\n", + "# 这里的时间是用户点击当前商品的时间,好像没有直接的关系。\n", + "def get_item_user_time_dict(click_df):\n", + " def make_user_time_pair(df):\n", + " return list(zip(df['user_id'], df['click_timestamp']))\n", + " \n", + " click_df = click_df.sort_values('click_timestamp')\n", + " item_user_time_df = click_df.groupby('click_article_id')['user_id', 'click_timestamp'].apply(lambda x: make_user_time_pair(x))\\\n", + " .reset_index().rename(columns={0: 'user_time_list'})\n", + " \n", + " item_user_time_dict = dict(zip(item_user_time_df['click_article_id'], item_user_time_df['user_time_list']))\n", + " return item_user_time_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
获取历史和最后一次点击\n", + "这个在评估召回结果, 特征工程和制作标签转成监督学习测试集的时候会用到" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:50.894683Z", + "start_time": "2020-11-16T11:27:50.888002Z" + } + }, + "outputs": [], + "source": [ + "# 获取当前数据的历史点击和最后一次点击\n", + "def get_hist_and_last_click(all_click):\n", + " \n", + " all_click = all_click.sort_values(by=['user_id', 'click_timestamp'])\n", + " click_last_df = all_click.groupby('user_id').tail(1)\n", + "\n", + " # 如果用户只有一个点击,hist为空了,会导致训练的时候这个用户不可见,此时默认泄露一下\n", + " def hist_func(user_df):\n", + " if len(user_df) == 1:\n", + " return user_df\n", + " else:\n", + " return user_df[:-1]\n", + "\n", + " click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)\n", + "\n", + " return click_hist_df, click_last_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取文章属性特征" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:55.893810Z", + "start_time": "2020-11-16T11:27:55.887623Z" + } + }, + "outputs": [], + "source": [ + "# 获取文章id对应的基本属性,保存成字典的形式,方便后面召回阶段,冷启动阶段直接使用\n", + "def get_item_info_dict(item_info_df):\n", + " max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))\n", + " item_info_df['created_at_ts'] = item_info_df[['created_at_ts']].apply(max_min_scaler)\n", + " \n", + " item_type_dict = dict(zip(item_info_df['click_article_id'], item_info_df['category_id']))\n", + " item_words_dict = dict(zip(item_info_df['click_article_id'], item_info_df['words_count']))\n", + " item_created_time_dict = dict(zip(item_info_df['click_article_id'], item_info_df['created_at_ts']))\n", + " \n", + " return item_type_dict, item_words_dict, item_created_time_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T06:42:38.730939Z", + "start_time": "2020-11-13T06:42:38.728461Z" + } + }, + "source": 
[ + "### 获取用户历史点击的文章信息" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:59.650781Z", + "start_time": "2020-11-16T11:27:59.640572Z" + } + }, + "outputs": [], + "source": [ + "def get_user_hist_item_info_dict(all_click):\n", + " \n", + " # 获取user_id对应的用户历史点击文章类型的集合字典\n", + " user_hist_item_typs = all_click.groupby('user_id')['category_id'].agg(set).reset_index()\n", + " user_hist_item_typs_dict = dict(zip(user_hist_item_typs['user_id'], user_hist_item_typs['category_id']))\n", + " \n", + " # 获取user_id对应的用户点击文章的集合\n", + " user_hist_item_ids_dict = all_click.groupby('user_id')['click_article_id'].agg(set).reset_index()\n", + " user_hist_item_ids_dict = dict(zip(user_hist_item_ids_dict['user_id'], user_hist_item_ids_dict['click_article_id']))\n", + " \n", + " # 获取user_id对应的用户历史点击的文章的平均字数字典\n", + " user_hist_item_words = all_click.groupby('user_id')['words_count'].agg('mean').reset_index()\n", + " user_hist_item_words_dict = dict(zip(user_hist_item_words['user_id'], user_hist_item_words['words_count']))\n", + " \n", + " # 获取user_id对应的用户最后一次点击的文章的创建时间\n", + " all_click_ = all_click.sort_values('click_timestamp')\n", + " user_last_item_created_time = all_click_.groupby('user_id')['created_at_ts'].apply(lambda x: x.iloc[-1]).reset_index()\n", + " \n", + " max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))\n", + " user_last_item_created_time['created_at_ts'] = user_last_item_created_time[['created_at_ts']].apply(max_min_scaler)\n", + " \n", + " user_last_item_created_time_dict = dict(zip(user_last_item_created_time['user_id'], \\\n", + " user_last_item_created_time['created_at_ts']))\n", + " \n", + " return user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取点击次数最多的topk个文章" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + 
"metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:28:04.761105Z", + "start_time": "2020-11-16T11:28:04.756419Z" + } + }, + "outputs": [], + "source": [ + "# 获取近期点击最多的文章\n", + "def get_item_topk_click(click_df, k):\n", + " topk_click = click_df['click_article_id'].value_counts().index[:k]\n", + " return topk_click" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 定义多路召回字典" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:28:08.321506Z", + "start_time": "2020-11-16T11:28:07.623281Z" + } + }, + "outputs": [], + "source": [ + "# 获取文章的属性信息,保存成字典的形式方便查询\n", + "item_type_dict, item_words_dict, item_created_time_dict = get_item_info_dict(item_info_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:28:13.791569Z", + "start_time": "2020-11-16T11:28:13.786522Z" + } + }, + "outputs": [], + "source": [ + "# 定义一个多路召回的字典,将各路召回的结果都保存在这个字典当中\n", + "user_multi_recall_dict = {'itemcf_sim_itemcf_recall': {},\n", + " 'embedding_sim_item_recall': {},\n", + " 'youtubednn_recall': {},\n", + " 'youtubednn_usercf_recall': {}, \n", + " 'cold_start_recall': {}}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T05:41:12.710754Z", + "start_time": "2020-11-16T05:40:57.842614Z" + } + }, + "outputs": [], + "source": [ + "# 提取最后一次点击作为召回评估,如果不需要做召回评估直接使用全量的训练集进行召回(线下验证模型)\n", + "# 如果不是召回评估,直接使用全量数据进行召回,不用将最后一次提取出来\n", + "trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 召回效果评估函数\n", + "做完了召回有时候也需要对当前的召回方法或者参数进行调整以达到更好的召回效果,因为召回的结果决定了最终排序的上限,下面也会提供一个召回评估的方法" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T05:41:18.579118Z", + "start_time": 
"2020-11-16T05:41:18.571887Z" + } + }, + "outputs": [], + "source": [ + "# 依次评估召回的前10, 20, 30, 40, 50个文章中的击中率\n", + "def metrics_recall(user_recall_items_dict, trn_last_click_df, topk=5):\n", + " last_click_item_dict = dict(zip(trn_last_click_df['user_id'], trn_last_click_df['click_article_id']))\n", + " user_num = len(user_recall_items_dict)\n", + " \n", + " for k in range(10, topk+1, 10):\n", + " hit_num = 0\n", + " for user, item_list in user_recall_items_dict.items():\n", + " # 获取前k个召回的结果\n", + " tmp_recall_items = [x[0] for x in user_recall_items_dict[user][:k]]\n", + " if last_click_item_dict[user] in set(tmp_recall_items):\n", + " hit_num += 1\n", + " \n", + " hit_rate = round(hit_num * 1.0 / user_num, 5)\n", + " print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 计算相似性矩阵\n", + "\n", + "这一部分主要是通过协同过滤以及向量检索得到相似性矩阵,相似性矩阵主要分为user2user和item2item,下面依次获取基于itemcf的item2item的相似性矩阵," + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### itemcf i2i_sim\n", + "\n", + "借鉴KDD2020的去偏商品推荐,在计算item2item相似性矩阵时,使用关联规则,使得计算的文章的相似性还考虑到了:\n", + "1. 用户点击的时间权重\n", + "2. 用户点击的顺序权重\n", + "3. 
文章创建的时间权重" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:30:51.872262Z", + "start_time": "2020-11-16T11:30:51.860099Z" + } + }, + "outputs": [], + "source": [ + "def itemcf_sim(df, item_created_time_dict):\n", + " \"\"\"\n", + " 文章与文章之间的相似性矩阵计算\n", + " :param df: 数据表\n", + " :item_created_time_dict: 文章创建时间的字典\n", + " return : 文章与文章的相似性矩阵\n", + " \n", + " 思路: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习) + 关联规则\n", + " \"\"\"\n", + " \n", + " user_item_time_dict = get_user_item_time(df)\n", + " \n", + " # 计算物品相似度\n", + " i2i_sim = {}\n", + " item_cnt = defaultdict(int)\n", + " for user, item_time_list in tqdm(user_item_time_dict.items()):\n", + " # 在基于商品的协同过滤优化的时候可以考虑时间因素\n", + " for loc1, (i, i_click_time) in enumerate(item_time_list):\n", + " item_cnt[i] += 1\n", + " i2i_sim.setdefault(i, {})\n", + " for loc2, (j, j_click_time) in enumerate(item_time_list):\n", + " if(i == j):\n", + " continue\n", + " \n", + " # 考虑文章的正向顺序点击和反向顺序点击 \n", + " loc_alpha = 1.0 if loc2 > loc1 else 0.7\n", + " # 位置信息权重,其中的参数可以调节\n", + " loc_weight = loc_alpha * (0.9 ** (np.abs(loc2 - loc1) - 1))\n", + " # 点击时间权重,其中的参数可以调节\n", + " click_time_weight = np.exp(0.7 ** np.abs(i_click_time - j_click_time))\n", + " # 两篇文章创建时间的权重,其中的参数可以调节\n", + " created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))\n", + " i2i_sim[i].setdefault(j, 0)\n", + " # 考虑多种因素的权重计算最终的文章之间的相似度\n", + " i2i_sim[i][j] += loc_weight * click_time_weight * created_time_weight / math.log(len(item_time_list) + 1)\n", + " \n", + " i2i_sim_ = i2i_sim.copy()\n", + " for i, related_items in i2i_sim.items():\n", + " for j, wij in related_items.items():\n", + " i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])\n", + " \n", + " # 将得到的相似性矩阵保存到本地\n", + " pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))\n", + " \n", + " return i2i_sim_" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + 
"metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:47:09.937002Z", + "start_time": "2020-11-16T11:30:57.394334Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [14:20<00:00, 290.38it/s]\n" + ] + } + ], + "source": [ + "i2i_sim = itemcf_sim(all_click_df, item_created_time_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### usercf u2u_sim\n", + "\n", + "在计算用户之间的相似度的时候,也可以使用一些简单的关联规则,比如用户活跃度权重,这里将用户的点击次数作为用户活跃度的指标" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T09:11:14.951940Z", + "start_time": "2020-11-16T09:11:14.945654Z" + } + }, + "outputs": [], + "source": [ + "def get_user_activate_degree_dict(all_click_df):\n", + " all_click_df_ = all_click_df.groupby('user_id')['click_article_id'].count().reset_index()\n", + " \n", + " # 用户活跃度归一化\n", + " mm = MinMaxScaler()\n", + " all_click_df_['click_article_id'] = mm.fit_transform(all_click_df_[['click_article_id']])\n", + " user_activate_degree_dict = dict(zip(all_click_df_['user_id'], all_click_df_['click_article_id']))\n", + " \n", + " return user_activate_degree_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T09:11:19.879276Z", + "start_time": "2020-11-16T09:11:19.868808Z" + } + }, + "outputs": [], + "source": [ + "def usercf_sim(all_click_df, user_activate_degree_dict):\n", + " \"\"\"\n", + " 用户相似性矩阵计算\n", + " :param all_click_df: 数据表\n", + " :param user_activate_degree_dict: 用户活跃度的字典\n", + " return 用户相似性矩阵\n", + " \n", + " 思路: 基于用户的协同过滤(详细请参考上一期推荐系统基础的组队学习) + 关联规则\n", + " \"\"\"\n", + " item_user_time_dict = get_item_user_time_dict(all_click_df)\n", + " \n", + " u2u_sim = {}\n", + " user_cnt = defaultdict(int)\n", + " for item, user_time_list in tqdm(item_user_time_dict.items()):\n", + " for u, click_time in user_time_list:\n", + " user_cnt[u] += 
1\n", + " u2u_sim.setdefault(u, {})\n", + " for v, click_time in user_time_list:\n", + " u2u_sim[u].setdefault(v, 0)\n", + " if u == v:\n", + " continue\n", + " # 用户平均活跃度作为活跃度的权重,这里的式子也可以改善\n", + " activate_weight = 100 * 0.5 * (user_activate_degree_dict[u] + user_activate_degree_dict[v]) \n", + " u2u_sim[u][v] += activate_weight / math.log(len(user_time_list) + 1)\n", + " \n", + " u2u_sim_ = u2u_sim.copy()\n", + " for u, related_users in u2u_sim.items():\n", + " for v, wij in related_users.items():\n", + " u2u_sim_[u][v] = wij / math.sqrt(user_cnt[u] * user_cnt[v])\n", + " \n", + " # 将得到的相似性矩阵保存到本地\n", + " pickle.dump(u2u_sim_, open(save_path + 'usercf_u2u_sim.pkl', 'wb'))\n", + "\n", + " return u2u_sim_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T06:59:46.701572Z", + "start_time": "2020-11-16T06:59:26.852246Z" + } + }, + "outputs": [], + "source": [ + "# 由于usercf计算时候太耗费内存了,这里就不直接运行了\n", + "# 如果是采样的话,是可以运行的\n", + "user_activate_degree_dict = get_user_activate_degree_dict(all_click_df)\n", + "u2u_sim = usercf_sim(all_click_df, user_activate_degree_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### item embedding sim\n", + "\n", + "使用Embedding计算item之间的相似度是为了后续冷启动的时候可以获取未出现在点击数据中的文章,后面有对冷启动专门的介绍,这里简单的说一下faiss。\n", + "\n", + "Faiss是Facebook的AI团队开源的一套用于做聚类或者相似性搜索的软件库,底层是用C++实现。Faiss因为超级优越的性能,被广泛应用于推荐相关的业务当中.\n", + "\n", + "faiss工具包一般使用在推荐系统中的向量召回部分。在做向量召回的时候要么是u2u,u2i或者i2i,这里的u和i指的是user和item.我们知道在实际的场景中user和item的数量都是海量的,我们最容易想到的基于向量相似度的召回就是使用两层循环遍历user列表或者item列表计算两个向量的相似度,但是这样做在面对海量数据时是不切实际的,faiss就是用来加速计算某个查询向量最相似的topk个索引向量。\n", + "\n", + "**faiss查询的原理:**\n", + "\n", + "faiss使用了PCA和PQ(Product quantization乘积量化)两种技术进行向量压缩和编码,当然还使用了其他的技术进行优化,但是PCA和PQ是其中最核心部分。\n", + "\n", + "1. 
PCA降维算法细节参考下面这个链接进行学习 \n", + "[主成分分析(PCA)原理总结](https://www.cnblogs.com/pinard/p/6239403.html) \n", + "\n", + "2. PQ编码的细节下面这个链接进行学习 \n", + "[实例理解product quantization算法](http://www.fabwrite.com/productquantization)\n", + "\n", + "**faiss使用**\n", + "\n", + "[faiss官方教程](https://github.com/facebookresearch/faiss/wiki/Getting-started)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T09:11:28.631803Z", + "start_time": "2020-11-16T09:11:28.619926Z" + } + }, + "outputs": [], + "source": [ + "# 向量检索相似度计算\n", + "# topk指的是每个item, faiss搜索后返回最相似的topk个item\n", + "def embdding_sim(click_df, item_emb_df, save_path, topk):\n", + " \"\"\"\n", + " 基于内容的文章embedding相似性矩阵计算\n", + " :param click_df: 数据表\n", + " :param item_emb_df: 文章的embedding\n", + " :param save_path: 保存路径\n", + " :patam topk: 找最相似的topk篇\n", + " return 文章相似性矩阵\n", + " \n", + " 思路: 对于每一篇文章, 基于embedding的相似性返回topk个与其最相似的文章, 只不过由于文章数量太多,这里用了faiss进行加速\n", + " \"\"\"\n", + " \n", + " # 文章索引与文章id的字典映射\n", + " item_idx_2_rawid_dict = dict(zip(item_emb_df.index, item_emb_df['article_id']))\n", + " \n", + " item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]\n", + " item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols].values, dtype=np.float32)\n", + " # 向量进行单位化\n", + " item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)\n", + " \n", + " # 建立faiss索引\n", + " item_index = faiss.IndexFlatIP(item_emb_np.shape[1])\n", + " item_index.add(item_emb_np)\n", + " # 相似度查询,给每个索引位置上的向量返回topk个item以及相似度\n", + " sim, idx = item_index.search(item_emb_np, topk) # 返回的是列表\n", + " \n", + " # 将向量检索的结果保存成原始id的对应关系\n", + " item_sim_dict = collections.defaultdict(dict)\n", + " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(range(len(item_emb_np)), sim, idx)):\n", + " target_raw_id = item_idx_2_rawid_dict[target_idx]\n", + " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", + " for rele_idx, sim_value in zip(rele_idx_list[1:], 
sim_value_list[1:]): \n", + " rele_raw_id = item_idx_2_rawid_dict[rele_idx]\n", + " item_sim_dict[target_raw_id][rele_raw_id] = item_sim_dict.get(target_raw_id, {}).get(rele_raw_id, 0) + sim_value\n", + " \n", + " # 保存i2i相似度矩阵\n", + " pickle.dump(item_sim_dict, open(save_path + 'emb_i2i_sim.pkl', 'wb')) \n", + " \n", + " return item_sim_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T09:32:35.926116Z", + "start_time": "2020-11-16T09:11:44.586967Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "364047it [00:23, 15292.14it/s]\n" + ] + } + ], + "source": [ + "item_emb_df = pd.read_csv(data_path + '/articles_emb.csv')\n", + "emb_i2i_sim = embdding_sim(all_click_df, item_emb_df, save_path, topk=10) # topk可以自行设置" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 召回\n", + "这个就是我们开篇提到的那个问题, 面对36万篇文章, 20多万用户的推荐, 我们又有哪些策略来缩减问题的规模? 我们就可以在召回阶段筛选出用户对于点击文章的候选集合, 从而降低问题的规模。召回常用的策略:\n", + "* Youtube DNN 召回\n", + "* 基于文章的召回\n", + " * 文章的协同过滤\n", + " * 基于文章embedding的召回\n", + "* 基于用户的召回\n", + " * 用户的协同过滤\n", + " * 用户embedding\n", + "\n", + "上面的各种召回方式一部分在基于用户已经看过的文章的基础上去召回与这些文章相似的一些文章, 而这个相似性的计算方式不同, 就得到了不同的召回方式, 比如文章的协同过滤, 文章内容的embedding等。还有一部分是根据用户的相似性进行推荐,对于某用户推荐与其相似的其他用户看过的文章,比如用户的协同过滤和用户embedding。 还有一种思路是类似矩阵分解的思路,先计算出用户和文章的embedding之后,就可以直接算用户和文章的相似度, 根据这个相似度进行推荐, 比如YouTube DNN。 我们下面详细来看一下每一个召回方法:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### YoutubeDNN召回\n", + "**(这一步是直接获取用户召回的候选文章列表)**\n", + "\n", + "[论文下载地址](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)\n", + "\n", + "**Youtubednn召回架构**\n", + "\n", + "![image-20201111160516562](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201111160516562.png)\n", + "\n", + "\n", + "\n", + "关于YoutubeDNN原理和应用推荐看王喆的两篇博客:\n", + "\n", + "1. 
[重读Youtube深度学习推荐系统论文,字字珠玑,惊为神文](https://zhuanlan.zhihu.com/p/52169807)\n", + "2. [YouTube深度学习推荐系统的十大工程问题](https://zhuanlan.zhihu.com/p/52504407)\n", + "\n", + "\n", + "**参考文献:**\n", + "1. https://zhuanlan.zhihu.com/p/52169807 (YouTubeDNN原理)\n", + "2. https://zhuanlan.zhihu.com/p/26306795 (Word2Vec知乎众赞文章) --- word2vec放到排序中的w2v的介绍部分\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:13:11.058766Z", + "start_time": "2020-11-16T10:13:11.041084Z" + } + }, + "outputs": [], + "source": [ + "# 获取双塔召回时的训练验证数据\n", + "# negsample指的是通过滑窗构建样本的时候,负样本的数量\n", + "def gen_data_set(data, negsample=0):\n", + " data.sort_values(\"click_timestamp\", inplace=True)\n", + " item_ids = data['click_article_id'].unique()\n", + "\n", + " train_set = []\n", + " test_set = []\n", + " for reviewerID, hist in tqdm(data.groupby('user_id')):\n", + " pos_list = hist['click_article_id'].tolist()\n", + " \n", + " if negsample > 0:\n", + " candidate_set = list(set(item_ids) - set(pos_list)) # 用户没看过的文章里面选择负样本\n", + " neg_list = np.random.choice(candidate_set,size=len(pos_list)*negsample,replace=True) # 对于每个正样本,选择n个负样本\n", + " \n", + " # 长度只有一个的时候,需要把这条数据也放到训练集中,不然的话最终学到的embedding就会有缺失\n", + " if len(pos_list) == 1:\n", + " train_set.append((reviewerID, [pos_list[0]], pos_list[0],1,len(pos_list)))\n", + " test_set.append((reviewerID, [pos_list[0]], pos_list[0],1,len(pos_list)))\n", + " \n", + " # 滑窗构造正负样本\n", + " for i in range(1, len(pos_list)):\n", + " hist = pos_list[:i]\n", + " \n", + " if i != len(pos_list) - 1:\n", + " train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]))) # 正样本 [user_id, his_item, pos_item, label, len(his_item)]\n", + " for negi in range(negsample):\n", + " train_set.append((reviewerID, hist[::-1], neg_list[i*negsample+negi], 0,len(hist[::-1]))) # 负样本 [user_id, his_item, neg_item, label, len(his_item)]\n", + " else:\n", + " # 将最长的那一个序列长度作为测试数据\n", + " test_set.append((reviewerID, 
hist[::-1], pos_list[i],1,len(hist[::-1])))\n", + " \n", + " random.shuffle(train_set)\n", + " random.shuffle(test_set)\n", + " \n", + " return train_set, test_set\n", + "\n", + "# 将输入的数据进行padding,使得序列特征的长度都一致\n", + "def gen_model_input(train_set,user_profile,seq_max_len):\n", + "\n", + " train_uid = np.array([line[0] for line in train_set])\n", + " train_seq = [line[1] for line in train_set]\n", + " train_iid = np.array([line[2] for line in train_set])\n", + " train_label = np.array([line[3] for line in train_set])\n", + " train_hist_len = np.array([line[4] for line in train_set])\n", + "\n", + " train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)\n", + " train_model_input = {\"user_id\": train_uid, \"click_article_id\": train_iid, \"hist_article_id\": train_seq_pad,\n", + " \"hist_len\": train_hist_len}\n", + "\n", + " return train_model_input, train_label" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:13:18.124452Z", + "start_time": "2020-11-16T10:13:18.098284Z" + } + }, + "outputs": [], + "source": [ + "def youtubednn_u2i_dict(data, topk=20): \n", + " sparse_features = [\"click_article_id\", \"user_id\"]\n", + " SEQ_LEN = 30 # 用户点击序列的长度,短的填充,长的截断\n", + " \n", + " user_profile_ = data[[\"user_id\"]].drop_duplicates('user_id')\n", + " item_profile_ = data[[\"click_article_id\"]].drop_duplicates('click_article_id') \n", + " \n", + " # 类别编码\n", + " features = [\"click_article_id\", \"user_id\"]\n", + " feature_max_idx = {}\n", + " \n", + " for feature in features:\n", + " lbe = LabelEncoder()\n", + " data[feature] = lbe.fit_transform(data[feature])\n", + " feature_max_idx[feature] = data[feature].max() + 1\n", + " \n", + " # 提取user和item的画像,这里具体选择哪些特征还需要进一步的分析和考虑\n", + " user_profile = data[[\"user_id\"]].drop_duplicates('user_id')\n", + " item_profile = data[[\"click_article_id\"]].drop_duplicates('click_article_id') \n", + " \n", + " 
user_index_2_rawid = dict(zip(user_profile['user_id'], user_profile_['user_id']))\n", + " item_index_2_rawid = dict(zip(item_profile['click_article_id'], item_profile_['click_article_id']))\n", + " \n", + " # 划分训练和测试集\n", + " # 由于深度学习需要的数据量通常都是非常大的,所以为了保证召回的效果,往往会通过滑窗的形式扩充训练样本\n", + " train_set, test_set = gen_data_set(data, 0)\n", + " # 整理输入数据,具体的操作可以看上面的函数\n", + " train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)\n", + " test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)\n", + " \n", + " # 确定Embedding的维度\n", + " embedding_dim = 16\n", + " \n", + " # 将数据整理成模型可以直接输入的形式\n", + " user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),\n", + " VarLenSparseFeat(SparseFeat('hist_article_id', feature_max_idx['click_article_id'], embedding_dim,\n", + " embedding_name=\"click_article_id\"), SEQ_LEN, 'mean', 'hist_len'),]\n", + " item_feature_columns = [SparseFeat('click_article_id', feature_max_idx['click_article_id'], embedding_dim)]\n", + " \n", + " # 模型的定义 \n", + " # num_sampled: 负采样时的样本数量\n", + " model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, user_dnn_hidden_units=(64, embedding_dim))\n", + " # 模型编译\n", + " model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss) \n", + " \n", + " # 模型训练,这里可以定义验证集的比例,如果设置为0的话就是全量数据直接进行训练\n", + " history = model.fit(train_model_input, train_label, batch_size=256, epochs=1, verbose=1, validation_split=0.0)\n", + " \n", + " # 训练完模型之后,提取训练的Embedding,包括user端和item端\n", + " test_user_model_input = test_model_input\n", + " all_item_model_input = {\"click_article_id\": item_profile['click_article_id'].values}\n", + "\n", + " user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)\n", + " item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)\n", + " \n", + " # 保存当前的item_embedding 和 user_embedding 排序的时候可能能够用到,但是需要注意保存的时候需要和原始的id对应\n", + " user_embs = 
user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)\n", + " item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)\n", + " \n", + " # embedding保存之前归一化一下\n", + " user_embs = user_embs / np.linalg.norm(user_embs, axis=1, keepdims=True)\n", + " item_embs = item_embs / np.linalg.norm(item_embs, axis=1, keepdims=True)\n", + " \n", + " # 将Embedding转换成字典的形式方便查询\n", + " raw_user_id_emb_dict = {user_index_2_rawid[k]: \\\n", + " v for k, v in zip(user_profile['user_id'], user_embs)}\n", + " raw_item_id_emb_dict = {item_index_2_rawid[k]: \\\n", + " v for k, v in zip(item_profile['click_article_id'], item_embs)}\n", + " # 将Embedding保存到本地\n", + " pickle.dump(raw_user_id_emb_dict, open(save_path + 'user_youtube_emb.pkl', 'wb'))\n", + " pickle.dump(raw_item_id_emb_dict, open(save_path + 'item_youtube_emb.pkl', 'wb'))\n", + " \n", + " # faiss紧邻搜索,通过user_embedding 搜索与其相似性最高的topk个item\n", + " index = faiss.IndexFlatIP(embedding_dim)\n", + " # 上面已经进行了归一化,这里可以不进行归一化了\n", + "# faiss.normalize_L2(user_embs)\n", + "# faiss.normalize_L2(item_embs)\n", + " index.add(item_embs) # 将item向量构建索引\n", + " sim, idx = index.search(np.ascontiguousarray(user_embs), topk) # 通过user去查询最相似的topk个item\n", + " \n", + " user_recall_items_dict = collections.defaultdict(dict)\n", + " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(test_user_model_input['user_id'], sim, idx)):\n", + " target_raw_id = user_index_2_rawid[target_idx]\n", + " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", + " for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): \n", + " rele_raw_id = item_index_2_rawid[rele_idx]\n", + " user_recall_items_dict[target_raw_id][rele_raw_id] = user_recall_items_dict.get(target_raw_id, {})\\\n", + " .get(rele_raw_id, 0) + sim_value\n", + " \n", + " user_recall_items_dict = {k: sorted(v.items(), key=lambda x: x[1], reverse=True) for k, v in user_recall_items_dict.items()}\n", + " # 将召回的结果进行排序\n", + " \n", + " # 保存召回的结果\n", + " # 
这里是直接通过向量的方式得到了召回结果,相比于上面的召回方法,上面的只是得到了i2i及u2u的相似性矩阵,还需要进行协同过滤召回才能得到召回结果\n", + " # 可以直接对这个召回结果进行评估,为了方便可以统一写一个评估函数对所有的召回结果进行评估\n", + " pickle.dump(user_recall_items_dict, open(save_path + 'youtube_u2i_dict.pkl', 'wb'))\n", + " return user_recall_items_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:21:46.420014Z", + "start_time": "2020-11-16T10:13:35.351131Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [02:02<00:00, 2038.57it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:143: calling RandomNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:253: calling reduce_sum_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "keep_dims is deprecated, use keepdims instead\n", + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:253: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Deprecated in favor of operator or tf.math.divide.\n", + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1288: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for 
updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "1149673/1149673 [==============================] - 216s 188us/sample - loss: 0.1326\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "250000it [00:32, 7720.75it/s]\n" + ] + } + ], + "source": [ + "# 由于这里需要做召回评估,所以将训练集中的最后一次点击都提取了出来\n", + "if not metric_recall:\n", + " user_multi_recall_dict['youtubednn_recall'] = youtubednn_u2i_dict(all_click_df, topk=20)\n", + "else:\n", + " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", + " user_multi_recall_dict['youtubednn_recall'] = youtubednn_u2i_dict(trn_hist_click_df, topk=20)\n", + " # 召回效果评估\n", + " metrics_recall(user_multi_recall_dict['youtubednn_recall'], trn_last_click_df, topk=20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### itemcf recall\n", + "\n", + "上面已经通过协同过滤,Embedding检索的方式得到了文章的相似度矩阵,下面使用协同过滤的思想,给用户召回与其历史文章相似的文章。\n", + "这里在召回的时候,也是用了关联规则的方式:\n", + "1. 考虑相似文章与历史点击文章顺序的权重(细节看代码)\n", + "2. 考虑文章创建时间的权重,也就是考虑相似文章与历史点击文章创建时间差的权重\n", + "3. 
考虑文章内容相似度权重(使用Embedding计算相似文章相似度,但是这里需要注意,在Embedding的时候并没有计算所有商品两两之间的相似度,所以相似的文章与历史点击文章不存在相似度,需要做特殊处理)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:48:40.580553Z", + "start_time": "2020-11-16T11:48:40.567130Z" + } + }, + "outputs": [], + "source": [ + "# 基于商品的召回i2i\n", + "def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim):\n", + " \"\"\"\n", + " 基于文章协同过滤的召回\n", + " :param user_id: 用户id\n", + " :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", + " :param i2i_sim: 字典,文章相似性矩阵\n", + " :param sim_item_topk: 整数, 选择与当前文章最相似的前k篇文章\n", + " :param recall_item_num: 整数, 最后的召回文章数量\n", + " :param item_topk_click: 列表,点击次数最多的文章列表,用户召回补全\n", + " :param emb_i2i_sim: 字典基于内容embedding算的文章相似矩阵\n", + " \n", + " return: 召回的文章列表 [(item1, score1), (item2, score2)...]\n", + " \"\"\"\n", + " # 获取用户历史交互的文章\n", + " user_hist_items = user_item_time_dict[user_id]\n", + " user_hist_items_ = {user_id for user_id, _ in user_hist_items}\n", + " \n", + " item_rank = {}\n", + " for loc, (i, click_time) in enumerate(user_hist_items):\n", + " for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:\n", + " if j in user_hist_items_:\n", + " continue\n", + " \n", + " # 文章创建时间差权重\n", + " created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))\n", + " # 相似文章和历史点击文章序列中历史文章所在的位置权重\n", + " loc_weight = (0.9 ** (len(user_hist_items) - loc))\n", + " \n", + " content_weight = 1.0\n", + " if emb_i2i_sim.get(i, {}).get(j, None) is not None:\n", + " content_weight += emb_i2i_sim[i][j]\n", + " if emb_i2i_sim.get(j, {}).get(i, None) is not None:\n", + " content_weight += emb_i2i_sim[j][i]\n", + " \n", + " item_rank.setdefault(j, 0)\n", + " item_rank[j] += created_time_weight * loc_weight * content_weight * wij\n", + 
" \n", + " # 不足10个,用热门商品补全\n", + " if len(item_rank) < recall_item_num:\n", + " for i, item in enumerate(item_topk_click):\n", + " if item in item_rank.items(): # 填充的item应该不在原来的列表中\n", + " continue\n", + " item_rank[item] = - i - 100 # 随便给个负数就行\n", + " if len(item_rank) == recall_item_num:\n", + " break\n", + " \n", + " item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]\n", + " \n", + " return item_rank" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### itemcf sim召回" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T14:41:23.433038Z", + "start_time": "2020-11-16T11:48:46.286350Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [2:51:13<00:00, 24.33it/s] \n" + ] + } + ], + "source": [ + "# 先进行itemcf召回, 为了召回评估,所以提取最后一次点击\n", + "\n", + "if metric_recall:\n", + " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", + "else:\n", + " trn_hist_click_df = all_click_df\n", + "\n", + "user_recall_items_dict = collections.defaultdict(dict)\n", + "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", + "\n", + "i2i_sim = pickle.load(open(save_path + 'itemcf_i2i_sim.pkl', 'rb'))\n", + "emb_i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl', 'rb'))\n", + "\n", + "sim_item_topk = 20\n", + "recall_item_num = 10\n", + "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", + "\n", + "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", + " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, \\\n", + " i2i_sim, sim_item_topk, recall_item_num, \\\n", + " item_topk_click, item_created_time_dict, emb_i2i_sim)\n", + "\n", + "user_multi_recall_dict['itemcf_sim_itemcf_recall'] = user_recall_items_dict\n", + 
"pickle.dump(user_multi_recall_dict['itemcf_sim_itemcf_recall'], open(save_path + 'itemcf_recall_dict.pkl', 'wb'))\n", + "\n", + "if metric_recall:\n", + " # 召回效果评估\n", + " metrics_recall(user_multi_recall_dict['itemcf_sim_itemcf_recall'], trn_last_click_df, topk=recall_item_num)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### embedding sim 召回" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T15:04:51.527795Z", + "start_time": "2020-11-16T14:59:03.907519Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [04:35<00:00, 905.85it/s] \n" + ] + } + ], + "source": [ + "# 这里是为了召回评估,所以提取最后一次点击\n", + "if metric_recall:\n", + " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", + "else:\n", + " trn_hist_click_df = all_click_df\n", + "\n", + "user_recall_items_dict = collections.defaultdict(dict)\n", + "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", + "i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl','rb'))\n", + "\n", + "sim_item_topk = 20\n", + "recall_item_num = 10\n", + "\n", + "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", + "\n", + "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", + " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, sim_item_topk, \n", + " recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim)\n", + " \n", + "user_multi_recall_dict['embedding_sim_item_recall'] = user_recall_items_dict\n", + "pickle.dump(user_multi_recall_dict['embedding_sim_item_recall'], open(save_path + 'embedding_sim_item_recall.pkl', 'wb'))\n", + "\n", + "if metric_recall:\n", + " # 召回效果评估\n", + " metrics_recall(user_multi_recall_dict['embedding_sim_item_recall'], trn_last_click_df, topk=recall_item_num)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### usercf召回\n", + "\n", + "基于用户协同过滤,核心思想是给用户推荐与其相似的用户历史点击文章,因为这里涉及到了相似用户的历史文章,这里仍然可以加上一些关联规则来给用户可能点击的文章进行加权,这里使用的关联规则主要是考虑相似用户的历史点击文章与被推荐用户历史点击商品的关系权重,而这里的关系就可以直接借鉴基于物品的协同过滤相似的做法,只不过这里是对被推荐物品关系的一个累加的过程,下面是使用的一些关系权重,及相关的代码:\n", + "\n", + "1. 计算被推荐用户历史点击文章与相似用户历史点击文章的相似度,文章创建时间差,相对位置的总和,作为各自的权重" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:09:32.293990Z", + "start_time": "2020-11-17T02:09:32.278678Z" + } + }, + "outputs": [], + "source": [ + "# 基于用户的召回 u2u2i\n", + "def user_based_recommend(user_id, user_item_time_dict, u2u_sim, sim_user_topk, recall_item_num, \n", + " item_topk_click, item_created_time_dict, emb_i2i_sim):\n", + " \"\"\"\n", + " 基于文章协同过滤的召回\n", + " :param user_id: 用户id\n", + " :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", + " :param u2u_sim: 字典,文章相似性矩阵\n", + " :param sim_user_topk: 整数, 选择与当前用户最相似的前k个用户\n", + " :param recall_item_num: 整数, 最后的召回文章数量\n", + " :param item_topk_click: 列表,点击次数最多的文章列表,用户召回补全\n", + " :param item_created_time_dict: 文章创建时间列表\n", + " :param emb_i2i_sim: 字典基于内容embedding算的文章相似矩阵\n", + " \n", + " return: 召回的文章列表 [(item1, score1), (item2, score2)...]\n", + " \"\"\"\n", + " # 历史交互\n", + " user_item_time_list = user_item_time_dict[user_id] # [(item1, time1), (item2, time2)..]\n", + " user_hist_items = set([i for i, t in user_item_time_list]) # 存在一个用户与某篇文章的多次交互, 这里得去重\n", + " \n", + " items_rank = {}\n", + " for sim_u, wuv in sorted(u2u_sim[user_id].items(), key=lambda x: x[1], reverse=True)[:sim_user_topk]:\n", + " for i, click_time in user_item_time_dict[sim_u]:\n", + " if i in user_hist_items:\n", + " continue\n", + " items_rank.setdefault(i, 0)\n", + " \n", + " loc_weight = 1.0\n", + " content_weight = 1.0\n", + " created_time_weight = 1.0\n", + " \n", + " 
# 当前文章与该用户看的历史文章进行一个权重交互\n", + " for loc, (j, click_time) in enumerate(user_item_time_list):\n", + " # 点击时的相对位置权重\n", + " loc_weight += 0.9 ** (len(user_item_time_list) - loc)\n", + " # 内容相似性权重\n", + " if emb_i2i_sim.get(i, {}).get(j, None) is not None:\n", + " content_weight += emb_i2i_sim[i][j]\n", + " if emb_i2i_sim.get(j, {}).get(i, None) is not None:\n", + " content_weight += emb_i2i_sim[j][i]\n", + " \n", + " # 创建时间差权重\n", + " created_time_weight += np.exp(0.8 * np.abs(item_created_time_dict[i] - item_created_time_dict[j]))\n", + " \n", + " items_rank[i] += loc_weight * content_weight * created_time_weight * wuv\n", + " \n", + " # 热度补全\n", + " if len(items_rank) < recall_item_num:\n", + " for i, item in enumerate(item_topk_click):\n", + " if item in items_rank.items(): # 填充的item应该不在原来的列表中\n", + " continue\n", + " items_rank[item] = - i - 100 # 随便给个复数就行\n", + " if len(items_rank) == recall_item_num:\n", + " break\n", + " \n", + " items_rank = sorted(items_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num] \n", + " \n", + " return items_rank" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### usercf sim召回" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:05:41.652501Z", + "start_time": "2020-11-16T07:05:40.953871Z" + } + }, + "outputs": [], + "source": [ + "# 这里是为了召回评估,所以提取最后一次点击\n", + "# 由于usercf中计算user之间的相似度的过程太费内存了,全量数据这里就没有跑,跑了一个采样之后的数据\n", + "if metric_recall:\n", + " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", + "else:\n", + " trn_hist_click_df = all_click_df\n", + " \n", + "user_recall_items_dict = collections.defaultdict(dict)\n", + "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", + "\n", + "u2u_sim = pickle.load(open(save_path + 'usercf_u2u_sim.pkl', 'rb'))\n", + "\n", + "sim_user_topk = 20\n", + "recall_item_num = 10\n", + "item_topk_click = get_item_topk_click(trn_hist_click_df, 
k=50)\n", + "\n", + "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", + " user_recall_items_dict[user] = user_based_recommend(user, user_item_time_dict, u2u_sim, sim_user_topk, \\\n", + " recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim) \n", + "\n", + "pickle.dump(user_recall_items_dict, open(save_path + 'usercf_u2u2i_recall.pkl', 'wb'))\n", + "\n", + "if metric_recall:\n", + " # 召回效果评估\n", + " metrics_recall(user_recall_items_dict, trn_last_click_df, topk=recall_item_num)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T03:09:35.853516Z", + "start_time": "2020-11-16T03:09:35.737625Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### user embedding sim召回\n", + "\n", + "虽然没有直接跑usercf的计算用户之间的相似度,为了验证上述基于用户的协同过滤的代码,下面使用了YoutubeDNN过程中产生的user embedding来进行向量检索每个user最相似的topk个user,在使用这里得到的u2u的相似性矩阵,使用usercf进行召回,具体代码如下" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:09:46.807811Z", + "start_time": "2020-11-17T02:09:46.798033Z" + } + }, + "outputs": [], + "source": [ + "# 使用Embedding的方式获取u2u的相似性矩阵\n", + "# topk指的是每个user, faiss搜索后返回最相似的topk个user\n", + "def u2u_embdding_sim(click_df, user_emb_dict, save_path, topk):\n", + " \n", + " user_list = []\n", + " user_emb_list = []\n", + " for user_id, user_emb in user_emb_dict.items():\n", + " user_list.append(user_id)\n", + " user_emb_list.append(user_emb)\n", + " \n", + " user_index_2_rawid_dict = {k: v for k, v in zip(range(len(user_list)), user_list)} \n", + " \n", + " user_emb_np = np.array(user_emb_list, dtype=np.float32)\n", + " \n", + " # 建立faiss索引\n", + " user_index = faiss.IndexFlatIP(user_emb_np.shape[1])\n", + " user_index.add(user_emb_np)\n", + " # 相似度查询,给每个索引位置上的向量返回topk个item以及相似度\n", + " sim, idx = user_index.search(user_emb_np, topk) # 返回的是列表\n", + " \n", + " # 
将向量检索的结果保存成原始id的对应关系\n", + " user_sim_dict = collections.defaultdict(dict)\n", + " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(range(len(user_emb_np)), sim, idx)):\n", + " target_raw_id = user_index_2_rawid_dict[target_idx]\n", + " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", + " for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): \n", + " rele_raw_id = user_index_2_rawid_dict[rele_idx]\n", + " user_sim_dict[target_raw_id][rele_raw_id] = user_sim_dict.get(target_raw_id, {}).get(rele_raw_id, 0) + sim_value\n", + " \n", + " # 保存i2i相似度矩阵\n", + " pickle.dump(user_sim_dict, open(save_path + 'youtube_u2u_sim.pkl', 'wb')) \n", + " return user_sim_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:14:31.355905Z", + "start_time": "2020-11-17T02:09:53.236531Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "250000it [00:23, 10507.45it/s]\n" + ] + } + ], + "source": [ + "# 读取YoutubeDNN过程中产生的user embedding, 然后使用faiss计算用户之间的相似度\n", + "# 这里需要注意,这里得到的user embedding其实并不是很好,因为YoutubeDNN中使用的是用户点击序列来训练的user embedding,\n", + "# 如果序列普遍都比较短的话,其实效果并不是很好\n", + "user_emb_dict = pickle.load(open(save_path + 'user_youtube_emb.pkl', 'rb'))\n", + "u2u_sim = u2u_embdding_sim(all_click_df, user_emb_dict, save_path, topk=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "通过YoutubeDNN得到的user_embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:49:40.755431Z", + "start_time": "2020-11-17T02:28:47.003514Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [19:43<00:00, 211.22it/s]\n" + ] + } + ], + "source": [ + "# 使用召回评估函数验证当前召回方式的效果\n", + "if metric_recall:\n", + " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", + "else:\n", + " 
trn_hist_click_df = all_click_df\n", + "\n", + "user_recall_items_dict = collections.defaultdict(dict)\n", + "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", + "u2u_sim = pickle.load(open(save_path + 'youtube_u2u_sim.pkl', 'rb'))\n", + "\n", + "sim_user_topk = 20\n", + "recall_item_num = 10\n", + "\n", + "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", + "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", + " user_recall_items_dict[user] = user_based_recommend(user, user_item_time_dict, u2u_sim, sim_user_topk, \\\n", + " recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim)\n", + " \n", + "user_multi_recall_dict['youtubednn_usercf_recall'] = user_recall_items_dict\n", + "pickle.dump(user_multi_recall_dict['youtubednn_usercf_recall'], open(save_path + 'youtubednn_usercf_recall.pkl', 'wb'))\n", + "\n", + "if metric_recall:\n", + " # 召回效果评估\n", + " metrics_recall(user_multi_recall_dict['youtubednn_usercf_recall'], trn_last_click_df, topk=recall_item_num)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:07:44.326253Z", + "start_time": "2020-11-16T07:07:43.798931Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 冷启动问题" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**冷启动问题可以分成三类:文章冷启动,用户冷启动,系统冷启动。**\n", + "\n", + "- 文章冷启动:对于一个平台系统新加入的文章,该文章没有任何的交互记录,如何推荐给用户的问题。(对于我们场景可以认为是,日志数据中没有出现过的文章都可以认为是冷启动的文章)\n", + "- 用户冷启动:对于一个平台系统新来的用户,该用户还没有文章的交互信息,如何给该用户进行推荐。(对于我们场景就是,测试集中的用户是否在测试集对应的log数据中出现过,如果没有出现过,那么可以认为该用户是冷启动用户。但是有时候并没有这么严格,我们也可以自己设定某些指标来判别哪些用户是冷启动用户,比如通过使用时长,点击率,留存率等等)\n", + "- 系统冷启动:就是对于一个平台刚上线,还没有任何的相关历史数据,此时就是系统冷启动,其实也就是前面两种的一个综合。\n", + "\n", + "**当前场景下冷启动问题的分析:**\n", + "\n", + 
"对当前的数据进行分析会发现,日志中所有出现过的点击文章只有3w多个,而整个文章库中却有30多万,那么测试集中的用户最后一次点击是否会点击没有出现在日志中的文章呢?如果存在这种情况,说明用户点击的文章之前没有任何的交互信息,这也就是我们所说的文章冷启动。通过数据分析还可以发现,测试集用户只有一次点击的数据占得比例还不少,其实仅仅通过用户的一次点击就给用户推荐文章使用模型的方式也是比较难的,这里其实也可以考虑用户冷启动的问题,但是这里只给出物品冷启动的一些解决方案及代码,关于用户冷启动的话提一些可行性的做法。\n", + "\n", + "1. 文章冷启动(没有冷启动的探索问题) \n", + " 其实我们这里不是为了做文章的冷启动而做冷启动,而是猜测用户可能会点击一些没有在log数据中出现的文章,我们要做的就是如何从将近27万的文章中选择一些文章作为用户冷启动的文章,这里其实也可以看成是一种召回策略,我们这里就采用简单的比较好理解的基于规则的召回策略来获取用户可能点击的未出现在log数据中的文章。\n", + " 现在的问题变成了:如何给每个用户考虑从27万个商品中获取一小部分商品?随机选一些可能是一种方案。下面给出一些参考的方案。\n", + " 1. 首先基于Embedding召回一部分与用户历史相似的文章\n", + " 2. 从基于Embedding召回的文章中通过一些规则过滤掉一些文章,使得留下的文章用户更可能点击。我们这里的规则,可以是,留下那些与用户历史点击文章主题相同的文章,或者字数相差不大的文章。并且留下的文章尽量是与测试集用户最后一次点击时间更接近的文章,或者是当天的文章也行。\n", + "2. 用户冷启动 \n", + " 这里对测试集中的用户点击数据进行分析会发现,测试集中有百分之20的用户只有一次点击,那么这些点击特别少的用户的召回是不是可以单独做一些策略上的补充呢?或者是在排序后直接基于规则加上一些文章呢?这些都可以去尝试,这里没有提供具体的做法。\n", + " \n", + "**注意:** \n", + "\n", + "这里看似和基于embedding计算的item之间相似度然后做itemcf是一致的,但是现在我们的目的不一样,我们这里的目的是找到相似的向量,并且还没有出现在log日志中的商品,再加上一些其他的冷启动的策略,这里需要找回的数量会偏多一点,不然被筛选完之后可能都没有文章了" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T04:30:23.027164Z", + "start_time": "2020-11-17T04:23:09.960235Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [05:01<00:00, 828.60it/s] \n" + ] + } + ], + "source": [ + "# 先进行itemcf召回,这里不需要做召回评估,这里只是一种策略\n", + "trn_hist_click_df = all_click_df\n", + "\n", + "user_recall_items_dict = collections.defaultdict(dict)\n", + "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", + "i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl','rb'))\n", + "\n", + "sim_item_topk = 150\n", + "recall_item_num = 100 # 稍微召回多一点文章,便于后续的规则筛选\n", + "\n", + "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", + "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", + " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, 
i2i_sim, sim_item_topk, \n", + " recall_item_num, item_topk_click,item_created_time_dict, emb_i2i_sim)\n", + "pickle.dump(user_recall_items_dict, open(save_path + 'cold_start_items_raw_dict.pkl', 'wb'))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:11:39.267581Z", + "start_time": "2020-11-17T06:11:39.252563Z" + } + }, + "outputs": [], + "source": [ + "# 基于规则进行文章过滤\n", + "# 保留文章主题与用户历史浏览主题相似的文章\n", + "# 保留文章字数与用户历史浏览文章字数相差不大的文章\n", + "# 保留最后一次点击当天的文章\n", + "# 按照相似度返回最终的结果\n", + "\n", + "def get_click_article_ids_set(all_click_df):\n", + " return set(all_click_df.click_article_id.values)\n", + "\n", + "def cold_start_items(user_recall_items_dict, user_hist_item_typs_dict, user_hist_item_words_dict, \\\n", + " user_last_item_created_time_dict, item_type_dict, item_words_dict, \n", + " item_created_time_dict, click_article_ids_set, recall_item_num):\n", + " \"\"\"\n", + " 冷启动的情况下召回一些文章\n", + " :param user_recall_items_dict: 基于内容embedding相似性召回来的很多文章, 字典, {user1: [(item1, item2), ..], }\n", + " :param user_hist_item_typs_dict: 字典, 用户点击的文章的主题映射\n", + " :param user_hist_item_words_dict: 字典, 用户点击的历史文章的字数映射\n", + " :param user_last_item_created_time_idct: 字典,用户点击的历史文章创建时间映射\n", + " :param item_tpye_idct: 字典,文章主题映射\n", + " :param item_words_dict: 字典,文章字数映射\n", + " :param item_created_time_dict: 字典, 文章创建时间映射\n", + " :param click_article_ids_set: 集合,用户点击过得文章, 也就是日志里面出现过的文章\n", + " :param recall_item_num: 召回文章的数量, 这个指的是没有出现在日志里面的文章数量\n", + " \"\"\"\n", + " \n", + " cold_start_user_items_dict = {}\n", + " for user, item_list in tqdm(user_recall_items_dict.items()):\n", + " cold_start_user_items_dict.setdefault(user, [])\n", + " for item, score in item_list:\n", + " # 获取历史文章信息\n", + " hist_item_type_set = user_hist_item_typs_dict[user]\n", + " hist_mean_words = user_hist_item_words_dict[user]\n", + " hist_last_item_created_time = user_last_item_created_time_dict[user]\n", + " 
hist_last_item_created_time = datetime.fromtimestamp(hist_last_item_created_time)\n", + " \n", + " # 获取当前召回文章的信息\n", + " curr_item_type = item_type_dict[item]\n", + " curr_item_words = item_words_dict[item]\n", + " curr_item_created_time = item_created_time_dict[item]\n", + " curr_item_created_time = datetime.fromtimestamp(curr_item_created_time)\n", + "\n", + " # 首先,文章不能出现在用户的历史点击中, 然后根据文章主题,文章单词数,文章创建时间进行筛选\n", + " if curr_item_type not in hist_item_type_set or \\\n", + " item in click_article_ids_set or \\\n", + " abs(curr_item_words - hist_mean_words) > 200 or \\\n", + " abs((curr_item_created_time - hist_last_item_created_time).days) > 90: \n", + " continue\n", + " \n", + " cold_start_user_items_dict[user].append((item, score)) # {user1: [(item1, score1), (item2, score2)..]...}\n", + " \n", + " # 需要控制一下冷启动召回的数量\n", + " cold_start_user_items_dict = {k: sorted(v, key=lambda x:x[1], reverse=True)[:recall_item_num] \\\n", + " for k, v in cold_start_user_items_dict.items()}\n", + " \n", + " pickle.dump(cold_start_user_items_dict, open(save_path + 'cold_start_user_items_dict.pkl', 'wb'))\n", + " \n", + " return cold_start_user_items_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:35:38.758278Z", + "start_time": "2020-11-17T06:31:40.164332Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [01:49<00:00, 2293.37it/s]\n" + ] + } + ], + "source": [ + "all_click_df_ = all_click_df.copy()\n", + "all_click_df_ = all_click_df_.merge(item_info_df, how='left', on='click_article_id')\n", + "user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict = get_user_hist_item_info_dict(all_click_df_)\n", + "click_article_ids_set = get_click_article_ids_set(all_click_df)\n", + "# 需要注意的是\n", + "# 这里使用了很多规则来筛选冷启动的文章,所以前面再召回的阶段就应该尽可能的多召回一些文章,否则很容易被删掉\n", + "cold_start_user_items_dict 
= cold_start_items(user_recall_items_dict, user_hist_item_typs_dict, user_hist_item_words_dict, \\\n", + " user_last_item_created_time_dict, item_type_dict, item_words_dict, \\\n", + " item_created_time_dict, click_article_ids_set, recall_item_num)\n", + "\n", + "user_multi_recall_dict['cold_start_recall'] = cold_start_user_items_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:13:33.099298Z", + "start_time": "2020-11-16T07:13:32.655036Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 多路召回合并\n", + "多路召回合并就是将前面所有的召回策略得到的用户文章列表合并起来,下面是对前面所有召回结果的汇总\n", + "1. 基于itemcf计算的item之间的相似度sim进行的召回 \n", + "2. 基于embedding搜索得到的item之间的相似度进行的召回\n", + "3. YoutubeDNN召回\n", + "4. YoutubeDNN得到的user之间的相似度进行的召回\n", + "5. 基于冷启动策略的召回\n", + "\n", + "**注意:** \n", + "在做召回评估的时候就会发现有些召回的效果不错有些召回的效果很差,所以对每一路召回的结果,我们可以人为地定义一些权重,来做最终的相似度融合" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T07:02:16.033971Z", + "start_time": "2020-11-17T07:02:16.019819Z" + } + }, + "outputs": [], + "source": [ + "def combine_recall_results(user_multi_recall_dict, weight_dict=None, topk=25):\n", + " final_recall_items_dict = {}\n", + " \n", + " # 对每一种召回结果按照用户进行归一化,方便后面多种召回结果,相同用户的物品之间权重相加\n", + " def norm_user_recall_items_sim(sorted_item_list):\n", + " # 如果冷启动中没有文章或者只有一篇文章,直接返回,出现这种情况的原因可能是冷启动召回的文章数量太少了,\n", + " # 基于规则筛选之后就没有文章了, 这里还可以做一些其他的策略性的筛选\n", + " if len(sorted_item_list) < 2:\n", + " return sorted_item_list\n", + " \n", + " min_sim = sorted_item_list[-1][1]\n", + " max_sim = sorted_item_list[0][1]\n", + " \n", + " norm_sorted_item_list = []\n", + " for item, score in sorted_item_list:\n", + " if max_sim > 0:\n", + " norm_score = 1.0 * (score - min_sim) / (max_sim - min_sim) if max_sim > min_sim else 1.0\n", + " else:\n", + " norm_score = 0.0\n", + " norm_sorted_item_list.append((item, 
norm_score))\n", + " \n", + " return norm_sorted_item_list\n", + " \n", + " print('多路召回合并...')\n", + " for method, user_recall_items in tqdm(user_multi_recall_dict.items()):\n", + " print(method + '...')\n", + " # 在计算最终召回结果的时候,也可以为每一种召回结果设置一个权重\n", + " if weight_dict == None:\n", + " recall_method_weight = 1\n", + " else:\n", + " recall_method_weight = weight_dict[method]\n", + " \n", + " for user_id, sorted_item_list in user_recall_items.items(): # 进行归一化\n", + " user_recall_items[user_id] = norm_user_recall_items_sim(sorted_item_list)\n", + " \n", + " for user_id, sorted_item_list in user_recall_items.items():\n", + " # print('user_id')\n", + " final_recall_items_dict.setdefault(user_id, {})\n", + " for item, score in sorted_item_list:\n", + " final_recall_items_dict[user_id].setdefault(item, 0)\n", + " final_recall_items_dict[user_id][item] += recall_method_weight * score \n", + " \n", + " final_recall_items_dict_rank = {}\n", + " # 多路召回时也可以控制最终的召回数量\n", + " for user, recall_item_dict in final_recall_items_dict.items():\n", + " final_recall_items_dict_rank[user] = sorted(recall_item_dict.items(), key=lambda x: x[1], reverse=True)[:topk]\n", + "\n", + " # 将多路召回后的最终结果字典保存到本地\n", + " pickle.dump(final_recall_items_dict_rank, open(os.path.join(save_path, 'final_recall_items_dict.pkl'),'wb'))\n", + "\n", + " return final_recall_items_dict_rank" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T07:02:21.078455Z", + "start_time": "2020-11-17T07:02:21.074060Z" + } + }, + "outputs": [], + "source": [ + "# 这里直接对多路召回的权重给了一个相同的值,其实可以根据前面召回的情况来调整参数的值\n", + "weight_dict = {'itemcf_sim_itemcf_recall': 1.0,\n", + " 'embedding_sim_item_recall': 1.0,\n", + " 'youtubednn_recall': 1.0,\n", + " 'youtubednn_usercf_recall': 1.0, \n", + " 'cold_start_recall': 1.0}" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T07:04:35.747924Z", + 
"start_time": "2020-11-17T07:02:26.889573Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/5 [00:00= topk\n", - " \n", - " del recall_df['pred_score']\n", - " submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()\n", - " \n", - " submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]\n", - " # 按照提交格式定义列名\n", - " submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', \n", - " 3: 'article_3', 4: 'article_4', 5: 'article_5'})\n", - " \n", - " save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'\n", - " submit.to_csv(save_name, index=False, header=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:04.332198Z", - "start_time": "2020-11-18T04:21:04.325020Z" - } - }, - "outputs": [], - "source": [ - "# 排序结果归一化\n", - "def norm_sim(sim_df, weight=0.0):\n", - " # print(sim_df.head())\n", - " min_sim = sim_df.min()\n", - " max_sim = sim_df.max()\n", - " if max_sim == min_sim:\n", - " sim_df = sim_df.apply(lambda sim: 1.0)\n", - " else:\n", - " sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))\n", - "\n", - " sim_df = sim_df.apply(lambda sim: sim + weight) # plus one\n", - " return sim_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## LGB排序模型" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:07.787698Z", - "start_time": "2020-11-18T04:21:07.536514Z" - } - }, - "outputs": [], - "source": [ - "# 防止中间出错之后重新读取数据\n", - "trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()\n", - "\n", - "if offline:\n", - " val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()\n", - " \n", - "tst_user_item_feats_df_rank_model = 
tst_user_item_feats_df.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:10.839656Z", - "start_time": "2020-11-18T04:21:10.833109Z" - } - }, - "outputs": [], - "source": [ - "# 定义特征列\n", - "lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', \n", - " 'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',\n", - " 'click_environment','click_deviceGroup', 'click_os', 'click_country', \n", - " 'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',\n", - " 'words_hbo', 'category_id', 'created_at_ts','words_count']" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:14.126608Z", - "start_time": "2020-11-18T04:21:13.493653Z" - } - }, - "outputs": [], - "source": [ - "# 排序模型分组\n", - "trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)\n", - "g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", - "\n", - "if offline:\n", - " val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)\n", - " g_val = val_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()[\"label\"].values" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:16.136151Z", - "start_time": "2020-11-18T04:21:16.124444Z" - } - }, - "outputs": [], - "source": [ - "# 排序模型定义\n", - "lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", - " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", - " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16) " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:22.965433Z", - "start_time": 
"2020-11-18T04:21:17.799127Z" - } - }, - "outputs": [], - "source": [ - "# 排序模型训练\n", - "if offline:\n", - " lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,\n", - " eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], \n", - " eval_group= [g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )\n", - "else:\n", - " lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:28.616665Z", - "start_time": "2020-11-18T04:21:24.672280Z" - } - }, - "outputs": [], - "source": [ - "# 模型预测\n", - "tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)\n", - "\n", - "# 将这里的排序结果保存一份,用户后面的模型融合\n", - "tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_ranker_score.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:40.253692Z", - "start_time": "2020-11-18T04:21:30.546587Z" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 排序模型\n", + "通过召回的操作, 我们已经进行了问题规模的缩减, 对于每个用户, 选择出了N篇文章作为了候选集,并基于召回的候选集构建了与用户历史相关的特征,以及用户本身的属性特征,文章本省的属性特征,以及用户与文章之间的特征,下面就是使用机器学习模型来对构造好的特征进行学习,然后对测试集进行预测,得到测试集中的每个候选集用户点击的概率,返回点击概率最大的topk个文章,作为最终的结果。\n", + "\n", + "排序阶段选择了三个比较有代表性的排序模型,它们分别是:\n", + "\n", + "1. LGB的排序模型\n", + "2. LGB的分类模型\n", + "3. 深度学习的分类模型DIN\n", + "\n", + "得到了最终的排序模型输出的结果之后,还选择了两种比较经典的模型集成的方法:\n", + "\n", + "1. 输出结果加权融合\n", + "2. 
Staking(将模型的输出结果再使用一个简单模型进行预测)" + ] }, - "scrolled": true - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]\n", - "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", - "submit(rank_results, topk=5, model_name='lgb_ranker')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:22:26.195838Z", - "start_time": "2020-11-18T04:21:46.115002Z" + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:20:39.770642Z", + "start_time": "2020-11-18T04:20:38.500875Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pickle\n", + "from tqdm import tqdm\n", + "import gc, os\n", + "import time\n", + "from datetime import datetime\n", + "import lightgbm as lgb\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] }, - "scrolled": true - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1]\tvalid_0's ndcg@1: 0.909975\tvalid_0's ndcg@2: 0.963068\tvalid_0's ndcg@3: 0.96533\tvalid_0's ndcg@4: 0.965729\tvalid_0's ndcg@5: 0.965864\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's ndcg@1: 0.9143\tvalid_0's ndcg@2: 0.964711\tvalid_0's ndcg@3: 0.966961\tvalid_0's ndcg@4: 0.967338\tvalid_0's ndcg@5: 0.967483\n", - "[3]\tvalid_0's ndcg@1: 0.9181\tvalid_0's ndcg@2: 0.966114\tvalid_0's ndcg@3: 0.968289\tvalid_0's ndcg@4: 0.968773\tvalid_0's ndcg@5: 0.96887\n", - "[4]\tvalid_0's ndcg@1: 0.925575\tvalid_0's ndcg@2: 0.969093\tvalid_0's ndcg@3: 0.971193\tvalid_0's ndcg@4: 0.971603\tvalid_0's ndcg@5: 0.97169\n", - "[5]\tvalid_0's ndcg@1: 0.9267\tvalid_0's ndcg@2: 0.969635\tvalid_0's ndcg@3: 0.97166\tvalid_0's ndcg@4: 0.972037\tvalid_0's ndcg@5: 
0.972133\n", - "[6]\tvalid_0's ndcg@1: 0.927\tvalid_0's ndcg@2: 0.969682\tvalid_0's ndcg@3: 0.971757\tvalid_0's ndcg@4: 0.972134\tvalid_0's ndcg@5: 0.972231\n", - "[7]\tvalid_0's ndcg@1: 0.928825\tvalid_0's ndcg@2: 0.970451\tvalid_0's ndcg@3: 0.972476\tvalid_0's ndcg@4: 0.97282\tvalid_0's ndcg@5: 0.972927\n", - "[8]\tvalid_0's ndcg@1: 0.930025\tvalid_0's ndcg@2: 0.970988\tvalid_0's ndcg@3: 0.972951\tvalid_0's ndcg@4: 0.973295\tvalid_0's ndcg@5: 0.973402\n", - "[9]\tvalid_0's ndcg@1: 0.931125\tvalid_0's ndcg@2: 0.971347\tvalid_0's ndcg@3: 0.973384\tvalid_0's ndcg@4: 0.973707\tvalid_0's ndcg@5: 0.973794\n", - "[10]\tvalid_0's ndcg@1: 0.9311\tvalid_0's ndcg@2: 0.971385\tvalid_0's ndcg@3: 0.973372\tvalid_0's ndcg@4: 0.973717\tvalid_0's ndcg@5: 0.973794\n", - "[11]\tvalid_0's ndcg@1: 0.930975\tvalid_0's ndcg@2: 0.971433\tvalid_0's ndcg@3: 0.973333\tvalid_0's ndcg@4: 0.973699\tvalid_0's ndcg@5: 0.973767\n", - "[12]\tvalid_0's ndcg@1: 0.93145\tvalid_0's ndcg@2: 0.971656\tvalid_0's ndcg@3: 0.973493\tvalid_0's ndcg@4: 0.973881\tvalid_0's ndcg@5: 0.973949\n", - "[13]\tvalid_0's ndcg@1: 0.932525\tvalid_0's ndcg@2: 0.971927\tvalid_0's ndcg@3: 0.973839\tvalid_0's ndcg@4: 0.974227\tvalid_0's ndcg@5: 0.974304\n", - "[14]\tvalid_0's ndcg@1: 0.932575\tvalid_0's ndcg@2: 0.971898\tvalid_0's ndcg@3: 0.973823\tvalid_0's ndcg@4: 0.974243\tvalid_0's ndcg@5: 0.97432\n", - "[15]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972239\tvalid_0's ndcg@3: 0.974189\tvalid_0's ndcg@4: 0.974587\tvalid_0's ndcg@5: 0.974665\n", - "[16]\tvalid_0's ndcg@1: 0.933475\tvalid_0's ndcg@2: 0.972309\tvalid_0's ndcg@3: 0.974209\tvalid_0's ndcg@4: 0.974596\tvalid_0's ndcg@5: 0.974674\n", - "[17]\tvalid_0's ndcg@1: 0.933725\tvalid_0's ndcg@2: 0.972369\tvalid_0's ndcg@3: 0.974307\tvalid_0's ndcg@4: 0.974684\tvalid_0's ndcg@5: 0.974761\n", - "[18]\tvalid_0's ndcg@1: 0.9339\tvalid_0's ndcg@2: 0.972497\tvalid_0's ndcg@3: 0.974372\tvalid_0's ndcg@4: 0.974749\tvalid_0's ndcg@5: 0.974836\n", - "[19]\tvalid_0's ndcg@1: 
0.9345\tvalid_0's ndcg@2: 0.972845\tvalid_0's ndcg@3: 0.974645\tvalid_0's ndcg@4: 0.974979\tvalid_0's ndcg@5: 0.975085\n", - "[20]\tvalid_0's ndcg@1: 0.9349\tvalid_0's ndcg@2: 0.973103\tvalid_0's ndcg@3: 0.97484\tvalid_0's ndcg@4: 0.975174\tvalid_0's ndcg@5: 0.975271\n", - "[21]\tvalid_0's ndcg@1: 0.935\tvalid_0's ndcg@2: 0.973092\tvalid_0's ndcg@3: 0.97488\tvalid_0's ndcg@4: 0.975192\tvalid_0's ndcg@5: 0.975289\n", - "[22]\tvalid_0's ndcg@1: 0.93525\tvalid_0's ndcg@2: 0.9732\tvalid_0's ndcg@3: 0.974988\tvalid_0's ndcg@4: 0.975289\tvalid_0's ndcg@5: 0.975386\n", - "[23]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 0.972949\tvalid_0's ndcg@3: 0.974824\tvalid_0's ndcg@4: 0.975136\tvalid_0's ndcg@5: 0.975223\n", - "[24]\tvalid_0's ndcg@1: 0.93545\tvalid_0's ndcg@2: 0.973274\tvalid_0's ndcg@3: 0.975087\tvalid_0's ndcg@4: 0.975388\tvalid_0's ndcg@5: 0.975475\n", - "[25]\tvalid_0's ndcg@1: 0.9356\tvalid_0's ndcg@2: 0.973345\tvalid_0's ndcg@3: 0.97512\tvalid_0's ndcg@4: 0.975443\tvalid_0's ndcg@5: 0.97553\n", - "[26]\tvalid_0's ndcg@1: 0.93525\tvalid_0's ndcg@2: 0.9732\tvalid_0's ndcg@3: 0.975\tvalid_0's ndcg@4: 0.975313\tvalid_0's ndcg@5: 0.9754\n", - "[27]\tvalid_0's ndcg@1: 0.935175\tvalid_0's ndcg@2: 0.97322\tvalid_0's ndcg@3: 0.974983\tvalid_0's ndcg@4: 0.975295\tvalid_0's ndcg@5: 0.975382\n", - "[28]\tvalid_0's ndcg@1: 0.935425\tvalid_0's ndcg@2: 0.973328\tvalid_0's ndcg@3: 0.975041\tvalid_0's ndcg@4: 0.975374\tvalid_0's ndcg@5: 0.975471\n", - "[29]\tvalid_0's ndcg@1: 0.935275\tvalid_0's ndcg@2: 0.973225\tvalid_0's ndcg@3: 0.974963\tvalid_0's ndcg@4: 0.975297\tvalid_0's ndcg@5: 0.975403\n", - "[30]\tvalid_0's ndcg@1: 0.9353\tvalid_0's ndcg@2: 0.973235\tvalid_0's ndcg@3: 0.97501\tvalid_0's ndcg@4: 0.975311\tvalid_0's ndcg@5: 0.975418\n", - "[31]\tvalid_0's ndcg@1: 0.9356\tvalid_0's ndcg@2: 0.973361\tvalid_0's ndcg@3: 0.975099\tvalid_0's ndcg@4: 0.975422\tvalid_0's ndcg@5: 0.975528\n", - "[32]\tvalid_0's ndcg@1: 0.9364\tvalid_0's ndcg@2: 0.973641\tvalid_0's ndcg@3: 
0.975391\tvalid_0's ndcg@4: 0.975714\tvalid_0's ndcg@5: 0.97582\n", - "[33]\tvalid_0's ndcg@1: 0.9367\tvalid_0's ndcg@2: 0.973751\tvalid_0's ndcg@3: 0.975501\tvalid_0's ndcg@4: 0.975824\tvalid_0's ndcg@5: 0.975931\n", - "[34]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.973902\tvalid_0's ndcg@3: 0.975677\tvalid_0's ndcg@4: 0.975989\tvalid_0's ndcg@5: 0.976095\n", - "[35]\tvalid_0's ndcg@1: 0.9377\tvalid_0's ndcg@2: 0.974105\tvalid_0's ndcg@3: 0.975892\tvalid_0's ndcg@4: 0.976194\tvalid_0's ndcg@5: 0.9763\n", - "[36]\tvalid_0's ndcg@1: 0.938\tvalid_0's ndcg@2: 0.974184\tvalid_0's ndcg@3: 0.975984\tvalid_0's ndcg@4: 0.976296\tvalid_0's ndcg@5: 0.976402\n", - "[37]\tvalid_0's ndcg@1: 0.93845\tvalid_0's ndcg@2: 0.974366\tvalid_0's ndcg@3: 0.976166\tvalid_0's ndcg@4: 0.976467\tvalid_0's ndcg@5: 0.976574\n", - "[38]\tvalid_0's ndcg@1: 0.938925\tvalid_0's ndcg@2: 0.974557\tvalid_0's ndcg@3: 0.976332\tvalid_0's ndcg@4: 0.976655\tvalid_0's ndcg@5: 0.976751\n", - "[39]\tvalid_0's ndcg@1: 0.93865\tvalid_0's ndcg@2: 0.974471\tvalid_0's ndcg@3: 0.976234\tvalid_0's ndcg@4: 0.976557\tvalid_0's ndcg@5: 0.976653\n", - "[40]\tvalid_0's ndcg@1: 0.938325\tvalid_0's ndcg@2: 0.974335\tvalid_0's ndcg@3: 0.97611\tvalid_0's ndcg@4: 0.976433\tvalid_0's ndcg@5: 0.97653\n", - "[41]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.974669\tvalid_0's ndcg@3: 0.976431\tvalid_0's ndcg@4: 0.976743\tvalid_0's ndcg@5: 0.97683\n", - "[42]\tvalid_0's ndcg@1: 0.939375\tvalid_0's ndcg@2: 0.974833\tvalid_0's ndcg@3: 0.976546\tvalid_0's ndcg@4: 0.976858\tvalid_0's ndcg@5: 0.976945\n", - "[43]\tvalid_0's ndcg@1: 0.939625\tvalid_0's ndcg@2: 0.974878\tvalid_0's ndcg@3: 0.976628\tvalid_0's ndcg@4: 0.97694\tvalid_0's ndcg@5: 0.977027\n", - "[44]\tvalid_0's ndcg@1: 0.9395\tvalid_0's ndcg@2: 0.974832\tvalid_0's ndcg@3: 0.97657\tvalid_0's ndcg@4: 0.976893\tvalid_0's ndcg@5: 0.97698\n", - "[45]\tvalid_0's ndcg@1: 0.939775\tvalid_0's ndcg@2: 0.974949\tvalid_0's ndcg@3: 0.976674\tvalid_0's ndcg@4: 
0.976997\tvalid_0's ndcg@5: 0.977084\n", - "[46]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.974945\tvalid_0's ndcg@3: 0.976708\tvalid_0's ndcg@4: 0.97702\tvalid_0's ndcg@5: 0.977107\n", - "[47]\tvalid_0's ndcg@1: 0.94005\tvalid_0's ndcg@2: 0.975004\tvalid_0's ndcg@3: 0.976766\tvalid_0's ndcg@4: 0.977078\tvalid_0's ndcg@5: 0.977175\n", - "[48]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975189\tvalid_0's ndcg@3: 0.976939\tvalid_0's ndcg@4: 0.97723\tvalid_0's ndcg@5: 0.977327\n", - "[49]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975189\tvalid_0's ndcg@3: 0.976939\tvalid_0's ndcg@4: 0.97723\tvalid_0's ndcg@5: 0.977327\n", - "[50]\tvalid_0's ndcg@1: 0.9405\tvalid_0's ndcg@2: 0.975264\tvalid_0's ndcg@3: 0.976989\tvalid_0's ndcg@4: 0.977291\tvalid_0's ndcg@5: 0.977368\n", - "[51]\tvalid_0's ndcg@1: 0.941125\tvalid_0's ndcg@2: 0.975526\tvalid_0's ndcg@3: 0.977226\tvalid_0's ndcg@4: 0.977528\tvalid_0's ndcg@5: 0.977605\n", - "[52]\tvalid_0's ndcg@1: 0.941\tvalid_0's ndcg@2: 0.97548\tvalid_0's ndcg@3: 0.977193\tvalid_0's ndcg@4: 0.977484\tvalid_0's ndcg@5: 0.977561\n", - "[53]\tvalid_0's ndcg@1: 0.9411\tvalid_0's ndcg@2: 0.975596\tvalid_0's ndcg@3: 0.977259\tvalid_0's ndcg@4: 0.977539\tvalid_0's ndcg@5: 0.977616\n", - "[54]\tvalid_0's ndcg@1: 0.9412\tvalid_0's ndcg@2: 0.975712\tvalid_0's ndcg@3: 0.977299\tvalid_0's ndcg@4: 0.97759\tvalid_0's ndcg@5: 0.977667\n", - "[55]\tvalid_0's ndcg@1: 0.94155\tvalid_0's ndcg@2: 0.975841\tvalid_0's ndcg@3: 0.977429\tvalid_0's ndcg@4: 0.977719\tvalid_0's ndcg@5: 0.977797\n", - "[56]\tvalid_0's ndcg@1: 0.941825\tvalid_0's ndcg@2: 0.975943\tvalid_0's ndcg@3: 0.97753\tvalid_0's ndcg@4: 0.977821\tvalid_0's ndcg@5: 0.977898\n", - "[57]\tvalid_0's ndcg@1: 0.9416\tvalid_0's ndcg@2: 0.975891\tvalid_0's ndcg@3: 0.977429\tvalid_0's ndcg@4: 0.977741\tvalid_0's ndcg@5: 0.977818\n", - "[58]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.975969\tvalid_0's ndcg@3: 0.977494\tvalid_0's ndcg@4: 0.977795\tvalid_0's ndcg@5: 0.977873\n", - 
"[59]\tvalid_0's ndcg@1: 0.942025\tvalid_0's ndcg@2: 0.975985\tvalid_0's ndcg@3: 0.977547\tvalid_0's ndcg@4: 0.977881\tvalid_0's ndcg@5: 0.977958\n", - "[60]\tvalid_0's ndcg@1: 0.94205\tvalid_0's ndcg@2: 0.975994\tvalid_0's ndcg@3: 0.977569\tvalid_0's ndcg@4: 0.977892\tvalid_0's ndcg@5: 0.977969\n", - "[61]\tvalid_0's ndcg@1: 0.94205\tvalid_0's ndcg@2: 0.975947\tvalid_0's ndcg@3: 0.977559\tvalid_0's ndcg@4: 0.977882\tvalid_0's ndcg@5: 0.97796\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取排序特征" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[62]\tvalid_0's ndcg@1: 0.942225\tvalid_0's ndcg@2: 0.976027\tvalid_0's ndcg@3: 0.97764\tvalid_0's ndcg@4: 0.977941\tvalid_0's ndcg@5: 0.978028\n", - "[63]\tvalid_0's ndcg@1: 0.942125\tvalid_0's ndcg@2: 0.976022\tvalid_0's ndcg@3: 0.977622\tvalid_0's ndcg@4: 0.977912\tvalid_0's ndcg@5: 0.977999\n", - "[64]\tvalid_0's ndcg@1: 0.942675\tvalid_0's ndcg@2: 0.976193\tvalid_0's ndcg@3: 0.977793\tvalid_0's ndcg@4: 0.978105\tvalid_0's ndcg@5: 0.978192\n", - "[65]\tvalid_0's ndcg@1: 0.942725\tvalid_0's ndcg@2: 0.976227\tvalid_0's ndcg@3: 0.977802\tvalid_0's ndcg@4: 0.978125\tvalid_0's ndcg@5: 0.978212\n", - "[66]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976132\tvalid_0's ndcg@3: 0.977695\tvalid_0's ndcg@4: 0.978018\tvalid_0's ndcg@5: 0.978105\n", - "[67]\tvalid_0's ndcg@1: 0.9424\tvalid_0's ndcg@2: 0.976092\tvalid_0's ndcg@3: 0.977679\tvalid_0's ndcg@4: 0.978002\tvalid_0's ndcg@5: 0.978089\n", - "[68]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976148\tvalid_0's ndcg@3: 0.977698\tvalid_0's ndcg@4: 0.978021\tvalid_0's ndcg@5: 0.978108\n", - "[69]\tvalid_0's ndcg@1: 0.9424\tvalid_0's ndcg@2: 0.976123\tvalid_0's ndcg@3: 0.977686\tvalid_0's ndcg@4: 0.978009\tvalid_0's ndcg@5: 0.978096\n", - "[70]\tvalid_0's ndcg@1: 0.942625\tvalid_0's ndcg@2: 0.976222\tvalid_0's ndcg@3: 0.977785\tvalid_0's ndcg@4: 0.978097\tvalid_0's ndcg@5: 0.978184\n", - "[71]\tvalid_0's ndcg@1: 
0.942575\tvalid_0's ndcg@2: 0.976188\tvalid_0's ndcg@3: 0.977763\tvalid_0's ndcg@4: 0.978075\tvalid_0's ndcg@5: 0.978162\n", - "[72]\tvalid_0's ndcg@1: 0.9427\tvalid_0's ndcg@2: 0.976234\tvalid_0's ndcg@3: 0.977809\tvalid_0's ndcg@4: 0.978121\tvalid_0's ndcg@5: 0.978208\n", - "[73]\tvalid_0's ndcg@1: 0.9428\tvalid_0's ndcg@2: 0.976255\tvalid_0's ndcg@3: 0.977843\tvalid_0's ndcg@4: 0.978155\tvalid_0's ndcg@5: 0.978242\n", - "[74]\tvalid_0's ndcg@1: 0.94295\tvalid_0's ndcg@2: 0.97631\tvalid_0's ndcg@3: 0.977898\tvalid_0's ndcg@4: 0.97821\tvalid_0's ndcg@5: 0.978297\n", - "[75]\tvalid_0's ndcg@1: 0.943\tvalid_0's ndcg@2: 0.976329\tvalid_0's ndcg@3: 0.977941\tvalid_0's ndcg@4: 0.978232\tvalid_0's ndcg@5: 0.978319\n", - "[76]\tvalid_0's ndcg@1: 0.9433\tvalid_0's ndcg@2: 0.976471\tvalid_0's ndcg@3: 0.978059\tvalid_0's ndcg@4: 0.97836\tvalid_0's ndcg@5: 0.978437\n", - "[77]\tvalid_0's ndcg@1: 0.94315\tvalid_0's ndcg@2: 0.976416\tvalid_0's ndcg@3: 0.977991\tvalid_0's ndcg@4: 0.978314\tvalid_0's ndcg@5: 0.978381\n", - "[78]\tvalid_0's ndcg@1: 0.943675\tvalid_0's ndcg@2: 0.976657\tvalid_0's ndcg@3: 0.978194\tvalid_0's ndcg@4: 0.978517\tvalid_0's ndcg@5: 0.978585\n", - "[79]\tvalid_0's ndcg@1: 0.94365\tvalid_0's ndcg@2: 0.976663\tvalid_0's ndcg@3: 0.978188\tvalid_0's ndcg@4: 0.978501\tvalid_0's ndcg@5: 0.978578\n", - "[80]\tvalid_0's ndcg@1: 0.943725\tvalid_0's ndcg@2: 0.976628\tvalid_0's ndcg@3: 0.978203\tvalid_0's ndcg@4: 0.978515\tvalid_0's ndcg@5: 0.978593\n", - "[81]\tvalid_0's ndcg@1: 0.943975\tvalid_0's ndcg@2: 0.97672\tvalid_0's ndcg@3: 0.978295\tvalid_0's ndcg@4: 0.978607\tvalid_0's ndcg@5: 0.978685\n", - "[82]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.976822\tvalid_0's ndcg@3: 0.978397\tvalid_0's ndcg@4: 0.97872\tvalid_0's ndcg@5: 0.978787\n", - "[83]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.976788\tvalid_0's ndcg@3: 0.978375\tvalid_0's ndcg@4: 0.978698\tvalid_0's ndcg@5: 0.978766\n", - "[84]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 
0.97679\tvalid_0's ndcg@3: 0.97839\tvalid_0's ndcg@4: 0.978702\tvalid_0's ndcg@5: 0.97878\n", - "[85]\tvalid_0's ndcg@1: 0.9443\tvalid_0's ndcg@2: 0.976809\tvalid_0's ndcg@3: 0.978421\tvalid_0's ndcg@4: 0.978723\tvalid_0's ndcg@5: 0.9788\n", - "[86]\tvalid_0's ndcg@1: 0.944525\tvalid_0's ndcg@2: 0.976939\tvalid_0's ndcg@3: 0.978502\tvalid_0's ndcg@4: 0.978814\tvalid_0's ndcg@5: 0.978891\n", - "[87]\tvalid_0's ndcg@1: 0.944625\tvalid_0's ndcg@2: 0.976976\tvalid_0's ndcg@3: 0.978551\tvalid_0's ndcg@4: 0.978852\tvalid_0's ndcg@5: 0.97893\n", - "[88]\tvalid_0's ndcg@1: 0.944925\tvalid_0's ndcg@2: 0.977102\tvalid_0's ndcg@3: 0.978677\tvalid_0's ndcg@4: 0.978968\tvalid_0's ndcg@5: 0.979045\n", - "[89]\tvalid_0's ndcg@1: 0.945125\tvalid_0's ndcg@2: 0.977208\tvalid_0's ndcg@3: 0.978758\tvalid_0's ndcg@4: 0.979048\tvalid_0's ndcg@5: 0.979126\n", - "[90]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977135\tvalid_0's ndcg@3: 0.978735\tvalid_0's ndcg@4: 0.979026\tvalid_0's ndcg@5: 0.979104\n", - "[91]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977208\tvalid_0's ndcg@3: 0.978858\tvalid_0's ndcg@4: 0.979138\tvalid_0's ndcg@5: 0.979215\n", - "[92]\tvalid_0's ndcg@1: 0.9455\tvalid_0's ndcg@2: 0.977267\tvalid_0's ndcg@3: 0.978905\tvalid_0's ndcg@4: 0.979174\tvalid_0's ndcg@5: 0.979251\n", - "[93]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977193\tvalid_0's ndcg@3: 0.978818\tvalid_0's ndcg@4: 0.979098\tvalid_0's ndcg@5: 0.979176\n", - "[94]\tvalid_0's ndcg@1: 0.94545\tvalid_0's ndcg@2: 0.97728\tvalid_0's ndcg@3: 0.97888\tvalid_0's ndcg@4: 0.97916\tvalid_0's ndcg@5: 0.979238\n", - "[95]\tvalid_0's ndcg@1: 0.9458\tvalid_0's ndcg@2: 0.977394\tvalid_0's ndcg@3: 0.979006\tvalid_0's ndcg@4: 0.979286\tvalid_0's ndcg@5: 0.979364\n", - "[96]\tvalid_0's ndcg@1: 0.946075\tvalid_0's ndcg@2: 0.977527\tvalid_0's ndcg@3: 0.979114\tvalid_0's ndcg@4: 0.979394\tvalid_0's ndcg@5: 0.979472\n", - "[97]\tvalid_0's ndcg@1: 0.946475\tvalid_0's ndcg@2: 0.977659\tvalid_0's ndcg@3: 
0.979259\tvalid_0's ndcg@4: 0.979539\tvalid_0's ndcg@5: 0.979616\n", - "[98]\tvalid_0's ndcg@1: 0.94675\tvalid_0's ndcg@2: 0.97776\tvalid_0's ndcg@3: 0.97936\tvalid_0's ndcg@4: 0.979651\tvalid_0's ndcg@5: 0.979719\n", - "[99]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.977831\tvalid_0's ndcg@3: 0.979419\tvalid_0's ndcg@4: 0.97971\tvalid_0's ndcg@5: 0.979777\n", - "[100]\tvalid_0's ndcg@1: 0.9468\tvalid_0's ndcg@2: 0.977794\tvalid_0's ndcg@3: 0.979369\tvalid_0's ndcg@4: 0.979671\tvalid_0's ndcg@5: 0.979739\n", - "Did not meet early stopping. Best iteration is:\n", - "[99]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.977831\tvalid_0's ndcg@3: 0.979419\tvalid_0's ndcg@4: 0.97971\tvalid_0's ndcg@5: 0.979777\n", - "[1]\tvalid_0's ndcg@1: 0.909075\tvalid_0's ndcg@2: 0.963019\tvalid_0's ndcg@3: 0.965069\tvalid_0's ndcg@4: 0.965543\tvalid_0's ndcg@5: 0.965601\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's ndcg@1: 0.9123\tvalid_0's ndcg@2: 0.964273\tvalid_0's ndcg@3: 0.966248\tvalid_0's ndcg@4: 0.966722\tvalid_0's ndcg@5: 0.966789\n", - "[3]\tvalid_0's ndcg@1: 0.915075\tvalid_0's ndcg@2: 0.965691\tvalid_0's ndcg@3: 0.967466\tvalid_0's ndcg@4: 0.967854\tvalid_0's ndcg@5: 0.967922\n", - "[4]\tvalid_0's ndcg@1: 0.91845\tvalid_0's ndcg@2: 0.967047\tvalid_0's ndcg@3: 0.968735\tvalid_0's ndcg@4: 0.969133\tvalid_0's ndcg@5: 0.969201\n", - "[5]\tvalid_0's ndcg@1: 0.92355\tvalid_0's ndcg@2: 0.968961\tvalid_0's ndcg@3: 0.970674\tvalid_0's ndcg@4: 0.97104\tvalid_0's ndcg@5: 0.971098\n", - "[6]\tvalid_0's ndcg@1: 0.9253\tvalid_0's ndcg@2: 0.969607\tvalid_0's ndcg@3: 0.971345\tvalid_0's ndcg@4: 0.971689\tvalid_0's ndcg@5: 0.971747\n", - "[7]\tvalid_0's ndcg@1: 0.926225\tvalid_0's ndcg@2: 0.969933\tvalid_0's ndcg@3: 0.971708\tvalid_0's ndcg@4: 0.972031\tvalid_0's ndcg@5: 0.972079\n", - "[8]\tvalid_0's ndcg@1: 0.926475\tvalid_0's ndcg@2: 0.970104\tvalid_0's ndcg@3: 0.971804\tvalid_0's ndcg@4: 0.972116\tvalid_0's ndcg@5: 0.972184\n", - 
"[9]\tvalid_0's ndcg@1: 0.9277\tvalid_0's ndcg@2: 0.970682\tvalid_0's ndcg@3: 0.972307\tvalid_0's ndcg@4: 0.972598\tvalid_0's ndcg@5: 0.972675\n", - "[10]\tvalid_0's ndcg@1: 0.92775\tvalid_0's ndcg@2: 0.970653\tvalid_0's ndcg@3: 0.972316\tvalid_0's ndcg@4: 0.972617\tvalid_0's ndcg@5: 0.972685\n", - "[11]\tvalid_0's ndcg@1: 0.9283\tvalid_0's ndcg@2: 0.97084\tvalid_0's ndcg@3: 0.97254\tvalid_0's ndcg@4: 0.97281\tvalid_0's ndcg@5: 0.972887\n", - "[12]\tvalid_0's ndcg@1: 0.9287\tvalid_0's ndcg@2: 0.971051\tvalid_0's ndcg@3: 0.972701\tvalid_0's ndcg@4: 0.97297\tvalid_0's ndcg@5: 0.973048\n", - "[13]\tvalid_0's ndcg@1: 0.9297\tvalid_0's ndcg@2: 0.971389\tvalid_0's ndcg@3: 0.973001\tvalid_0's ndcg@4: 0.973313\tvalid_0's ndcg@5: 0.9734\n", - "[14]\tvalid_0's ndcg@1: 0.92955\tvalid_0's ndcg@2: 0.971444\tvalid_0's ndcg@3: 0.972994\tvalid_0's ndcg@4: 0.973284\tvalid_0's ndcg@5: 0.973371\n", - "[15]\tvalid_0's ndcg@1: 0.930225\tvalid_0's ndcg@2: 0.97174\tvalid_0's ndcg@3: 0.973253\tvalid_0's ndcg@4: 0.973543\tvalid_0's ndcg@5: 0.97363\n", - "[16]\tvalid_0's ndcg@1: 0.930425\tvalid_0's ndcg@2: 0.971798\tvalid_0's ndcg@3: 0.973298\tvalid_0's ndcg@4: 0.97361\tvalid_0's ndcg@5: 0.973698\n", - "[17]\tvalid_0's ndcg@1: 0.93125\tvalid_0's ndcg@2: 0.971992\tvalid_0's ndcg@3: 0.97358\tvalid_0's ndcg@4: 0.973903\tvalid_0's ndcg@5: 0.97398\n", - "[18]\tvalid_0's ndcg@1: 0.931925\tvalid_0's ndcg@2: 0.972257\tvalid_0's ndcg@3: 0.973845\tvalid_0's ndcg@4: 0.974146\tvalid_0's ndcg@5: 0.974224\n", - "[19]\tvalid_0's ndcg@1: 0.932375\tvalid_0's ndcg@2: 0.972376\tvalid_0's ndcg@3: 0.974038\tvalid_0's ndcg@4: 0.974318\tvalid_0's ndcg@5: 0.974376\n", - "[20]\tvalid_0's ndcg@1: 0.932\tvalid_0's ndcg@2: 0.972269\tvalid_0's ndcg@3: 0.973907\tvalid_0's ndcg@4: 0.974187\tvalid_0's ndcg@5: 0.974245\n", - "[21]\tvalid_0's ndcg@1: 0.932725\tvalid_0's ndcg@2: 0.972568\tvalid_0's ndcg@3: 0.974181\tvalid_0's ndcg@4: 0.974471\tvalid_0's ndcg@5: 0.974529\n", - "[22]\tvalid_0's ndcg@1: 0.93305\tvalid_0's 
ndcg@2: 0.972735\tvalid_0's ndcg@3: 0.974298\tvalid_0's ndcg@4: 0.974599\tvalid_0's ndcg@5: 0.974657\n", - "[23]\tvalid_0's ndcg@1: 0.932925\tvalid_0's ndcg@2: 0.972642\tvalid_0's ndcg@3: 0.974255\tvalid_0's ndcg@4: 0.974545\tvalid_0's ndcg@5: 0.974594\n", - "[24]\tvalid_0's ndcg@1: 0.933175\tvalid_0's ndcg@2: 0.972734\tvalid_0's ndcg@3: 0.974347\tvalid_0's ndcg@4: 0.974638\tvalid_0's ndcg@5: 0.974686\n", - "[25]\tvalid_0's ndcg@1: 0.9331\tvalid_0's ndcg@2: 0.972754\tvalid_0's ndcg@3: 0.974366\tvalid_0's ndcg@4: 0.974636\tvalid_0's ndcg@5: 0.974674\n" - ] + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:20:41.843180Z", + "start_time": "2020-11-18T04:20:41.837287Z" + } + }, + "outputs": [], + "source": [ + "data_path = './data_raw/'\n", + "save_path = './temp_results/'\n", + "offline = False" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[26]\tvalid_0's ndcg@1: 0.933275\tvalid_0's ndcg@2: 0.972787\tvalid_0's ndcg@3: 0.974424\tvalid_0's ndcg@4: 0.974694\tvalid_0's ndcg@5: 0.974732\n", - "[27]\tvalid_0's ndcg@1: 0.93325\tvalid_0's ndcg@2: 0.972809\tvalid_0's ndcg@3: 0.974434\tvalid_0's ndcg@4: 0.974703\tvalid_0's ndcg@5: 0.974732\n", - "[28]\tvalid_0's ndcg@1: 0.933625\tvalid_0's ndcg@2: 0.972932\tvalid_0's ndcg@3: 0.974557\tvalid_0's ndcg@4: 0.974826\tvalid_0's ndcg@5: 0.974855\n", - "[29]\tvalid_0's ndcg@1: 0.933725\tvalid_0's ndcg@2: 0.972937\tvalid_0's ndcg@3: 0.974587\tvalid_0's ndcg@4: 0.974856\tvalid_0's ndcg@5: 0.974885\n", - "[30]\tvalid_0's ndcg@1: 0.93355\tvalid_0's ndcg@2: 0.972873\tvalid_0's ndcg@3: 0.974523\tvalid_0's ndcg@4: 0.974792\tvalid_0's ndcg@5: 0.974821\n", - "[31]\tvalid_0's ndcg@1: 0.9342\tvalid_0's ndcg@2: 0.973065\tvalid_0's ndcg@3: 0.974753\tvalid_0's ndcg@4: 0.975022\tvalid_0's ndcg@5: 0.975051\n", - "[32]\tvalid_0's ndcg@1: 0.93435\tvalid_0's ndcg@2: 0.973152\tvalid_0's ndcg@3: 0.974815\tvalid_0's ndcg@4: 0.975084\tvalid_0's ndcg@5: 0.975113\n", - 
"[33]\tvalid_0's ndcg@1: 0.934475\tvalid_0's ndcg@2: 0.97323\tvalid_0's ndcg@3: 0.974855\tvalid_0's ndcg@4: 0.975135\tvalid_0's ndcg@5: 0.975164\n", - "[34]\tvalid_0's ndcg@1: 0.9342\tvalid_0's ndcg@2: 0.973113\tvalid_0's ndcg@3: 0.974738\tvalid_0's ndcg@4: 0.975028\tvalid_0's ndcg@5: 0.975057\n", - "[35]\tvalid_0's ndcg@1: 0.93455\tvalid_0's ndcg@2: 0.973258\tvalid_0's ndcg@3: 0.97487\tvalid_0's ndcg@4: 0.975172\tvalid_0's ndcg@5: 0.975201\n", - "[36]\tvalid_0's ndcg@1: 0.9344\tvalid_0's ndcg@2: 0.973265\tvalid_0's ndcg@3: 0.974828\tvalid_0's ndcg@4: 0.975129\tvalid_0's ndcg@5: 0.975158\n", - "[37]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 0.973438\tvalid_0's ndcg@3: 0.975013\tvalid_0's ndcg@4: 0.975304\tvalid_0's ndcg@5: 0.975323\n", - "[38]\tvalid_0's ndcg@1: 0.934975\tvalid_0's ndcg@2: 0.973541\tvalid_0's ndcg@3: 0.975066\tvalid_0's ndcg@4: 0.975367\tvalid_0's ndcg@5: 0.975386\n", - "[39]\tvalid_0's ndcg@1: 0.935275\tvalid_0's ndcg@2: 0.973667\tvalid_0's ndcg@3: 0.975192\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975502\n", - "[40]\tvalid_0's ndcg@1: 0.9352\tvalid_0's ndcg@2: 0.973624\tvalid_0's ndcg@3: 0.975174\tvalid_0's ndcg@4: 0.975454\tvalid_0's ndcg@5: 0.975473\n", - "[41]\tvalid_0's ndcg@1: 0.935325\tvalid_0's ndcg@2: 0.973686\tvalid_0's ndcg@3: 0.975223\tvalid_0's ndcg@4: 0.975503\tvalid_0's ndcg@5: 0.975522\n", - "[42]\tvalid_0's ndcg@1: 0.93545\tvalid_0's ndcg@2: 0.973716\tvalid_0's ndcg@3: 0.975266\tvalid_0's ndcg@4: 0.975546\tvalid_0's ndcg@5: 0.975565\n", - "[43]\tvalid_0's ndcg@1: 0.93615\tvalid_0's ndcg@2: 0.974022\tvalid_0's ndcg@3: 0.975534\tvalid_0's ndcg@4: 0.975814\tvalid_0's ndcg@5: 0.975843\n", - "[44]\tvalid_0's ndcg@1: 0.936225\tvalid_0's ndcg@2: 0.974112\tvalid_0's ndcg@3: 0.975562\tvalid_0's ndcg@4: 0.975853\tvalid_0's ndcg@5: 0.975882\n", - "[45]\tvalid_0's ndcg@1: 0.9365\tvalid_0's ndcg@2: 0.974167\tvalid_0's ndcg@3: 0.975654\tvalid_0's ndcg@4: 0.975945\tvalid_0's ndcg@5: 0.975974\n", - "[46]\tvalid_0's ndcg@1: 
0.93665\tvalid_0's ndcg@2: 0.974206\tvalid_0's ndcg@3: 0.975694\tvalid_0's ndcg@4: 0.975995\tvalid_0's ndcg@5: 0.976024\n", - "[47]\tvalid_0's ndcg@1: 0.93685\tvalid_0's ndcg@2: 0.974311\tvalid_0's ndcg@3: 0.975786\tvalid_0's ndcg@4: 0.976077\tvalid_0's ndcg@5: 0.976106\n", - "[48]\tvalid_0's ndcg@1: 0.937025\tvalid_0's ndcg@2: 0.974408\tvalid_0's ndcg@3: 0.975845\tvalid_0's ndcg@4: 0.976147\tvalid_0's ndcg@5: 0.976185\n", - "[49]\tvalid_0's ndcg@1: 0.936975\tvalid_0's ndcg@2: 0.974342\tvalid_0's ndcg@3: 0.975829\tvalid_0's ndcg@4: 0.97612\tvalid_0's ndcg@5: 0.976159\n", - "[50]\tvalid_0's ndcg@1: 0.9371\tvalid_0's ndcg@2: 0.974388\tvalid_0's ndcg@3: 0.97585\tvalid_0's ndcg@4: 0.976152\tvalid_0's ndcg@5: 0.976191\n", - "[51]\tvalid_0's ndcg@1: 0.937025\tvalid_0's ndcg@2: 0.974329\tvalid_0's ndcg@3: 0.975841\tvalid_0's ndcg@4: 0.976121\tvalid_0's ndcg@5: 0.97616\n", - "[52]\tvalid_0's ndcg@1: 0.9377\tvalid_0's ndcg@2: 0.974578\tvalid_0's ndcg@3: 0.976078\tvalid_0's ndcg@4: 0.976369\tvalid_0's ndcg@5: 0.976407\n", - "[53]\tvalid_0's ndcg@1: 0.9378\tvalid_0's ndcg@2: 0.974615\tvalid_0's ndcg@3: 0.976115\tvalid_0's ndcg@4: 0.976405\tvalid_0's ndcg@5: 0.976444\n", - "[54]\tvalid_0's ndcg@1: 0.938\tvalid_0's ndcg@2: 0.974689\tvalid_0's ndcg@3: 0.976214\tvalid_0's ndcg@4: 0.976483\tvalid_0's ndcg@5: 0.976521\n", - "[55]\tvalid_0's ndcg@1: 0.938225\tvalid_0's ndcg@2: 0.974803\tvalid_0's ndcg@3: 0.976303\tvalid_0's ndcg@4: 0.976572\tvalid_0's ndcg@5: 0.976611\n", - "[56]\tvalid_0's ndcg@1: 0.938175\tvalid_0's ndcg@2: 0.9748\tvalid_0's ndcg@3: 0.976275\tvalid_0's ndcg@4: 0.976555\tvalid_0's ndcg@5: 0.976594\n", - "[57]\tvalid_0's ndcg@1: 0.938525\tvalid_0's ndcg@2: 0.974914\tvalid_0's ndcg@3: 0.976414\tvalid_0's ndcg@4: 0.976683\tvalid_0's ndcg@5: 0.976722\n", - "[58]\tvalid_0's ndcg@1: 0.93875\tvalid_0's ndcg@2: 0.975028\tvalid_0's ndcg@3: 0.976503\tvalid_0's ndcg@4: 0.976773\tvalid_0's ndcg@5: 0.976811\n", - "[59]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 
0.975198\tvalid_0's ndcg@3: 0.976648\tvalid_0's ndcg@4: 0.976918\tvalid_0's ndcg@5: 0.976956\n", - "[60]\tvalid_0's ndcg@1: 0.939025\tvalid_0's ndcg@2: 0.975177\tvalid_0's ndcg@3: 0.976615\tvalid_0's ndcg@4: 0.976884\tvalid_0's ndcg@5: 0.976923\n", - "[61]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.975205\tvalid_0's ndcg@3: 0.976642\tvalid_0's ndcg@4: 0.976912\tvalid_0's ndcg@5: 0.97695\n", - "[62]\tvalid_0's ndcg@1: 0.93965\tvalid_0's ndcg@2: 0.975424\tvalid_0's ndcg@3: 0.976836\tvalid_0's ndcg@4: 0.977116\tvalid_0's ndcg@5: 0.977155\n", - "[63]\tvalid_0's ndcg@1: 0.940075\tvalid_0's ndcg@2: 0.975596\tvalid_0's ndcg@3: 0.976996\tvalid_0's ndcg@4: 0.977276\tvalid_0's ndcg@5: 0.977315\n", - "[64]\tvalid_0's ndcg@1: 0.940375\tvalid_0's ndcg@2: 0.975723\tvalid_0's ndcg@3: 0.977123\tvalid_0's ndcg@4: 0.977392\tvalid_0's ndcg@5: 0.977431\n", - "[65]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975766\tvalid_0's ndcg@3: 0.977154\tvalid_0's ndcg@4: 0.977423\tvalid_0's ndcg@5: 0.977462\n", - "[66]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.975744\tvalid_0's ndcg@3: 0.977156\tvalid_0's ndcg@4: 0.977426\tvalid_0's ndcg@5: 0.977464\n", - "[67]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.97576\tvalid_0's ndcg@3: 0.977172\tvalid_0's ndcg@4: 0.977431\tvalid_0's ndcg@5: 0.977469\n", - "[68]\tvalid_0's ndcg@1: 0.940675\tvalid_0's ndcg@2: 0.975849\tvalid_0's ndcg@3: 0.977249\tvalid_0's ndcg@4: 0.977508\tvalid_0's ndcg@5: 0.977546\n", - "[69]\tvalid_0's ndcg@1: 0.9413\tvalid_0's ndcg@2: 0.976017\tvalid_0's ndcg@3: 0.977454\tvalid_0's ndcg@4: 0.977724\tvalid_0's ndcg@5: 0.977762\n", - "[70]\tvalid_0's ndcg@1: 0.94105\tvalid_0's ndcg@2: 0.975925\tvalid_0's ndcg@3: 0.977362\tvalid_0's ndcg@4: 0.977631\tvalid_0's ndcg@5: 0.97767\n", - "[71]\tvalid_0's ndcg@1: 0.94105\tvalid_0's ndcg@2: 0.975925\tvalid_0's ndcg@3: 0.97735\tvalid_0's ndcg@4: 0.97763\tvalid_0's ndcg@5: 0.977668\n", - "[72]\tvalid_0's ndcg@1: 0.941325\tvalid_0's ndcg@2: 0.976058\tvalid_0's ndcg@3: 
0.97747\tvalid_0's ndcg@4: 0.977739\tvalid_0's ndcg@5: 0.977778\n", - "[73]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.976076\tvalid_0's ndcg@3: 0.977476\tvalid_0's ndcg@4: 0.977756\tvalid_0's ndcg@5: 0.977795\n", - "[74]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.97619\tvalid_0's ndcg@3: 0.97759\tvalid_0's ndcg@4: 0.97788\tvalid_0's ndcg@5: 0.977919\n", - "[75]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.97619\tvalid_0's ndcg@3: 0.977602\tvalid_0's ndcg@4: 0.977882\tvalid_0's ndcg@5: 0.977921\n", - "[76]\tvalid_0's ndcg@1: 0.94195\tvalid_0's ndcg@2: 0.976273\tvalid_0's ndcg@3: 0.977685\tvalid_0's ndcg@4: 0.977965\tvalid_0's ndcg@5: 0.978004\n", - "[77]\tvalid_0's ndcg@1: 0.9419\tvalid_0's ndcg@2: 0.97627\tvalid_0's ndcg@3: 0.97767\tvalid_0's ndcg@4: 0.97795\tvalid_0's ndcg@5: 0.977989\n", - "[78]\tvalid_0's ndcg@1: 0.94235\tvalid_0's ndcg@2: 0.976452\tvalid_0's ndcg@3: 0.977839\tvalid_0's ndcg@4: 0.978119\tvalid_0's ndcg@5: 0.978158\n", - "[79]\tvalid_0's ndcg@1: 0.94265\tvalid_0's ndcg@2: 0.976562\tvalid_0's ndcg@3: 0.977937\tvalid_0's ndcg@4: 0.978228\tvalid_0's ndcg@5: 0.978267\n", - "[80]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976667\tvalid_0's ndcg@3: 0.978067\tvalid_0's ndcg@4: 0.978347\tvalid_0's ndcg@5: 0.978385\n", - "[81]\tvalid_0's ndcg@1: 0.94305\tvalid_0's ndcg@2: 0.97671\tvalid_0's ndcg@3: 0.978098\tvalid_0's ndcg@4: 0.978378\tvalid_0's ndcg@5: 0.978416\n", - "[82]\tvalid_0's ndcg@1: 0.943175\tvalid_0's ndcg@2: 0.97674\tvalid_0's ndcg@3: 0.978115\tvalid_0's ndcg@4: 0.978417\tvalid_0's ndcg@5: 0.978456\n", - "[83]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976752\tvalid_0's ndcg@3: 0.97814\tvalid_0's ndcg@4: 0.978441\tvalid_0's ndcg@5: 0.97848\n", - "[84]\tvalid_0's ndcg@1: 0.943375\tvalid_0's ndcg@2: 0.976767\tvalid_0's ndcg@3: 0.978179\tvalid_0's ndcg@4: 0.978481\tvalid_0's ndcg@5: 0.97852\n", - "[85]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976721\tvalid_0's ndcg@3: 0.978146\tvalid_0's ndcg@4: 
0.978437\tvalid_0's ndcg@5: 0.978475\n", - "[86]\tvalid_0's ndcg@1: 0.9434\tvalid_0's ndcg@2: 0.976792\tvalid_0's ndcg@3: 0.978204\tvalid_0's ndcg@4: 0.978506\tvalid_0's ndcg@5: 0.978535\n", - "[87]\tvalid_0's ndcg@1: 0.943475\tvalid_0's ndcg@2: 0.976851\tvalid_0's ndcg@3: 0.978239\tvalid_0's ndcg@4: 0.97854\tvalid_0's ndcg@5: 0.978569\n", - "[88]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976882\tvalid_0's ndcg@3: 0.978282\tvalid_0's ndcg@4: 0.978572\tvalid_0's ndcg@5: 0.978611\n", - "[89]\tvalid_0's ndcg@1: 0.943775\tvalid_0's ndcg@2: 0.976915\tvalid_0's ndcg@3: 0.97834\tvalid_0's ndcg@4: 0.97863\tvalid_0's ndcg@5: 0.978669\n" - ] + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:20:53.358138Z", + "start_time": "2020-11-18T04:20:44.232944Z" + } + }, + "outputs": [], + "source": [ + "# 重新读取数据的时候,发现click_article_id是一个浮点数,所以将其转换成int类型\n", + "trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')\n", + "trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)\n", + "\n", + "if offline:\n", + " val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')\n", + " val_user_item_feats_df['click_article_id'] = val_user_item_feats_df['click_article_id'].astype(int)\n", + "else:\n", + " val_user_item_feats_df = None\n", + " \n", + "tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')\n", + "tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)\n", + "\n", + "# 做特征的时候为了方便,给测试集也打上了一个无效的标签,这里直接删掉就行\n", + "del tst_user_item_feats_df['label']" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[90]\tvalid_0's ndcg@1: 0.943925\tvalid_0's ndcg@2: 0.976986\tvalid_0's ndcg@3: 0.978398\tvalid_0's ndcg@4: 0.978689\tvalid_0's ndcg@5: 0.978728\n", - "[91]\tvalid_0's ndcg@1: 0.943875\tvalid_0's ndcg@2: 0.976999\tvalid_0's ndcg@3: 0.978399\tvalid_0's 
ndcg@4: 0.978679\tvalid_0's ndcg@5: 0.978717\n", - "[92]\tvalid_0's ndcg@1: 0.94395\tvalid_0's ndcg@2: 0.977058\tvalid_0's ndcg@3: 0.978421\tvalid_0's ndcg@4: 0.978711\tvalid_0's ndcg@5: 0.97876\n", - "[93]\tvalid_0's ndcg@1: 0.944075\tvalid_0's ndcg@2: 0.977104\tvalid_0's ndcg@3: 0.978479\tvalid_0's ndcg@4: 0.978759\tvalid_0's ndcg@5: 0.978807\n", - "[94]\tvalid_0's ndcg@1: 0.944175\tvalid_0's ndcg@2: 0.977125\tvalid_0's ndcg@3: 0.978513\tvalid_0's ndcg@4: 0.978793\tvalid_0's ndcg@5: 0.978841\n", - "[95]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.977153\tvalid_0's ndcg@3: 0.97854\tvalid_0's ndcg@4: 0.97882\tvalid_0's ndcg@5: 0.978869\n", - "[96]\tvalid_0's ndcg@1: 0.944225\tvalid_0's ndcg@2: 0.977144\tvalid_0's ndcg@3: 0.978531\tvalid_0's ndcg@4: 0.978811\tvalid_0's ndcg@5: 0.97886\n", - "[97]\tvalid_0's ndcg@1: 0.94435\tvalid_0's ndcg@2: 0.977221\tvalid_0's ndcg@3: 0.978584\tvalid_0's ndcg@4: 0.978864\tvalid_0's ndcg@5: 0.978912\n", - "[98]\tvalid_0's ndcg@1: 0.944575\tvalid_0's ndcg@2: 0.977289\tvalid_0's ndcg@3: 0.978651\tvalid_0's ndcg@4: 0.978942\tvalid_0's ndcg@5: 0.97899\n", - "[99]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.977341\tvalid_0's ndcg@3: 0.978691\tvalid_0's ndcg@4: 0.978993\tvalid_0's ndcg@5: 0.979032\n", - "[100]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977482\tvalid_0's ndcg@3: 0.978857\tvalid_0's ndcg@4: 0.979148\tvalid_0's ndcg@5: 0.979187\n", - "Did not meet early stopping. 
Best iteration is:\n", - "[100]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977482\tvalid_0's ndcg@3: 0.978857\tvalid_0's ndcg@4: 0.979148\tvalid_0's ndcg@5: 0.979187\n", - "[1]\tvalid_0's ndcg@1: 0.911575\tvalid_0's ndcg@2: 0.964384\tvalid_0's ndcg@3: 0.966321\tvalid_0's ndcg@4: 0.966623\tvalid_0's ndcg@5: 0.966671\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's ndcg@1: 0.9136\tvalid_0's ndcg@2: 0.965257\tvalid_0's ndcg@3: 0.967107\tvalid_0's ndcg@4: 0.967398\tvalid_0's ndcg@5: 0.967456\n", - "[3]\tvalid_0's ndcg@1: 0.917425\tvalid_0's ndcg@2: 0.966732\tvalid_0's ndcg@3: 0.968545\tvalid_0's ndcg@4: 0.968814\tvalid_0's ndcg@5: 0.968882\n", - "[4]\tvalid_0's ndcg@1: 0.9222\tvalid_0's ndcg@2: 0.968558\tvalid_0's ndcg@3: 0.970383\tvalid_0's ndcg@4: 0.970619\tvalid_0's ndcg@5: 0.970668\n", - "[5]\tvalid_0's ndcg@1: 0.925875\tvalid_0's ndcg@2: 0.969914\tvalid_0's ndcg@3: 0.971714\tvalid_0's ndcg@4: 0.971972\tvalid_0's ndcg@5: 0.972021\n", - "[6]\tvalid_0's ndcg@1: 0.926875\tvalid_0's ndcg@2: 0.970425\tvalid_0's ndcg@3: 0.972112\tvalid_0's ndcg@4: 0.972371\tvalid_0's ndcg@5: 0.972419\n", - "[7]\tvalid_0's ndcg@1: 0.927475\tvalid_0's ndcg@2: 0.970631\tvalid_0's ndcg@3: 0.972306\tvalid_0's ndcg@4: 0.972586\tvalid_0's ndcg@5: 0.972634\n", - "[8]\tvalid_0's ndcg@1: 0.93015\tvalid_0's ndcg@2: 0.971649\tvalid_0's ndcg@3: 0.973287\tvalid_0's ndcg@4: 0.973567\tvalid_0's ndcg@5: 0.973625\n", - "[9]\tvalid_0's ndcg@1: 0.9312\tvalid_0's ndcg@2: 0.972084\tvalid_0's ndcg@3: 0.973684\tvalid_0's ndcg@4: 0.973964\tvalid_0's ndcg@5: 0.974022\n", - "[10]\tvalid_0's ndcg@1: 0.93225\tvalid_0's ndcg@2: 0.972456\tvalid_0's ndcg@3: 0.974081\tvalid_0's ndcg@4: 0.974361\tvalid_0's ndcg@5: 0.974409\n", - "[11]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.972704\tvalid_0's ndcg@3: 0.974379\tvalid_0's ndcg@4: 0.974648\tvalid_0's ndcg@5: 0.974696\n", - "[12]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972949\tvalid_0's ndcg@3: 0.974574\tvalid_0's 
ndcg@4: 0.974832\tvalid_0's ndcg@5: 0.974881\n", - "[13]\tvalid_0's ndcg@1: 0.93415\tvalid_0's ndcg@2: 0.97322\tvalid_0's ndcg@3: 0.97482\tvalid_0's ndcg@4: 0.975079\tvalid_0's ndcg@5: 0.975127\n", - "[14]\tvalid_0's ndcg@1: 0.9352\tvalid_0's ndcg@2: 0.973671\tvalid_0's ndcg@3: 0.975246\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975531\n", - "[15]\tvalid_0's ndcg@1: 0.9358\tvalid_0's ndcg@2: 0.973877\tvalid_0's ndcg@3: 0.975452\tvalid_0's ndcg@4: 0.975699\tvalid_0's ndcg@5: 0.975748\n", - "[16]\tvalid_0's ndcg@1: 0.935825\tvalid_0's ndcg@2: 0.973917\tvalid_0's ndcg@3: 0.975442\tvalid_0's ndcg@4: 0.975712\tvalid_0's ndcg@5: 0.97576\n", - "[17]\tvalid_0's ndcg@1: 0.936475\tvalid_0's ndcg@2: 0.97411\tvalid_0's ndcg@3: 0.975697\tvalid_0's ndcg@4: 0.975956\tvalid_0's ndcg@5: 0.975995\n", - "[18]\tvalid_0's ndcg@1: 0.936925\tvalid_0's ndcg@2: 0.974292\tvalid_0's ndcg@3: 0.975867\tvalid_0's ndcg@4: 0.976114\tvalid_0's ndcg@5: 0.976163\n", - "[19]\tvalid_0's ndcg@1: 0.937525\tvalid_0's ndcg@2: 0.974545\tvalid_0's ndcg@3: 0.976095\tvalid_0's ndcg@4: 0.976342\tvalid_0's ndcg@5: 0.976391\n", - "[20]\tvalid_0's ndcg@1: 0.937775\tvalid_0's ndcg@2: 0.974653\tvalid_0's ndcg@3: 0.976203\tvalid_0's ndcg@4: 0.976429\tvalid_0's ndcg@5: 0.976487\n", - "[21]\tvalid_0's ndcg@1: 0.938825\tvalid_0's ndcg@2: 0.975072\tvalid_0's ndcg@3: 0.976597\tvalid_0's ndcg@4: 0.976823\tvalid_0's ndcg@5: 0.976881\n", - "[22]\tvalid_0's ndcg@1: 0.93885\tvalid_0's ndcg@2: 0.975097\tvalid_0's ndcg@3: 0.976609\tvalid_0's ndcg@4: 0.976846\tvalid_0's ndcg@5: 0.976895\n", - "[23]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975246\tvalid_0's ndcg@3: 0.976733\tvalid_0's ndcg@4: 0.976959\tvalid_0's ndcg@5: 0.977008\n", - "[24]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975246\tvalid_0's ndcg@3: 0.976721\tvalid_0's ndcg@4: 0.976947\tvalid_0's ndcg@5: 0.977005\n", - "[25]\tvalid_0's ndcg@1: 0.9396\tvalid_0's ndcg@2: 0.975421\tvalid_0's ndcg@3: 0.976909\tvalid_0's ndcg@4: 0.977124\tvalid_0's ndcg@5: 
0.977182\n", - "[26]\tvalid_0's ndcg@1: 0.9393\tvalid_0's ndcg@2: 0.975342\tvalid_0's ndcg@3: 0.976804\tvalid_0's ndcg@4: 0.97702\tvalid_0's ndcg@5: 0.977078\n", - "[27]\tvalid_0's ndcg@1: 0.93925\tvalid_0's ndcg@2: 0.975323\tvalid_0's ndcg@3: 0.976798\tvalid_0's ndcg@4: 0.977014\tvalid_0's ndcg@5: 0.977062\n", - "[28]\tvalid_0's ndcg@1: 0.93925\tvalid_0's ndcg@2: 0.975308\tvalid_0's ndcg@3: 0.976783\tvalid_0's ndcg@4: 0.977009\tvalid_0's ndcg@5: 0.977057\n", - "[29]\tvalid_0's ndcg@1: 0.94\tvalid_0's ndcg@2: 0.975569\tvalid_0's ndcg@3: 0.977056\tvalid_0's ndcg@4: 0.977282\tvalid_0's ndcg@5: 0.977331\n", - "[30]\tvalid_0's ndcg@1: 0.940325\tvalid_0's ndcg@2: 0.975673\tvalid_0's ndcg@3: 0.977173\tvalid_0's ndcg@4: 0.977399\tvalid_0's ndcg@5: 0.977447\n", - "[31]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975731\tvalid_0's ndcg@3: 0.977243\tvalid_0's ndcg@4: 0.977469\tvalid_0's ndcg@5: 0.977518\n", - "[32]\tvalid_0's ndcg@1: 0.940625\tvalid_0's ndcg@2: 0.975831\tvalid_0's ndcg@3: 0.977306\tvalid_0's ndcg@4: 0.977521\tvalid_0's ndcg@5: 0.97757\n", - "[33]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975766\tvalid_0's ndcg@3: 0.977241\tvalid_0's ndcg@4: 0.977457\tvalid_0's ndcg@5: 0.977505\n", - "[34]\tvalid_0's ndcg@1: 0.940625\tvalid_0's ndcg@2: 0.975831\tvalid_0's ndcg@3: 0.977306\tvalid_0's ndcg@4: 0.977521\tvalid_0's ndcg@5: 0.97757\n", - "[35]\tvalid_0's ndcg@1: 0.940725\tvalid_0's ndcg@2: 0.975868\tvalid_0's ndcg@3: 0.977343\tvalid_0's ndcg@4: 0.977558\tvalid_0's ndcg@5: 0.977606\n", - "[36]\tvalid_0's ndcg@1: 0.94115\tvalid_0's ndcg@2: 0.976056\tvalid_0's ndcg@3: 0.977506\tvalid_0's ndcg@4: 0.977722\tvalid_0's ndcg@5: 0.97777\n", - "[37]\tvalid_0's ndcg@1: 0.9414\tvalid_0's ndcg@2: 0.976133\tvalid_0's ndcg@3: 0.977595\tvalid_0's ndcg@4: 0.977811\tvalid_0's ndcg@5: 0.977859\n", - "[38]\tvalid_0's ndcg@1: 0.94175\tvalid_0's ndcg@2: 0.976278\tvalid_0's ndcg@3: 0.977715\tvalid_0's ndcg@4: 0.977941\tvalid_0's ndcg@5: 0.97799\n", - "[39]\tvalid_0's ndcg@1: 
0.942075\tvalid_0's ndcg@2: 0.976366\tvalid_0's ndcg@3: 0.977841\tvalid_0's ndcg@4: 0.978056\tvalid_0's ndcg@5: 0.978105\n", - "[40]\tvalid_0's ndcg@1: 0.94215\tvalid_0's ndcg@2: 0.976409\tvalid_0's ndcg@3: 0.977872\tvalid_0's ndcg@4: 0.978087\tvalid_0's ndcg@5: 0.978136\n", - "[41]\tvalid_0's ndcg@1: 0.94245\tvalid_0's ndcg@2: 0.97652\tvalid_0's ndcg@3: 0.977983\tvalid_0's ndcg@4: 0.978198\tvalid_0's ndcg@5: 0.978246\n", - "[42]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976682\tvalid_0's ndcg@3: 0.97817\tvalid_0's ndcg@4: 0.978385\tvalid_0's ndcg@5: 0.978434\n", - "[43]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976682\tvalid_0's ndcg@3: 0.97817\tvalid_0's ndcg@4: 0.978385\tvalid_0's ndcg@5: 0.978434\n", - "[44]\tvalid_0's ndcg@1: 0.94285\tvalid_0's ndcg@2: 0.976636\tvalid_0's ndcg@3: 0.978111\tvalid_0's ndcg@4: 0.978337\tvalid_0's ndcg@5: 0.978386\n", - "[45]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.9768\tvalid_0's ndcg@3: 0.978262\tvalid_0's ndcg@4: 0.978488\tvalid_0's ndcg@5: 0.978537\n", - "[46]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976913\tvalid_0's ndcg@3: 0.978388\tvalid_0's ndcg@4: 0.978614\tvalid_0's ndcg@5: 0.978663\n", - "[47]\tvalid_0's ndcg@1: 0.943525\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.97836\tvalid_0's ndcg@4: 0.978576\tvalid_0's ndcg@5: 0.978634\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 返回排序后的结果" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[48]\tvalid_0's ndcg@1: 0.943525\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.978373\tvalid_0's ndcg@4: 0.978577\tvalid_0's ndcg@5: 0.978636\n", - "[49]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976913\tvalid_0's ndcg@3: 0.978388\tvalid_0's ndcg@4: 0.978614\tvalid_0's ndcg@5: 0.978663\n", - "[50]\tvalid_0's ndcg@1: 0.943975\tvalid_0's ndcg@2: 0.97702\tvalid_0's ndcg@3: 0.97852\tvalid_0's ndcg@4: 0.978746\tvalid_0's ndcg@5: 0.978794\n", - "[51]\tvalid_0's ndcg@1: 0.9441\tvalid_0's ndcg@2: 0.97705\tvalid_0's 
ndcg@3: 0.97855\tvalid_0's ndcg@4: 0.978787\tvalid_0's ndcg@5: 0.978836\n", - "[52]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.977121\tvalid_0's ndcg@3: 0.978609\tvalid_0's ndcg@4: 0.978846\tvalid_0's ndcg@5: 0.978894\n", - "[53]\tvalid_0's ndcg@1: 0.944225\tvalid_0's ndcg@2: 0.977081\tvalid_0's ndcg@3: 0.978618\tvalid_0's ndcg@4: 0.978834\tvalid_0's ndcg@5: 0.978882\n", - "[54]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.977071\tvalid_0's ndcg@3: 0.978609\tvalid_0's ndcg@4: 0.978824\tvalid_0's ndcg@5: 0.978873\n", - "[55]\tvalid_0's ndcg@1: 0.94435\tvalid_0's ndcg@2: 0.977143\tvalid_0's ndcg@3: 0.978668\tvalid_0's ndcg@4: 0.978883\tvalid_0's ndcg@5: 0.978931\n", - "[56]\tvalid_0's ndcg@1: 0.9444\tvalid_0's ndcg@2: 0.977177\tvalid_0's ndcg@3: 0.978702\tvalid_0's ndcg@4: 0.978906\tvalid_0's ndcg@5: 0.978955\n", - "[57]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.977263\tvalid_0's ndcg@3: 0.978788\tvalid_0's ndcg@4: 0.979003\tvalid_0's ndcg@5: 0.979051\n", - "[58]\tvalid_0's ndcg@1: 0.9448\tvalid_0's ndcg@2: 0.977293\tvalid_0's ndcg@3: 0.978843\tvalid_0's ndcg@4: 0.979047\tvalid_0's ndcg@5: 0.979096\n", - "[59]\tvalid_0's ndcg@1: 0.9452\tvalid_0's ndcg@2: 0.977472\tvalid_0's ndcg@3: 0.978997\tvalid_0's ndcg@4: 0.979202\tvalid_0's ndcg@5: 0.97925\n", - "[60]\tvalid_0's ndcg@1: 0.9455\tvalid_0's ndcg@2: 0.97763\tvalid_0's ndcg@3: 0.979118\tvalid_0's ndcg@4: 0.979322\tvalid_0's ndcg@5: 0.979371\n", - "[61]\tvalid_0's ndcg@1: 0.945725\tvalid_0's ndcg@2: 0.977682\tvalid_0's ndcg@3: 0.979194\tvalid_0's ndcg@4: 0.979399\tvalid_0's ndcg@5: 0.979447\n", - "[62]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977812\tvalid_0's ndcg@3: 0.979312\tvalid_0's ndcg@4: 0.979495\tvalid_0's ndcg@5: 0.979543\n", - "[63]\tvalid_0's ndcg@1: 0.946\tvalid_0's ndcg@2: 0.977878\tvalid_0's ndcg@3: 0.97934\tvalid_0's ndcg@4: 0.979523\tvalid_0's ndcg@5: 0.979572\n", - "[64]\tvalid_0's ndcg@1: 0.946525\tvalid_0's ndcg@2: 0.978056\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 
0.979714\tvalid_0's ndcg@5: 0.979762\n", - "[65]\tvalid_0's ndcg@1: 0.9467\tvalid_0's ndcg@2: 0.978105\tvalid_0's ndcg@3: 0.979592\tvalid_0's ndcg@4: 0.979775\tvalid_0's ndcg@5: 0.979823\n", - "[66]\tvalid_0's ndcg@1: 0.9465\tvalid_0's ndcg@2: 0.978046\tvalid_0's ndcg@3: 0.979534\tvalid_0's ndcg@4: 0.979706\tvalid_0's ndcg@5: 0.979755\n", - "[67]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.978127\tvalid_0's ndcg@3: 0.979614\tvalid_0's ndcg@4: 0.979776\tvalid_0's ndcg@5: 0.979824\n", - "[68]\tvalid_0's ndcg@1: 0.9467\tvalid_0's ndcg@2: 0.97812\tvalid_0's ndcg@3: 0.979608\tvalid_0's ndcg@4: 0.97978\tvalid_0's ndcg@5: 0.979828\n", - "[69]\tvalid_0's ndcg@1: 0.946875\tvalid_0's ndcg@2: 0.978216\tvalid_0's ndcg@3: 0.979679\tvalid_0's ndcg@4: 0.979851\tvalid_0's ndcg@5: 0.9799\n", - "[70]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.978194\tvalid_0's ndcg@3: 0.979682\tvalid_0's ndcg@4: 0.979854\tvalid_0's ndcg@5: 0.979902\n", - "[71]\tvalid_0's ndcg@1: 0.947025\tvalid_0's ndcg@2: 0.978209\tvalid_0's ndcg@3: 0.979721\tvalid_0's ndcg@4: 0.979893\tvalid_0's ndcg@5: 0.979942\n", - "[72]\tvalid_0's ndcg@1: 0.9472\tvalid_0's ndcg@2: 0.978273\tvalid_0's ndcg@3: 0.979773\tvalid_0's ndcg@4: 0.979956\tvalid_0's ndcg@5: 0.980005\n", - "[73]\tvalid_0's ndcg@1: 0.947475\tvalid_0's ndcg@2: 0.978391\tvalid_0's ndcg@3: 0.979878\tvalid_0's ndcg@4: 0.980061\tvalid_0's ndcg@5: 0.980109\n", - "[74]\tvalid_0's ndcg@1: 0.94715\tvalid_0's ndcg@2: 0.978271\tvalid_0's ndcg@3: 0.979758\tvalid_0's ndcg@4: 0.979941\tvalid_0's ndcg@5: 0.97999\n", - "[75]\tvalid_0's ndcg@1: 0.947275\tvalid_0's ndcg@2: 0.978333\tvalid_0's ndcg@3: 0.979808\tvalid_0's ndcg@4: 0.979991\tvalid_0's ndcg@5: 0.980039\n", - "[76]\tvalid_0's ndcg@1: 0.9474\tvalid_0's ndcg@2: 0.97841\tvalid_0's ndcg@3: 0.979873\tvalid_0's ndcg@4: 0.980045\tvalid_0's ndcg@5: 0.980093\n", - "[77]\tvalid_0's ndcg@1: 0.94745\tvalid_0's ndcg@2: 0.97846\tvalid_0's ndcg@3: 0.979898\tvalid_0's ndcg@4: 0.98007\tvalid_0's ndcg@5: 0.980118\n", - 
"[78]\tvalid_0's ndcg@1: 0.94775\tvalid_0's ndcg@2: 0.978555\tvalid_0's ndcg@3: 0.980005\tvalid_0's ndcg@4: 0.980177\tvalid_0's ndcg@5: 0.980226\n", - "[79]\tvalid_0's ndcg@1: 0.947875\tvalid_0's ndcg@2: 0.978617\tvalid_0's ndcg@3: 0.980055\tvalid_0's ndcg@4: 0.980238\tvalid_0's ndcg@5: 0.980276\n", - "[80]\tvalid_0's ndcg@1: 0.947875\tvalid_0's ndcg@2: 0.978617\tvalid_0's ndcg@3: 0.980055\tvalid_0's ndcg@4: 0.980238\tvalid_0's ndcg@5: 0.980276\n", - "[81]\tvalid_0's ndcg@1: 0.948175\tvalid_0's ndcg@2: 0.978744\tvalid_0's ndcg@3: 0.980169\tvalid_0's ndcg@4: 0.980352\tvalid_0's ndcg@5: 0.98039\n", - "[82]\tvalid_0's ndcg@1: 0.948375\tvalid_0's ndcg@2: 0.97888\tvalid_0's ndcg@3: 0.980255\tvalid_0's ndcg@4: 0.980438\tvalid_0's ndcg@5: 0.980477\n", - "[83]\tvalid_0's ndcg@1: 0.94825\tvalid_0's ndcg@2: 0.978834\tvalid_0's ndcg@3: 0.980209\tvalid_0's ndcg@4: 0.980392\tvalid_0's ndcg@5: 0.980431\n", - "[84]\tvalid_0's ndcg@1: 0.948275\tvalid_0's ndcg@2: 0.978844\tvalid_0's ndcg@3: 0.980219\tvalid_0's ndcg@4: 0.980402\tvalid_0's ndcg@5: 0.98044\n", - "[85]\tvalid_0's ndcg@1: 0.948475\tvalid_0's ndcg@2: 0.978917\tvalid_0's ndcg@3: 0.980292\tvalid_0's ndcg@4: 0.980475\tvalid_0's ndcg@5: 0.980514\n", - "[86]\tvalid_0's ndcg@1: 0.948975\tvalid_0's ndcg@2: 0.979102\tvalid_0's ndcg@3: 0.980477\tvalid_0's ndcg@4: 0.98066\tvalid_0's ndcg@5: 0.980699\n", - "[87]\tvalid_0's ndcg@1: 0.948975\tvalid_0's ndcg@2: 0.979086\tvalid_0's ndcg@3: 0.980474\tvalid_0's ndcg@4: 0.980657\tvalid_0's ndcg@5: 0.980695\n", - "[88]\tvalid_0's ndcg@1: 0.949025\tvalid_0's ndcg@2: 0.979136\tvalid_0's ndcg@3: 0.980499\tvalid_0's ndcg@4: 0.980682\tvalid_0's ndcg@5: 0.98072\n", - "[89]\tvalid_0's ndcg@1: 0.9493\tvalid_0's ndcg@2: 0.979285\tvalid_0's ndcg@3: 0.98061\tvalid_0's ndcg@4: 0.980793\tvalid_0's ndcg@5: 0.980832\n", - "[90]\tvalid_0's ndcg@1: 0.9493\tvalid_0's ndcg@2: 0.979269\tvalid_0's ndcg@3: 0.980607\tvalid_0's ndcg@4: 0.98079\tvalid_0's ndcg@5: 0.980828\n", - "[91]\tvalid_0's ndcg@1: 
0.9493\tvalid_0's ndcg@2: 0.979269\tvalid_0's ndcg@3: 0.980607\tvalid_0's ndcg@4: 0.98079\tvalid_0's ndcg@5: 0.980828\n", - "[92]\tvalid_0's ndcg@1: 0.9494\tvalid_0's ndcg@2: 0.97929\tvalid_0's ndcg@3: 0.98064\tvalid_0's ndcg@4: 0.980823\tvalid_0's ndcg@5: 0.980862\n", - "[93]\tvalid_0's ndcg@1: 0.949375\tvalid_0's ndcg@2: 0.979297\tvalid_0's ndcg@3: 0.980634\tvalid_0's ndcg@4: 0.980817\tvalid_0's ndcg@5: 0.980856\n", - "[94]\tvalid_0's ndcg@1: 0.949525\tvalid_0's ndcg@2: 0.979336\tvalid_0's ndcg@3: 0.980686\tvalid_0's ndcg@4: 0.980869\tvalid_0's ndcg@5: 0.980908\n", - "[95]\tvalid_0's ndcg@1: 0.949825\tvalid_0's ndcg@2: 0.979416\tvalid_0's ndcg@3: 0.980791\tvalid_0's ndcg@4: 0.980974\tvalid_0's ndcg@5: 0.981012\n", - "[96]\tvalid_0's ndcg@1: 0.94975\tvalid_0's ndcg@2: 0.979404\tvalid_0's ndcg@3: 0.980779\tvalid_0's ndcg@4: 0.980951\tvalid_0's ndcg@5: 0.98099\n", - "[97]\tvalid_0's ndcg@1: 0.950025\tvalid_0's ndcg@2: 0.979537\tvalid_0's ndcg@3: 0.980874\tvalid_0's ndcg@4: 0.981057\tvalid_0's ndcg@5: 0.981096\n", - "[98]\tvalid_0's ndcg@1: 0.9501\tvalid_0's ndcg@2: 0.979564\tvalid_0's ndcg@3: 0.980889\tvalid_0's ndcg@4: 0.981083\tvalid_0's ndcg@5: 0.981122\n", - "[99]\tvalid_0's ndcg@1: 0.950275\tvalid_0's ndcg@2: 0.979629\tvalid_0's ndcg@3: 0.980967\tvalid_0's ndcg@4: 0.98115\tvalid_0's ndcg@5: 0.981188\n", - "[100]\tvalid_0's ndcg@1: 0.950325\tvalid_0's ndcg@2: 0.979647\tvalid_0's ndcg@3: 0.980985\tvalid_0's ndcg@4: 0.981168\tvalid_0's ndcg@5: 0.981207\n", - "Did not meet early stopping. 
Best iteration is:\n", - "[100]\tvalid_0's ndcg@1: 0.950325\tvalid_0's ndcg@2: 0.979647\tvalid_0's ndcg@3: 0.980985\tvalid_0's ndcg@4: 0.981168\tvalid_0's ndcg@5: 0.981207\n", - "[1]\tvalid_0's ndcg@1: 0.910175\tvalid_0's ndcg@2: 0.96382\tvalid_0's ndcg@3: 0.965707\tvalid_0's ndcg@4: 0.966009\tvalid_0's ndcg@5: 0.966086\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's ndcg@1: 0.91415\tvalid_0's ndcg@2: 0.965492\tvalid_0's ndcg@3: 0.967254\tvalid_0's ndcg@4: 0.967556\tvalid_0's ndcg@5: 0.967604\n", - "[3]\tvalid_0's ndcg@1: 0.916025\tvalid_0's ndcg@2: 0.966389\tvalid_0's ndcg@3: 0.967976\tvalid_0's ndcg@4: 0.968278\tvalid_0's ndcg@5: 0.968355\n", - "[4]\tvalid_0's ndcg@1: 0.919\tvalid_0's ndcg@2: 0.967392\tvalid_0's ndcg@3: 0.96903\tvalid_0's ndcg@4: 0.969364\tvalid_0's ndcg@5: 0.969431\n", - "[5]\tvalid_0's ndcg@1: 0.921125\tvalid_0's ndcg@2: 0.968192\tvalid_0's ndcg@3: 0.969855\tvalid_0's ndcg@4: 0.970156\tvalid_0's ndcg@5: 0.970224\n", - "[6]\tvalid_0's ndcg@1: 0.921675\tvalid_0's ndcg@2: 0.968411\tvalid_0's ndcg@3: 0.970111\tvalid_0's ndcg@4: 0.97037\tvalid_0's ndcg@5: 0.970437\n", - "[7]\tvalid_0's ndcg@1: 0.9237\tvalid_0's ndcg@2: 0.969332\tvalid_0's ndcg@3: 0.970882\tvalid_0's ndcg@4: 0.97113\tvalid_0's ndcg@5: 0.971217\n", - "[8]\tvalid_0's ndcg@1: 0.925775\tvalid_0's ndcg@2: 0.970129\tvalid_0's ndcg@3: 0.971642\tvalid_0's ndcg@4: 0.971922\tvalid_0's ndcg@5: 0.97199\n", - "[9]\tvalid_0's ndcg@1: 0.926775\tvalid_0's ndcg@2: 0.970435\tvalid_0's ndcg@3: 0.971985\tvalid_0's ndcg@4: 0.972276\tvalid_0's ndcg@5: 0.972334\n" - ] + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:01.809368Z", + "start_time": "2020-11-18T04:21:01.799641Z" + } + }, + "outputs": [], + "source": [ + "def submit(recall_df, topk=5, model_name=None):\n", + " recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])\n", + " recall_df['rank'] = 
recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + " \n", + " # 判断是不是每个用户都有5篇文章及以上\n", + " tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())\n", + " assert tmp.min() >= topk\n", + " \n", + " del recall_df['pred_score']\n", + " submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()\n", + " \n", + " submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]\n", + " # 按照提交格式定义列名\n", + " submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', \n", + " 3: 'article_3', 4: 'article_4', 5: 'article_5'})\n", + " \n", + " save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'\n", + " submit.to_csv(save_name, index=False, header=True)" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[10]\tvalid_0's ndcg@1: 0.9277\tvalid_0's ndcg@2: 0.970761\tvalid_0's ndcg@3: 0.972311\tvalid_0's ndcg@4: 0.972612\tvalid_0's ndcg@5: 0.97267\n", - "[11]\tvalid_0's ndcg@1: 0.928975\tvalid_0's ndcg@2: 0.97131\tvalid_0's ndcg@3: 0.972798\tvalid_0's ndcg@4: 0.973089\tvalid_0's ndcg@5: 0.973166\n", - "[12]\tvalid_0's ndcg@1: 0.929375\tvalid_0's ndcg@2: 0.971505\tvalid_0's ndcg@3: 0.972968\tvalid_0's ndcg@4: 0.973259\tvalid_0's ndcg@5: 0.973326\n", - "[13]\tvalid_0's ndcg@1: 0.929375\tvalid_0's ndcg@2: 0.971426\tvalid_0's ndcg@3: 0.972939\tvalid_0's ndcg@4: 0.97324\tvalid_0's ndcg@5: 0.973318\n", - "[14]\tvalid_0's ndcg@1: 0.929775\tvalid_0's ndcg@2: 0.971621\tvalid_0's ndcg@3: 0.973121\tvalid_0's ndcg@4: 0.973412\tvalid_0's ndcg@5: 0.97348\n", - "[15]\tvalid_0's ndcg@1: 0.9304\tvalid_0's ndcg@2: 0.971868\tvalid_0's ndcg@3: 0.97338\tvalid_0's ndcg@4: 0.97365\tvalid_0's ndcg@5: 0.973717\n", - "[16]\tvalid_0's ndcg@1: 0.930975\tvalid_0's ndcg@2: 0.972096\tvalid_0's ndcg@3: 0.973558\tvalid_0's ndcg@4: 0.973849\tvalid_0's ndcg@5: 0.973926\n", - "[17]\tvalid_0's ndcg@1: 
0.93105\tvalid_0's ndcg@2: 0.972108\tvalid_0's ndcg@3: 0.973583\tvalid_0's ndcg@4: 0.973884\tvalid_0's ndcg@5: 0.973952\n", - "[18]\tvalid_0's ndcg@1: 0.931725\tvalid_0's ndcg@2: 0.972373\tvalid_0's ndcg@3: 0.97386\tvalid_0's ndcg@4: 0.974129\tvalid_0's ndcg@5: 0.974207\n", - "[19]\tvalid_0's ndcg@1: 0.932175\tvalid_0's ndcg@2: 0.972681\tvalid_0's ndcg@3: 0.974068\tvalid_0's ndcg@4: 0.974348\tvalid_0's ndcg@5: 0.974406\n", - "[20]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.973019\tvalid_0's ndcg@3: 0.974382\tvalid_0's ndcg@4: 0.974673\tvalid_0's ndcg@5: 0.974731\n", - "[21]\tvalid_0's ndcg@1: 0.933075\tvalid_0's ndcg@2: 0.97306\tvalid_0's ndcg@3: 0.974423\tvalid_0's ndcg@4: 0.974703\tvalid_0's ndcg@5: 0.97477\n", - "[22]\tvalid_0's ndcg@1: 0.93375\tvalid_0's ndcg@2: 0.973262\tvalid_0's ndcg@3: 0.974649\tvalid_0's ndcg@4: 0.974929\tvalid_0's ndcg@5: 0.975007\n", - "[23]\tvalid_0's ndcg@1: 0.933675\tvalid_0's ndcg@2: 0.973219\tvalid_0's ndcg@3: 0.974606\tvalid_0's ndcg@4: 0.974886\tvalid_0's ndcg@5: 0.974973\n", - "[24]\tvalid_0's ndcg@1: 0.934\tvalid_0's ndcg@2: 0.97337\tvalid_0's ndcg@3: 0.974745\tvalid_0's ndcg@4: 0.975014\tvalid_0's ndcg@5: 0.975101\n", - "[25]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 0.973674\tvalid_0's ndcg@3: 0.975062\tvalid_0's ndcg@4: 0.975342\tvalid_0's ndcg@5: 0.97541\n", - "[26]\tvalid_0's ndcg@1: 0.93495\tvalid_0's ndcg@2: 0.973721\tvalid_0's ndcg@3: 0.975096\tvalid_0's ndcg@4: 0.975365\tvalid_0's ndcg@5: 0.975452\n", - "[27]\tvalid_0's ndcg@1: 0.9358\tvalid_0's ndcg@2: 0.974082\tvalid_0's ndcg@3: 0.975444\tvalid_0's ndcg@4: 0.975713\tvalid_0's ndcg@5: 0.975781\n", - "[28]\tvalid_0's ndcg@1: 0.935325\tvalid_0's ndcg@2: 0.973875\tvalid_0's ndcg@3: 0.975275\tvalid_0's ndcg@4: 0.975512\tvalid_0's ndcg@5: 0.975599\n", - "[29]\tvalid_0's ndcg@1: 0.935925\tvalid_0's ndcg@2: 0.974159\tvalid_0's ndcg@3: 0.975522\tvalid_0's ndcg@4: 0.975759\tvalid_0's ndcg@5: 0.975836\n", - "[30]\tvalid_0's ndcg@1: 0.9362\tvalid_0's ndcg@2: 
0.974214\tvalid_0's ndcg@3: 0.975589\tvalid_0's ndcg@4: 0.975847\tvalid_0's ndcg@5: 0.975924\n", - "[31]\tvalid_0's ndcg@1: 0.93625\tvalid_0's ndcg@2: 0.974216\tvalid_0's ndcg@3: 0.975629\tvalid_0's ndcg@4: 0.975876\tvalid_0's ndcg@5: 0.975944\n", - "[32]\tvalid_0's ndcg@1: 0.93665\tvalid_0's ndcg@2: 0.974427\tvalid_0's ndcg@3: 0.975814\tvalid_0's ndcg@4: 0.97603\tvalid_0's ndcg@5: 0.976107\n", - "[33]\tvalid_0's ndcg@1: 0.936775\tvalid_0's ndcg@2: 0.974505\tvalid_0's ndcg@3: 0.975855\tvalid_0's ndcg@4: 0.976081\tvalid_0's ndcg@5: 0.976158\n", - "[34]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.974643\tvalid_0's ndcg@3: 0.975993\tvalid_0's ndcg@4: 0.976219\tvalid_0's ndcg@5: 0.976296\n", - "[35]\tvalid_0's ndcg@1: 0.937675\tvalid_0's ndcg@2: 0.974805\tvalid_0's ndcg@3: 0.97618\tvalid_0's ndcg@4: 0.976406\tvalid_0's ndcg@5: 0.976484\n", - "[36]\tvalid_0's ndcg@1: 0.9382\tvalid_0's ndcg@2: 0.974983\tvalid_0's ndcg@3: 0.976371\tvalid_0's ndcg@4: 0.976597\tvalid_0's ndcg@5: 0.976674\n", - "[37]\tvalid_0's ndcg@1: 0.938175\tvalid_0's ndcg@2: 0.974974\tvalid_0's ndcg@3: 0.976349\tvalid_0's ndcg@4: 0.976586\tvalid_0's ndcg@5: 0.976663\n", - "[38]\tvalid_0's ndcg@1: 0.938675\tvalid_0's ndcg@2: 0.975143\tvalid_0's ndcg@3: 0.976518\tvalid_0's ndcg@4: 0.976776\tvalid_0's ndcg@5: 0.976844\n", - "[39]\tvalid_0's ndcg@1: 0.938575\tvalid_0's ndcg@2: 0.975106\tvalid_0's ndcg@3: 0.976481\tvalid_0's ndcg@4: 0.976739\tvalid_0's ndcg@5: 0.976807\n", - "[40]\tvalid_0's ndcg@1: 0.938675\tvalid_0's ndcg@2: 0.97519\tvalid_0's ndcg@3: 0.976528\tvalid_0's ndcg@4: 0.976775\tvalid_0's ndcg@5: 0.976853\n", - "[41]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.975347\tvalid_0's ndcg@3: 0.976697\tvalid_0's ndcg@4: 0.976934\tvalid_0's ndcg@5: 0.977001\n", - "[42]\tvalid_0's ndcg@1: 0.939825\tvalid_0's ndcg@2: 0.975599\tvalid_0's ndcg@3: 0.976961\tvalid_0's ndcg@4: 0.977198\tvalid_0's ndcg@5: 0.977266\n", - "[43]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.975639\tvalid_0's ndcg@3: 
0.976977\tvalid_0's ndcg@4: 0.977214\tvalid_0's ndcg@5: 0.977282\n", - "[44]\tvalid_0's ndcg@1: 0.9398\tvalid_0's ndcg@2: 0.975605\tvalid_0's ndcg@3: 0.976955\tvalid_0's ndcg@4: 0.977192\tvalid_0's ndcg@5: 0.97726\n", - "[45]\tvalid_0's ndcg@1: 0.9401\tvalid_0's ndcg@2: 0.9757\tvalid_0's ndcg@3: 0.977075\tvalid_0's ndcg@4: 0.977291\tvalid_0's ndcg@5: 0.977368\n", - "[46]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975845\tvalid_0's ndcg@3: 0.977183\tvalid_0's ndcg@4: 0.97742\tvalid_0's ndcg@5: 0.977497\n", - "[47]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.975854\tvalid_0's ndcg@3: 0.977204\tvalid_0's ndcg@4: 0.97743\tvalid_0's ndcg@5: 0.977508\n", - "[48]\tvalid_0's ndcg@1: 0.940575\tvalid_0's ndcg@2: 0.975923\tvalid_0's ndcg@3: 0.977273\tvalid_0's ndcg@4: 0.977488\tvalid_0's ndcg@5: 0.977556\n", - "[49]\tvalid_0's ndcg@1: 0.9407\tvalid_0's ndcg@2: 0.975922\tvalid_0's ndcg@3: 0.977297\tvalid_0's ndcg@4: 0.977501\tvalid_0's ndcg@5: 0.977588\n", - "[50]\tvalid_0's ndcg@1: 0.940725\tvalid_0's ndcg@2: 0.975947\tvalid_0's ndcg@3: 0.977322\tvalid_0's ndcg@4: 0.977505\tvalid_0's ndcg@5: 0.977592\n", - "[51]\tvalid_0's ndcg@1: 0.9406\tvalid_0's ndcg@2: 0.975837\tvalid_0's ndcg@3: 0.97725\tvalid_0's ndcg@4: 0.977422\tvalid_0's ndcg@5: 0.977509\n", - "[52]\tvalid_0's ndcg@1: 0.941075\tvalid_0's ndcg@2: 0.975997\tvalid_0's ndcg@3: 0.977422\tvalid_0's ndcg@4: 0.977594\tvalid_0's ndcg@5: 0.977691\n", - "[53]\tvalid_0's ndcg@1: 0.940925\tvalid_0's ndcg@2: 0.975989\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.977538\tvalid_0's ndcg@5: 0.977644\n", - "[54]\tvalid_0's ndcg@1: 0.94125\tvalid_0's ndcg@2: 0.976062\tvalid_0's ndcg@3: 0.977487\tvalid_0's ndcg@4: 0.977659\tvalid_0's ndcg@5: 0.977756\n", - "[55]\tvalid_0's ndcg@1: 0.94145\tvalid_0's ndcg@2: 0.976183\tvalid_0's ndcg@3: 0.97757\tvalid_0's ndcg@4: 0.977742\tvalid_0's ndcg@5: 0.977839\n", - "[56]\tvalid_0's ndcg@1: 0.941475\tvalid_0's ndcg@2: 0.976176\tvalid_0's ndcg@3: 0.977576\tvalid_0's ndcg@4: 
0.977748\tvalid_0's ndcg@5: 0.977845\n", - "[57]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.976139\tvalid_0's ndcg@3: 0.977539\tvalid_0's ndcg@4: 0.977712\tvalid_0's ndcg@5: 0.977808\n", - "[58]\tvalid_0's ndcg@1: 0.941675\tvalid_0's ndcg@2: 0.97625\tvalid_0's ndcg@3: 0.97765\tvalid_0's ndcg@4: 0.977822\tvalid_0's ndcg@5: 0.977919\n", - "[59]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.976253\tvalid_0's ndcg@3: 0.977653\tvalid_0's ndcg@4: 0.977836\tvalid_0's ndcg@5: 0.977932\n", - "[60]\tvalid_0's ndcg@1: 0.941675\tvalid_0's ndcg@2: 0.976234\tvalid_0's ndcg@3: 0.977634\tvalid_0's ndcg@4: 0.977817\tvalid_0's ndcg@5: 0.977914\n", - "[61]\tvalid_0's ndcg@1: 0.9419\tvalid_0's ndcg@2: 0.976333\tvalid_0's ndcg@3: 0.977745\tvalid_0's ndcg@4: 0.977918\tvalid_0's ndcg@5: 0.978005\n", - "[62]\tvalid_0's ndcg@1: 0.941975\tvalid_0's ndcg@2: 0.976345\tvalid_0's ndcg@3: 0.977757\tvalid_0's ndcg@4: 0.97794\tvalid_0's ndcg@5: 0.978027\n", - "[63]\tvalid_0's ndcg@1: 0.9423\tvalid_0's ndcg@2: 0.976496\tvalid_0's ndcg@3: 0.977871\tvalid_0's ndcg@4: 0.978065\tvalid_0's ndcg@5: 0.978152\n", - "[64]\tvalid_0's ndcg@1: 0.942625\tvalid_0's ndcg@2: 0.976632\tvalid_0's ndcg@3: 0.977995\tvalid_0's ndcg@4: 0.978188\tvalid_0's ndcg@5: 0.978275\n", - "[65]\tvalid_0's ndcg@1: 0.942575\tvalid_0's ndcg@2: 0.976629\tvalid_0's ndcg@3: 0.977979\tvalid_0's ndcg@4: 0.978173\tvalid_0's ndcg@5: 0.97826\n", - "[66]\tvalid_0's ndcg@1: 0.942725\tvalid_0's ndcg@2: 0.976685\tvalid_0's ndcg@3: 0.978035\tvalid_0's ndcg@4: 0.978229\tvalid_0's ndcg@5: 0.978316\n", - "[67]\tvalid_0's ndcg@1: 0.94275\tvalid_0's ndcg@2: 0.976678\tvalid_0's ndcg@3: 0.978041\tvalid_0's ndcg@4: 0.978224\tvalid_0's ndcg@5: 0.97832\n", - "[68]\tvalid_0's ndcg@1: 0.94275\tvalid_0's ndcg@2: 0.976694\tvalid_0's ndcg@3: 0.978044\tvalid_0's ndcg@4: 0.978227\tvalid_0's ndcg@5: 0.978324\n", - "[69]\tvalid_0's ndcg@1: 0.943\tvalid_0's ndcg@2: 0.976834\tvalid_0's ndcg@3: 0.978146\tvalid_0's ndcg@4: 0.978329\tvalid_0's ndcg@5: 
0.978426\n", - "[70]\tvalid_0's ndcg@1: 0.943025\tvalid_0's ndcg@2: 0.976827\tvalid_0's ndcg@3: 0.978152\tvalid_0's ndcg@4: 0.978324\tvalid_0's ndcg@5: 0.978431\n", - "[71]\tvalid_0's ndcg@1: 0.9432\tvalid_0's ndcg@2: 0.976923\tvalid_0's ndcg@3: 0.978236\tvalid_0's ndcg@4: 0.978397\tvalid_0's ndcg@5: 0.978504\n", - "[72]\tvalid_0's ndcg@1: 0.943225\tvalid_0's ndcg@2: 0.976917\tvalid_0's ndcg@3: 0.978254\tvalid_0's ndcg@4: 0.978405\tvalid_0's ndcg@5: 0.978511\n", - "[73]\tvalid_0's ndcg@1: 0.94315\tvalid_0's ndcg@2: 0.976936\tvalid_0's ndcg@3: 0.978236\tvalid_0's ndcg@4: 0.978409\tvalid_0's ndcg@5: 0.978496\n" - ] + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:04.332198Z", + "start_time": "2020-11-18T04:21:04.325020Z" + } + }, + "outputs": [], + "source": [ + "# 排序结果归一化\n", + "def norm_sim(sim_df, weight=0.0):\n", + " # print(sim_df.head())\n", + " min_sim = sim_df.min()\n", + " max_sim = sim_df.max()\n", + " if max_sim == min_sim:\n", + " sim_df = sim_df.apply(lambda sim: 1.0)\n", + " else:\n", + " sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))\n", + "\n", + " sim_df = sim_df.apply(lambda sim: sim + weight) # plus one\n", + " return sim_df" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[74]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976957\tvalid_0's ndcg@3: 0.97827\tvalid_0's ndcg@4: 0.978431\tvalid_0's ndcg@5: 0.978528\n", - "[75]\tvalid_0's ndcg@1: 0.943075\tvalid_0's ndcg@2: 0.976861\tvalid_0's ndcg@3: 0.978199\tvalid_0's ndcg@4: 0.97836\tvalid_0's ndcg@5: 0.978457\n", - "[76]\tvalid_0's ndcg@1: 0.94335\tvalid_0's ndcg@2: 0.976963\tvalid_0's ndcg@3: 0.978288\tvalid_0's ndcg@4: 0.978471\tvalid_0's ndcg@5: 0.978568\n", - "[77]\tvalid_0's ndcg@1: 0.94345\tvalid_0's ndcg@2: 0.977031\tvalid_0's ndcg@3: 0.978331\tvalid_0's ndcg@4: 0.978514\tvalid_0's ndcg@5: 0.978611\n", - "[78]\tvalid_0's ndcg@1: 0.943475\tvalid_0's ndcg@2: 
0.977088\tvalid_0's ndcg@3: 0.97835\tvalid_0's ndcg@4: 0.978533\tvalid_0's ndcg@5: 0.97863\n", - "[79]\tvalid_0's ndcg@1: 0.943625\tvalid_0's ndcg@2: 0.977096\tvalid_0's ndcg@3: 0.978396\tvalid_0's ndcg@4: 0.978579\tvalid_0's ndcg@5: 0.978676\n", - "[80]\tvalid_0's ndcg@1: 0.943825\tvalid_0's ndcg@2: 0.977154\tvalid_0's ndcg@3: 0.978479\tvalid_0's ndcg@4: 0.978651\tvalid_0's ndcg@5: 0.978748\n", - "[81]\tvalid_0's ndcg@1: 0.943775\tvalid_0's ndcg@2: 0.977135\tvalid_0's ndcg@3: 0.97846\tvalid_0's ndcg@4: 0.978633\tvalid_0's ndcg@5: 0.978729\n", - "[82]\tvalid_0's ndcg@1: 0.9443\tvalid_0's ndcg@2: 0.977361\tvalid_0's ndcg@3: 0.978673\tvalid_0's ndcg@4: 0.978845\tvalid_0's ndcg@5: 0.978933\n", - "[83]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.977324\tvalid_0's ndcg@3: 0.978624\tvalid_0's ndcg@4: 0.978796\tvalid_0's ndcg@5: 0.978893\n", - "[84]\tvalid_0's ndcg@1: 0.94405\tvalid_0's ndcg@2: 0.977253\tvalid_0's ndcg@3: 0.978565\tvalid_0's ndcg@4: 0.978737\tvalid_0's ndcg@5: 0.978834\n", - "[85]\tvalid_0's ndcg@1: 0.944175\tvalid_0's ndcg@2: 0.977283\tvalid_0's ndcg@3: 0.978633\tvalid_0's ndcg@4: 0.978795\tvalid_0's ndcg@5: 0.978882\n", - "[86]\tvalid_0's ndcg@1: 0.9445\tvalid_0's ndcg@2: 0.97745\tvalid_0's ndcg@3: 0.978763\tvalid_0's ndcg@4: 0.978924\tvalid_0's ndcg@5: 0.979011\n", - "[87]\tvalid_0's ndcg@1: 0.9445\tvalid_0's ndcg@2: 0.977419\tvalid_0's ndcg@3: 0.978756\tvalid_0's ndcg@4: 0.978918\tvalid_0's ndcg@5: 0.979005\n", - "[88]\tvalid_0's ndcg@1: 0.944825\tvalid_0's ndcg@2: 0.977554\tvalid_0's ndcg@3: 0.978867\tvalid_0's ndcg@4: 0.979039\tvalid_0's ndcg@5: 0.979126\n", - "[89]\tvalid_0's ndcg@1: 0.9454\tvalid_0's ndcg@2: 0.977767\tvalid_0's ndcg@3: 0.979079\tvalid_0's ndcg@4: 0.979262\tvalid_0's ndcg@5: 0.97934\n", - "[90]\tvalid_0's ndcg@1: 0.945375\tvalid_0's ndcg@2: 0.977773\tvalid_0's ndcg@3: 0.979073\tvalid_0's ndcg@4: 0.979256\tvalid_0's ndcg@5: 0.979334\n", - "[91]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977792\tvalid_0's ndcg@3: 
0.979092\tvalid_0's ndcg@4: 0.979275\tvalid_0's ndcg@5: 0.979352\n", - "[92]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977776\tvalid_0's ndcg@3: 0.979088\tvalid_0's ndcg@4: 0.979261\tvalid_0's ndcg@5: 0.979348\n", - "[93]\tvalid_0's ndcg@1: 0.945375\tvalid_0's ndcg@2: 0.977757\tvalid_0's ndcg@3: 0.979082\tvalid_0's ndcg@4: 0.979244\tvalid_0's ndcg@5: 0.979331\n", - "[94]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977761\tvalid_0's ndcg@3: 0.979061\tvalid_0's ndcg@4: 0.979223\tvalid_0's ndcg@5: 0.97931\n", - "[95]\tvalid_0's ndcg@1: 0.9454\tvalid_0's ndcg@2: 0.977798\tvalid_0's ndcg@3: 0.979086\tvalid_0's ndcg@4: 0.979258\tvalid_0's ndcg@5: 0.979345\n", - "[96]\tvalid_0's ndcg@1: 0.945825\tvalid_0's ndcg@2: 0.977955\tvalid_0's ndcg@3: 0.97923\tvalid_0's ndcg@4: 0.979413\tvalid_0's ndcg@5: 0.9795\n", - "[97]\tvalid_0's ndcg@1: 0.945925\tvalid_0's ndcg@2: 0.97796\tvalid_0's ndcg@3: 0.97926\tvalid_0's ndcg@4: 0.979443\tvalid_0's ndcg@5: 0.979531\n", - "[98]\tvalid_0's ndcg@1: 0.9464\tvalid_0's ndcg@2: 0.97812\tvalid_0's ndcg@3: 0.97942\tvalid_0's ndcg@4: 0.979625\tvalid_0's ndcg@5: 0.979702\n", - "[99]\tvalid_0's ndcg@1: 0.94655\tvalid_0's ndcg@2: 0.978191\tvalid_0's ndcg@3: 0.979479\tvalid_0's ndcg@4: 0.979683\tvalid_0's ndcg@5: 0.97977\n", - "[100]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.978244\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 0.979725\tvalid_0's ndcg@5: 0.979812\n", - "Did not meet early stopping. 
Best iteration is:\n", - "[100]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.978244\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 0.979725\tvalid_0's ndcg@5: 0.979812\n", - "[1]\tvalid_0's ndcg@1: 0.910175\tvalid_0's ndcg@2: 0.963031\tvalid_0's ndcg@3: 0.965281\tvalid_0's ndcg@4: 0.965819\tvalid_0's ndcg@5: 0.965887\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's ndcg@1: 0.9141\tvalid_0's ndcg@2: 0.964748\tvalid_0's ndcg@3: 0.96681\tvalid_0's ndcg@4: 0.967316\tvalid_0's ndcg@5: 0.967394\n", - "[3]\tvalid_0's ndcg@1: 0.915925\tvalid_0's ndcg@2: 0.9655\tvalid_0's ndcg@3: 0.967575\tvalid_0's ndcg@4: 0.968028\tvalid_0's ndcg@5: 0.968105\n", - "[4]\tvalid_0's ndcg@1: 0.91915\tvalid_0's ndcg@2: 0.966943\tvalid_0's ndcg@3: 0.968968\tvalid_0's ndcg@4: 0.969334\tvalid_0's ndcg@5: 0.969373\n", - "[5]\tvalid_0's ndcg@1: 0.920625\tvalid_0's ndcg@2: 0.967598\tvalid_0's ndcg@3: 0.969498\tvalid_0's ndcg@4: 0.969896\tvalid_0's ndcg@5: 0.969944\n", - "[6]\tvalid_0's ndcg@1: 0.922625\tvalid_0's ndcg@2: 0.968336\tvalid_0's ndcg@3: 0.970261\tvalid_0's ndcg@4: 0.970659\tvalid_0's ndcg@5: 0.970688\n", - "[7]\tvalid_0's ndcg@1: 0.923625\tvalid_0's ndcg@2: 0.968768\tvalid_0's ndcg@3: 0.970656\tvalid_0's ndcg@4: 0.971043\tvalid_0's ndcg@5: 0.971072\n", - "[8]\tvalid_0's ndcg@1: 0.925825\tvalid_0's ndcg@2: 0.969612\tvalid_0's ndcg@3: 0.971462\tvalid_0's ndcg@4: 0.97186\tvalid_0's ndcg@5: 0.971879\n", - "[9]\tvalid_0's ndcg@1: 0.926475\tvalid_0's ndcg@2: 0.969899\tvalid_0's ndcg@3: 0.971711\tvalid_0's ndcg@4: 0.97211\tvalid_0's ndcg@5: 0.972129\n", - "[10]\tvalid_0's ndcg@1: 0.927775\tvalid_0's ndcg@2: 0.97041\tvalid_0's ndcg@3: 0.972185\tvalid_0's ndcg@4: 0.972594\tvalid_0's ndcg@5: 0.972614\n", - "[11]\tvalid_0's ndcg@1: 0.92885\tvalid_0's ndcg@2: 0.970838\tvalid_0's ndcg@3: 0.972588\tvalid_0's ndcg@4: 0.973008\tvalid_0's ndcg@5: 0.973028\n", - "[12]\tvalid_0's ndcg@1: 0.930325\tvalid_0's ndcg@2: 0.971367\tvalid_0's ndcg@3: 0.973129\tvalid_0's 
ndcg@4: 0.973549\tvalid_0's ndcg@5: 0.973569\n", - "[13]\tvalid_0's ndcg@1: 0.931125\tvalid_0's ndcg@2: 0.971631\tvalid_0's ndcg@3: 0.973443\tvalid_0's ndcg@4: 0.973842\tvalid_0's ndcg@5: 0.973871\n", - "[14]\tvalid_0's ndcg@1: 0.931525\tvalid_0's ndcg@2: 0.971778\tvalid_0's ndcg@3: 0.973616\tvalid_0's ndcg@4: 0.973993\tvalid_0's ndcg@5: 0.974022\n", - "[15]\tvalid_0's ndcg@1: 0.9311\tvalid_0's ndcg@2: 0.9717\tvalid_0's ndcg@3: 0.973475\tvalid_0's ndcg@4: 0.973852\tvalid_0's ndcg@5: 0.973872\n", - "[16]\tvalid_0's ndcg@1: 0.931775\tvalid_0's ndcg@2: 0.971902\tvalid_0's ndcg@3: 0.973702\tvalid_0's ndcg@4: 0.97409\tvalid_0's ndcg@5: 0.974109\n", - "[17]\tvalid_0's ndcg@1: 0.931425\tvalid_0's ndcg@2: 0.971805\tvalid_0's ndcg@3: 0.97358\tvalid_0's ndcg@4: 0.973967\tvalid_0's ndcg@5: 0.973986\n", - "[18]\tvalid_0's ndcg@1: 0.931575\tvalid_0's ndcg@2: 0.971876\tvalid_0's ndcg@3: 0.973651\tvalid_0's ndcg@4: 0.974027\tvalid_0's ndcg@5: 0.974047\n", - "[19]\tvalid_0's ndcg@1: 0.932\tvalid_0's ndcg@2: 0.97208\tvalid_0's ndcg@3: 0.973805\tvalid_0's ndcg@4: 0.974192\tvalid_0's ndcg@5: 0.974212\n", - "[20]\tvalid_0's ndcg@1: 0.932075\tvalid_0's ndcg@2: 0.972092\tvalid_0's ndcg@3: 0.973829\tvalid_0's ndcg@4: 0.974217\tvalid_0's ndcg@5: 0.974236\n", - "[21]\tvalid_0's ndcg@1: 0.932675\tvalid_0's ndcg@2: 0.972282\tvalid_0's ndcg@3: 0.974057\tvalid_0's ndcg@4: 0.974444\tvalid_0's ndcg@5: 0.974454\n", - "[22]\tvalid_0's ndcg@1: 0.932925\tvalid_0's ndcg@2: 0.972358\tvalid_0's ndcg@3: 0.974146\tvalid_0's ndcg@4: 0.974533\tvalid_0's ndcg@5: 0.974543\n", - "[23]\tvalid_0's ndcg@1: 0.93325\tvalid_0's ndcg@2: 0.972478\tvalid_0's ndcg@3: 0.974253\tvalid_0's ndcg@4: 0.974651\tvalid_0's ndcg@5: 0.974661\n", - "[24]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972539\tvalid_0's ndcg@3: 0.974351\tvalid_0's ndcg@4: 0.974739\tvalid_0's ndcg@5: 0.974749\n", - "[25]\tvalid_0's ndcg@1: 0.93475\tvalid_0's ndcg@2: 0.973\tvalid_0's ndcg@3: 0.974788\tvalid_0's ndcg@4: 0.975197\tvalid_0's ndcg@5: 
0.975206\n", - "[26]\tvalid_0's ndcg@1: 0.935075\tvalid_0's ndcg@2: 0.97312\tvalid_0's ndcg@3: 0.974895\tvalid_0's ndcg@4: 0.975315\tvalid_0's ndcg@5: 0.975325\n", - "[27]\tvalid_0's ndcg@1: 0.9349\tvalid_0's ndcg@2: 0.973103\tvalid_0's ndcg@3: 0.974865\tvalid_0's ndcg@4: 0.975264\tvalid_0's ndcg@5: 0.975273\n", - "[28]\tvalid_0's ndcg@1: 0.935075\tvalid_0's ndcg@2: 0.973152\tvalid_0's ndcg@3: 0.974939\tvalid_0's ndcg@4: 0.975327\tvalid_0's ndcg@5: 0.975336\n", - "[29]\tvalid_0's ndcg@1: 0.935475\tvalid_0's ndcg@2: 0.973315\tvalid_0's ndcg@3: 0.975128\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975492\n", - "[30]\tvalid_0's ndcg@1: 0.93595\tvalid_0's ndcg@2: 0.973522\tvalid_0's ndcg@3: 0.975297\tvalid_0's ndcg@4: 0.975663\tvalid_0's ndcg@5: 0.975673\n", - "[31]\tvalid_0's ndcg@1: 0.93595\tvalid_0's ndcg@2: 0.973506\tvalid_0's ndcg@3: 0.975281\tvalid_0's ndcg@4: 0.975658\tvalid_0's ndcg@5: 0.975668\n", - "[32]\tvalid_0's ndcg@1: 0.93675\tvalid_0's ndcg@2: 0.973833\tvalid_0's ndcg@3: 0.975595\tvalid_0's ndcg@4: 0.975961\tvalid_0's ndcg@5: 0.975971\n", - "[33]\tvalid_0's ndcg@1: 0.936475\tvalid_0's ndcg@2: 0.973763\tvalid_0's ndcg@3: 0.975488\tvalid_0's ndcg@4: 0.975865\tvalid_0's ndcg@5: 0.975874\n", - "[34]\tvalid_0's ndcg@1: 0.9367\tvalid_0's ndcg@2: 0.973893\tvalid_0's ndcg@3: 0.975568\tvalid_0's ndcg@4: 0.975956\tvalid_0's ndcg@5: 0.975966\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LGB排序模型" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[35]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.974059\tvalid_0's ndcg@3: 0.975722\tvalid_0's ndcg@4: 0.97612\tvalid_0's ndcg@5: 0.97613\n", - "[36]\tvalid_0's ndcg@1: 0.9374\tvalid_0's ndcg@2: 0.974183\tvalid_0's ndcg@3: 0.975846\tvalid_0's ndcg@4: 0.976223\tvalid_0's ndcg@5: 0.976232\n", - "[37]\tvalid_0's ndcg@1: 0.9374\tvalid_0's ndcg@2: 0.974183\tvalid_0's ndcg@3: 0.975846\tvalid_0's ndcg@4: 0.976223\tvalid_0's ndcg@5: 0.976232\n", - "[38]\tvalid_0's ndcg@1: 
0.938725\tvalid_0's ndcg@2: 0.974672\tvalid_0's ndcg@3: 0.97636\tvalid_0's ndcg@4: 0.976715\tvalid_0's ndcg@5: 0.976725\n", - "[39]\tvalid_0's ndcg@1: 0.93865\tvalid_0's ndcg@2: 0.974676\tvalid_0's ndcg@3: 0.976364\tvalid_0's ndcg@4: 0.976697\tvalid_0's ndcg@5: 0.976707\n", - "[40]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.974867\tvalid_0's ndcg@3: 0.97653\tvalid_0's ndcg@4: 0.976874\tvalid_0's ndcg@5: 0.976884\n", - "[41]\tvalid_0's ndcg@1: 0.9396\tvalid_0's ndcg@2: 0.975042\tvalid_0's ndcg@3: 0.976705\tvalid_0's ndcg@4: 0.97705\tvalid_0's ndcg@5: 0.977059\n", - "[42]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.975072\tvalid_0's ndcg@3: 0.976784\tvalid_0's ndcg@4: 0.977129\tvalid_0's ndcg@5: 0.977138\n", - "[43]\tvalid_0's ndcg@1: 0.940075\tvalid_0's ndcg@2: 0.97517\tvalid_0's ndcg@3: 0.97687\tvalid_0's ndcg@4: 0.977215\tvalid_0's ndcg@5: 0.977225\n", - "[44]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.97534\tvalid_0's ndcg@3: 0.977015\tvalid_0's ndcg@4: 0.97736\tvalid_0's ndcg@5: 0.97737\n", - "[45]\tvalid_0's ndcg@1: 0.94055\tvalid_0's ndcg@2: 0.975409\tvalid_0's ndcg@3: 0.977059\tvalid_0's ndcg@4: 0.977403\tvalid_0's ndcg@5: 0.977413\n", - "[46]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975415\tvalid_0's ndcg@3: 0.97704\tvalid_0's ndcg@4: 0.977396\tvalid_0's ndcg@5: 0.977405\n", - "[47]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975363\tvalid_0's ndcg@3: 0.977013\tvalid_0's ndcg@4: 0.977357\tvalid_0's ndcg@5: 0.977367\n", - "[48]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975388\tvalid_0's ndcg@3: 0.977025\tvalid_0's ndcg@4: 0.97737\tvalid_0's ndcg@5: 0.977379\n", - "[49]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975447\tvalid_0's ndcg@3: 0.977097\tvalid_0's ndcg@4: 0.977409\tvalid_0's ndcg@5: 0.977419\n", - "[50]\tvalid_0's ndcg@1: 0.941075\tvalid_0's ndcg@2: 0.975666\tvalid_0's ndcg@3: 0.977303\tvalid_0's ndcg@4: 0.977615\tvalid_0's ndcg@5: 0.977625\n", - "[51]\tvalid_0's ndcg@1: 0.94135\tvalid_0's ndcg@2: 
0.975751\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.97771\tvalid_0's ndcg@5: 0.97772\n", - "[52]\tvalid_0's ndcg@1: 0.9413\tvalid_0's ndcg@2: 0.975717\tvalid_0's ndcg@3: 0.977355\tvalid_0's ndcg@4: 0.977688\tvalid_0's ndcg@5: 0.977698\n", - "[53]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.975713\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.977699\tvalid_0's ndcg@5: 0.977718\n", - "[54]\tvalid_0's ndcg@1: 0.94185\tvalid_0's ndcg@2: 0.975857\tvalid_0's ndcg@3: 0.977557\tvalid_0's ndcg@4: 0.977869\tvalid_0's ndcg@5: 0.977889\n", - "[55]\tvalid_0's ndcg@1: 0.941925\tvalid_0's ndcg@2: 0.975837\tvalid_0's ndcg@3: 0.9776\tvalid_0's ndcg@4: 0.977891\tvalid_0's ndcg@5: 0.97791\n", - "[56]\tvalid_0's ndcg@1: 0.942325\tvalid_0's ndcg@2: 0.975969\tvalid_0's ndcg@3: 0.977719\tvalid_0's ndcg@4: 0.978032\tvalid_0's ndcg@5: 0.978051\n", - "[57]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976022\tvalid_0's ndcg@3: 0.977772\tvalid_0's ndcg@4: 0.978073\tvalid_0's ndcg@5: 0.978093\n", - "[58]\tvalid_0's ndcg@1: 0.9425\tvalid_0's ndcg@2: 0.976081\tvalid_0's ndcg@3: 0.977806\tvalid_0's ndcg@4: 0.978108\tvalid_0's ndcg@5: 0.978127\n", - "[59]\tvalid_0's ndcg@1: 0.9424\tvalid_0's ndcg@2: 0.976076\tvalid_0's ndcg@3: 0.977788\tvalid_0's ndcg@4: 0.978079\tvalid_0's ndcg@5: 0.978098\n", - "[60]\tvalid_0's ndcg@1: 0.942375\tvalid_0's ndcg@2: 0.976067\tvalid_0's ndcg@3: 0.977779\tvalid_0's ndcg@4: 0.97807\tvalid_0's ndcg@5: 0.978089\n", - "[61]\tvalid_0's ndcg@1: 0.942225\tvalid_0's ndcg@2: 0.976043\tvalid_0's ndcg@3: 0.97773\tvalid_0's ndcg@4: 0.978021\tvalid_0's ndcg@5: 0.97804\n", - "[62]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976117\tvalid_0's ndcg@3: 0.977792\tvalid_0's ndcg@4: 0.978093\tvalid_0's ndcg@5: 0.978112\n", - "[63]\tvalid_0's ndcg@1: 0.942675\tvalid_0's ndcg@2: 0.976193\tvalid_0's ndcg@3: 0.977881\tvalid_0's ndcg@4: 0.978182\tvalid_0's ndcg@5: 0.978201\n", - "[64]\tvalid_0's ndcg@1: 0.942925\tvalid_0's ndcg@2: 0.976254\tvalid_0's ndcg@3: 
0.977966\tvalid_0's ndcg@4: 0.978268\tvalid_0's ndcg@5: 0.978287\n", - "[65]\tvalid_0's ndcg@1: 0.9431\tvalid_0's ndcg@2: 0.97635\tvalid_0's ndcg@3: 0.978025\tvalid_0's ndcg@4: 0.978337\tvalid_0's ndcg@5: 0.978357\n", - "[66]\tvalid_0's ndcg@1: 0.9434\tvalid_0's ndcg@2: 0.976445\tvalid_0's ndcg@3: 0.978132\tvalid_0's ndcg@4: 0.978445\tvalid_0's ndcg@5: 0.978464\n", - "[67]\tvalid_0's ndcg@1: 0.943275\tvalid_0's ndcg@2: 0.976399\tvalid_0's ndcg@3: 0.978074\tvalid_0's ndcg@4: 0.978397\tvalid_0's ndcg@5: 0.978416\n", - "[68]\tvalid_0's ndcg@1: 0.943325\tvalid_0's ndcg@2: 0.976401\tvalid_0's ndcg@3: 0.978089\tvalid_0's ndcg@4: 0.978412\tvalid_0's ndcg@5: 0.978431\n", - "[69]\tvalid_0's ndcg@1: 0.943675\tvalid_0's ndcg@2: 0.976578\tvalid_0's ndcg@3: 0.97819\tvalid_0's ndcg@4: 0.978546\tvalid_0's ndcg@5: 0.978565\n", - "[70]\tvalid_0's ndcg@1: 0.944025\tvalid_0's ndcg@2: 0.976707\tvalid_0's ndcg@3: 0.97832\tvalid_0's ndcg@4: 0.978675\tvalid_0's ndcg@5: 0.978694\n", - "[71]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.976772\tvalid_0's ndcg@3: 0.978384\tvalid_0's ndcg@4: 0.97874\tvalid_0's ndcg@5: 0.978759\n", - "[72]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.976822\tvalid_0's ndcg@3: 0.978409\tvalid_0's ndcg@4: 0.978765\tvalid_0's ndcg@5: 0.978784\n", - "[73]\tvalid_0's ndcg@1: 0.94445\tvalid_0's ndcg@2: 0.976864\tvalid_0's ndcg@3: 0.978464\tvalid_0's ndcg@4: 0.97883\tvalid_0's ndcg@5: 0.978849\n", - "[74]\tvalid_0's ndcg@1: 0.9446\tvalid_0's ndcg@2: 0.976919\tvalid_0's ndcg@3: 0.978519\tvalid_0's ndcg@4: 0.978885\tvalid_0's ndcg@5: 0.978905\n", - "[75]\tvalid_0's ndcg@1: 0.9446\tvalid_0's ndcg@2: 0.976919\tvalid_0's ndcg@3: 0.978519\tvalid_0's ndcg@4: 0.978885\tvalid_0's ndcg@5: 0.978905\n", - "[76]\tvalid_0's ndcg@1: 0.944625\tvalid_0's ndcg@2: 0.97696\tvalid_0's ndcg@3: 0.978535\tvalid_0's ndcg@4: 0.978901\tvalid_0's ndcg@5: 0.978921\n", - "[77]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.976979\tvalid_0's ndcg@3: 0.978554\tvalid_0's ndcg@4: 
0.97892\tvalid_0's ndcg@5: 0.978939\n", - "[78]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.976979\tvalid_0's ndcg@3: 0.978554\tvalid_0's ndcg@4: 0.97892\tvalid_0's ndcg@5: 0.978939\n", - "[79]\tvalid_0's ndcg@1: 0.944525\tvalid_0's ndcg@2: 0.976907\tvalid_0's ndcg@3: 0.978507\tvalid_0's ndcg@4: 0.978863\tvalid_0's ndcg@5: 0.978882\n", - "[80]\tvalid_0's ndcg@1: 0.94455\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.97851\tvalid_0's ndcg@4: 0.978865\tvalid_0's ndcg@5: 0.978885\n", - "[81]\tvalid_0's ndcg@1: 0.944725\tvalid_0's ndcg@2: 0.97695\tvalid_0's ndcg@3: 0.978575\tvalid_0's ndcg@4: 0.978919\tvalid_0's ndcg@5: 0.978948\n", - "[82]\tvalid_0's ndcg@1: 0.945225\tvalid_0's ndcg@2: 0.977103\tvalid_0's ndcg@3: 0.978765\tvalid_0's ndcg@4: 0.97911\tvalid_0's ndcg@5: 0.979129\n", - "[83]\tvalid_0's ndcg@1: 0.945125\tvalid_0's ndcg@2: 0.977066\tvalid_0's ndcg@3: 0.978716\tvalid_0's ndcg@4: 0.979071\tvalid_0's ndcg@5: 0.97909\n", - "[84]\tvalid_0's ndcg@1: 0.945225\tvalid_0's ndcg@2: 0.97715\tvalid_0's ndcg@3: 0.978775\tvalid_0's ndcg@4: 0.97912\tvalid_0's ndcg@5: 0.979139\n", - "[85]\tvalid_0's ndcg@1: 0.945025\tvalid_0's ndcg@2: 0.977092\tvalid_0's ndcg@3: 0.978692\tvalid_0's ndcg@4: 0.979047\tvalid_0's ndcg@5: 0.979067\n", - "[86]\tvalid_0's ndcg@1: 0.9452\tvalid_0's ndcg@2: 0.977172\tvalid_0's ndcg@3: 0.97876\tvalid_0's ndcg@4: 0.979115\tvalid_0's ndcg@5: 0.979135\n", - "[87]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977178\tvalid_0's ndcg@3: 0.97879\tvalid_0's ndcg@4: 0.979156\tvalid_0's ndcg@5: 0.979166\n", - "[88]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977178\tvalid_0's ndcg@3: 0.978815\tvalid_0's ndcg@4: 0.979149\tvalid_0's ndcg@5: 0.979168\n", - "[89]\tvalid_0's ndcg@1: 0.94555\tvalid_0's ndcg@2: 0.977333\tvalid_0's ndcg@3: 0.978933\tvalid_0's ndcg@4: 0.979267\tvalid_0's ndcg@5: 0.979277\n", - "[90]\tvalid_0's ndcg@1: 0.9459\tvalid_0's ndcg@2: 0.977462\tvalid_0's ndcg@3: 0.979062\tvalid_0's ndcg@4: 0.979396\tvalid_0's ndcg@5: 0.979406\n", - 
"[91]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977481\tvalid_0's ndcg@3: 0.979081\tvalid_0's ndcg@4: 0.979414\tvalid_0's ndcg@5: 0.979424\n", - "[92]\tvalid_0's ndcg@1: 0.945875\tvalid_0's ndcg@2: 0.977437\tvalid_0's ndcg@3: 0.97905\tvalid_0's ndcg@4: 0.979384\tvalid_0's ndcg@5: 0.979393\n", - "[93]\tvalid_0's ndcg@1: 0.945875\tvalid_0's ndcg@2: 0.977421\tvalid_0's ndcg@3: 0.979046\tvalid_0's ndcg@4: 0.97938\tvalid_0's ndcg@5: 0.97939\n", - "[94]\tvalid_0's ndcg@1: 0.9459\tvalid_0's ndcg@2: 0.977431\tvalid_0's ndcg@3: 0.979068\tvalid_0's ndcg@4: 0.979391\tvalid_0's ndcg@5: 0.979401\n", - "[95]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977449\tvalid_0's ndcg@3: 0.979074\tvalid_0's ndcg@4: 0.979408\tvalid_0's ndcg@5: 0.979418\n", - "[96]\tvalid_0's ndcg@1: 0.946075\tvalid_0's ndcg@2: 0.977527\tvalid_0's ndcg@3: 0.979127\tvalid_0's ndcg@4: 0.979461\tvalid_0's ndcg@5: 0.97947\n" - ] + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:07.787698Z", + "start_time": "2020-11-18T04:21:07.536514Z" + } + }, + "outputs": [], + "source": [ + "# 防止中间出错之后重新读取数据\n", + "trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()\n", + "\n", + "if offline:\n", + " val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()\n", + " \n", + "tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[97]\tvalid_0's ndcg@1: 0.946375\tvalid_0's ndcg@2: 0.977622\tvalid_0's ndcg@3: 0.979222\tvalid_0's ndcg@4: 0.979577\tvalid_0's ndcg@5: 0.979577\n", - "[98]\tvalid_0's ndcg@1: 0.946625\tvalid_0's ndcg@2: 0.977714\tvalid_0's ndcg@3: 0.979339\tvalid_0's ndcg@4: 0.979673\tvalid_0's ndcg@5: 0.979673\n", - "[99]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.977739\tvalid_0's ndcg@3: 0.979352\tvalid_0's ndcg@4: 0.979685\tvalid_0's ndcg@5: 0.979685\n", - "[100]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.97778\tvalid_0's 
ndcg@3: 0.97938\tvalid_0's ndcg@4: 0.979703\tvalid_0's ndcg@5: 0.979703\n", - "Did not meet early stopping. Best iteration is:\n", - "[100]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.97778\tvalid_0's ndcg@3: 0.97938\tvalid_0's ndcg@4: 0.979703\tvalid_0's ndcg@5: 0.979703\n" - ] - } - ], - "source": [ - "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", - "# 这一部分与前面的单独训练和验证是分开的\n", - "def get_kfold_users(trn_df, n=5):\n", - " user_ids = trn_df['user_id'].unique()\n", - " user_set = [user_ids[i::n] for i in range(n)]\n", - " return user_set\n", - "\n", - "k_fold = 5\n", - "trn_df = trn_user_item_feats_df_rank_model\n", - "user_set = get_kfold_users(trn_df, n=k_fold)\n", - "\n", - "score_list = []\n", - "score_df = trn_df[['user_id', 'click_article_id','label']]\n", - "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", - "\n", - "# 五折交叉验证,并将中间结果保存用于staking\n", - "for n_fold, valid_user in enumerate(user_set):\n", - " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", - " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", - " \n", - " # 训练集与验证集的用户分组\n", - " train_idx.sort_values(by=['user_id'], inplace=True)\n", - " g_train = train_idx.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", - " \n", - " valid_idx.sort_values(by=['user_id'], inplace=True)\n", - " g_val = valid_idx.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", - " \n", - " # 定义模型\n", - " lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", - " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", - " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16) \n", - " # 训练模型\n", - " lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,\n", - " eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], \n", - " eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )\n", - " \n", - " # 
预测验证集结果\n", - " valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)\n", - " \n", - " # 对输出结果进行归一化\n", - " valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))\n", - " \n", - " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", - " valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - " \n", - " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", - " score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", - " \n", - " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", - " if not offline:\n", - " sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], lgb_ranker.best_iteration_)\n", - " \n", - "score_df_ = pd.concat(score_list, axis=0)\n", - "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n", - "# 保存训练集交叉验证产生的新特征\n", - "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)\n", - " \n", - "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", - "tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold\n", - "tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))\n", - "tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])\n", - "tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - "\n", - "# 保存测试集交叉验证的新特征\n", - "tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:22:52.604397Z", - "start_time": 
"2020-11-18T04:22:43.253034Z" - } - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "# 单模型生成提交结果\n", - "rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]\n", - "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", - "submit(rank_results, topk=5, model_name='lgb_ranker')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## LGB分类模型" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:22:58.259730Z", - "start_time": "2020-11-18T04:22:58.254297Z" - } - }, - "outputs": [], - "source": [ - "# 模型及参数的定义\n", - "lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", - " max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", - " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10) " - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:23:11.258774Z", - "start_time": "2020-11-18T04:23:00.861936Z" - } - }, - "outputs": [], - "source": [ - "# 模型训练\n", - "if offline:\n", - " lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],\n", - " eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], \n", - " eval_metric=['auc', ],early_stopping_rounds=50, )\n", - "else:\n", - " lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:23:19.591396Z", - "start_time": "2020-11-18T04:23:13.813850Z" - } - }, - "outputs": [], - "source": [ - "# 模型预测\n", - "tst_user_item_feats_df['pred_score'] = 
lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]\n", - "\n", - "# 将这里的排序结果保存一份,用户后面的模型融合\n", - "tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:23:32.352931Z", - "start_time": "2020-11-18T04:23:22.346609Z" - } - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]\n", - "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", - "submit(rank_results, topk=5, model_name='lgb_cls')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:24:11.241196Z", - "start_time": "2020-11-18T04:23:41.377394Z" + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:10.839656Z", + "start_time": "2020-11-18T04:21:10.833109Z" + } + }, + "outputs": [], + "source": [ + "# 定义特征列\n", + "lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', \n", + " 'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',\n", + " 'click_environment','click_deviceGroup', 'click_os', 'click_country', \n", + " 'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',\n", + " 'words_hbo', 'category_id', 'created_at_ts','words_count']" + ] }, - "scrolled": true - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1]\tvalid_0's auc: 0.764896\tvalid_0's binary_logloss: 0.522153\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's auc: 0.767857\tvalid_0's binary_logloss: 0.52057\n", - "[3]\tvalid_0's auc: 
0.783096\tvalid_0's binary_logloss: 0.519584\n", - "[4]\tvalid_0's auc: 0.784354\tvalid_0's binary_logloss: 0.518485\n", - "[5]\tvalid_0's auc: 0.790554\tvalid_0's binary_logloss: 0.516886\n", - "[6]\tvalid_0's auc: 0.791954\tvalid_0's binary_logloss: 0.515334\n", - "[7]\tvalid_0's auc: 0.794257\tvalid_0's binary_logloss: 0.514032\n", - "[8]\tvalid_0's auc: 0.795222\tvalid_0's binary_logloss: 0.512516\n", - "[9]\tvalid_0's auc: 0.795417\tvalid_0's binary_logloss: 0.511671\n", - "[10]\tvalid_0's auc: 0.795913\tvalid_0's binary_logloss: 0.510226\n", - "[11]\tvalid_0's auc: 0.798222\tvalid_0's binary_logloss: 0.508858\n", - "[12]\tvalid_0's auc: 0.79825\tvalid_0's binary_logloss: 0.507928\n", - "[13]\tvalid_0's auc: 0.798842\tvalid_0's binary_logloss: 0.50708\n", - "[14]\tvalid_0's auc: 0.798935\tvalid_0's binary_logloss: 0.505752\n", - "[15]\tvalid_0's auc: 0.799543\tvalid_0's binary_logloss: 0.504388\n", - "[16]\tvalid_0's auc: 0.800844\tvalid_0's binary_logloss: 0.503126\n", - "[17]\tvalid_0's auc: 0.800855\tvalid_0's binary_logloss: 0.501809\n", - "[18]\tvalid_0's auc: 0.801653\tvalid_0's binary_logloss: 0.500676\n", - "[19]\tvalid_0's auc: 0.801518\tvalid_0's binary_logloss: 0.49987\n", - "[20]\tvalid_0's auc: 0.801662\tvalid_0's binary_logloss: 0.498625\n", - "[21]\tvalid_0's auc: 0.802093\tvalid_0's binary_logloss: 0.498113\n", - "[22]\tvalid_0's auc: 0.803071\tvalid_0's binary_logloss: 0.496933\n", - "[23]\tvalid_0's auc: 0.803222\tvalid_0's binary_logloss: 0.495864\n", - "[24]\tvalid_0's auc: 0.802927\tvalid_0's binary_logloss: 0.494691\n", - "[25]\tvalid_0's auc: 0.802581\tvalid_0's binary_logloss: 0.493543\n", - "[26]\tvalid_0's auc: 0.802965\tvalid_0's binary_logloss: 0.492444\n", - "[27]\tvalid_0's auc: 0.80298\tvalid_0's binary_logloss: 0.491336\n", - "[28]\tvalid_0's auc: 0.803226\tvalid_0's binary_logloss: 0.490275\n", - "[29]\tvalid_0's auc: 0.803436\tvalid_0's binary_logloss: 0.489126\n", - "[30]\tvalid_0's auc: 0.803796\tvalid_0's binary_logloss: 
0.48802\n", - "[31]\tvalid_0's auc: 0.803601\tvalid_0's binary_logloss: 0.486988\n", - "[32]\tvalid_0's auc: 0.804416\tvalid_0's binary_logloss: 0.485972\n", - "[33]\tvalid_0's auc: 0.804529\tvalid_0's binary_logloss: 0.484939\n", - "[34]\tvalid_0's auc: 0.804534\tvalid_0's binary_logloss: 0.483927\n", - "[35]\tvalid_0's auc: 0.804819\tvalid_0's binary_logloss: 0.483271\n", - "[36]\tvalid_0's auc: 0.804774\tvalid_0's binary_logloss: 0.482273\n", - "[37]\tvalid_0's auc: 0.805237\tvalid_0's binary_logloss: 0.481639\n", - "[38]\tvalid_0's auc: 0.805546\tvalid_0's binary_logloss: 0.480959\n", - "[39]\tvalid_0's auc: 0.805598\tvalid_0's binary_logloss: 0.479955\n", - "[40]\tvalid_0's auc: 0.806011\tvalid_0's binary_logloss: 0.47903\n", - "[41]\tvalid_0's auc: 0.806664\tvalid_0's binary_logloss: 0.478439\n", - "[42]\tvalid_0's auc: 0.807021\tvalid_0's binary_logloss: 0.477798\n", - "[43]\tvalid_0's auc: 0.80726\tvalid_0's binary_logloss: 0.476829\n", - "[44]\tvalid_0's auc: 0.807157\tvalid_0's binary_logloss: 0.475976\n", - "[45]\tvalid_0's auc: 0.807788\tvalid_0's binary_logloss: 0.475056\n", - "[46]\tvalid_0's auc: 0.80805\tvalid_0's binary_logloss: 0.474446\n", - "[47]\tvalid_0's auc: 0.808097\tvalid_0's binary_logloss: 0.473576\n", - "[48]\tvalid_0's auc: 0.80815\tvalid_0's binary_logloss: 0.472676\n", - "[49]\tvalid_0's auc: 0.808304\tvalid_0's binary_logloss: 0.471918\n", - "[50]\tvalid_0's auc: 0.808749\tvalid_0's binary_logloss: 0.471481\n", - "[51]\tvalid_0's auc: 0.808972\tvalid_0's binary_logloss: 0.471104\n", - "[52]\tvalid_0's auc: 0.809326\tvalid_0's binary_logloss: 0.470289\n", - "[53]\tvalid_0's auc: 0.809472\tvalid_0's binary_logloss: 0.469508\n", - "[54]\tvalid_0's auc: 0.809505\tvalid_0's binary_logloss: 0.46869\n", - "[55]\tvalid_0's auc: 0.809594\tvalid_0's binary_logloss: 0.467885\n", - "[56]\tvalid_0's auc: 0.809847\tvalid_0's binary_logloss: 0.467356\n", - "[57]\tvalid_0's auc: 0.810262\tvalid_0's binary_logloss: 0.466531\n", - "[58]\tvalid_0's 
auc: 0.810407\tvalid_0's binary_logloss: 0.46573\n", - "[59]\tvalid_0's auc: 0.810618\tvalid_0's binary_logloss: 0.465205\n", - "[60]\tvalid_0's auc: 0.81066\tvalid_0's binary_logloss: 0.464435\n", - "[61]\tvalid_0's auc: 0.810638\tvalid_0's binary_logloss: 0.463721\n", - "[62]\tvalid_0's auc: 0.810658\tvalid_0's binary_logloss: 0.462982\n", - "[63]\tvalid_0's auc: 0.811106\tvalid_0's binary_logloss: 0.462246\n", - "[64]\tvalid_0's auc: 0.811313\tvalid_0's binary_logloss: 0.461748\n", - "[65]\tvalid_0's auc: 0.811351\tvalid_0's binary_logloss: 0.461038\n", - "[66]\tvalid_0's auc: 0.811433\tvalid_0's binary_logloss: 0.460323\n", - "[67]\tvalid_0's auc: 0.81158\tvalid_0's binary_logloss: 0.459662\n", - "[68]\tvalid_0's auc: 0.811561\tvalid_0's binary_logloss: 0.458988\n", - "[69]\tvalid_0's auc: 0.811748\tvalid_0's binary_logloss: 0.458592\n", - "[70]\tvalid_0's auc: 0.811919\tvalid_0's binary_logloss: 0.457934\n", - "[71]\tvalid_0's auc: 0.812073\tvalid_0's binary_logloss: 0.457508\n", - "[72]\tvalid_0's auc: 0.812273\tvalid_0's binary_logloss: 0.457038\n", - "[73]\tvalid_0's auc: 0.812561\tvalid_0's binary_logloss: 0.456439\n", - "[74]\tvalid_0's auc: 0.812633\tvalid_0's binary_logloss: 0.455789\n", - "[75]\tvalid_0's auc: 0.812757\tvalid_0's binary_logloss: 0.455173\n", - "[76]\tvalid_0's auc: 0.812923\tvalid_0's binary_logloss: 0.454533\n", - "[77]\tvalid_0's auc: 0.81295\tvalid_0's binary_logloss: 0.45392\n", - "[78]\tvalid_0's auc: 0.813073\tvalid_0's binary_logloss: 0.453517\n", - "[79]\tvalid_0's auc: 0.813202\tvalid_0's binary_logloss: 0.452932\n", - "[80]\tvalid_0's auc: 0.813611\tvalid_0's binary_logloss: 0.452285\n", - "[81]\tvalid_0's auc: 0.813769\tvalid_0's binary_logloss: 0.45191\n", - "[82]\tvalid_0's auc: 0.814468\tvalid_0's binary_logloss: 0.451455\n", - "[83]\tvalid_0's auc: 0.814656\tvalid_0's binary_logloss: 0.450885\n", - "[84]\tvalid_0's auc: 0.814755\tvalid_0's binary_logloss: 0.450308\n", - "[85]\tvalid_0's auc: 0.814824\tvalid_0's 
binary_logloss: 0.449739\n", - "[86]\tvalid_0's auc: 0.81499\tvalid_0's binary_logloss: 0.449348\n", - "[87]\tvalid_0's auc: 0.815232\tvalid_0's binary_logloss: 0.448759\n", - "[88]\tvalid_0's auc: 0.815452\tvalid_0's binary_logloss: 0.44823\n", - "[89]\tvalid_0's auc: 0.815593\tvalid_0's binary_logloss: 0.447861\n", - "[90]\tvalid_0's auc: 0.815591\tvalid_0's binary_logloss: 0.447323\n", - "[91]\tvalid_0's auc: 0.815672\tvalid_0's binary_logloss: 0.446796\n", - "[92]\tvalid_0's auc: 0.815875\tvalid_0's binary_logloss: 0.446472\n", - "[93]\tvalid_0's auc: 0.815984\tvalid_0's binary_logloss: 0.445961\n", - "[94]\tvalid_0's auc: 0.816026\tvalid_0's binary_logloss: 0.445439\n", - "[95]\tvalid_0's auc: 0.816172\tvalid_0's binary_logloss: 0.444909\n", - "[96]\tvalid_0's auc: 0.816321\tvalid_0's binary_logloss: 0.444413\n", - "[97]\tvalid_0's auc: 0.816751\tvalid_0's binary_logloss: 0.44405\n", - "[98]\tvalid_0's auc: 0.817226\tvalid_0's binary_logloss: 0.443626\n", - "[99]\tvalid_0's auc: 0.817286\tvalid_0's binary_logloss: 0.443136\n", - "[100]\tvalid_0's auc: 0.817391\tvalid_0's binary_logloss: 0.442854\n", - "Did not meet early stopping. 
Best iteration is:\n", - "[100]\tvalid_0's auc: 0.817391\tvalid_0's binary_logloss: 0.442854\n", - "[1]\tvalid_0's auc: 0.771584\tvalid_0's binary_logloss: 0.527139\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's auc: 0.775446\tvalid_0's binary_logloss: 0.525462\n", - "[3]\tvalid_0's auc: 0.790092\tvalid_0's binary_logloss: 0.524461\n", - "[4]\tvalid_0's auc: 0.791432\tvalid_0's binary_logloss: 0.523322\n", - "[5]\tvalid_0's auc: 0.797482\tvalid_0's binary_logloss: 0.521614\n", - "[6]\tvalid_0's auc: 0.79893\tvalid_0's binary_logloss: 0.520007\n", - "[7]\tvalid_0's auc: 0.800753\tvalid_0's binary_logloss: 0.5187\n", - "[8]\tvalid_0's auc: 0.802197\tvalid_0's binary_logloss: 0.517125\n", - "[9]\tvalid_0's auc: 0.802828\tvalid_0's binary_logloss: 0.516269\n", - "[10]\tvalid_0's auc: 0.803496\tvalid_0's binary_logloss: 0.51474\n", - "[11]\tvalid_0's auc: 0.804972\tvalid_0's binary_logloss: 0.513321\n", - "[12]\tvalid_0's auc: 0.804995\tvalid_0's binary_logloss: 0.512334\n", - "[13]\tvalid_0's auc: 0.80525\tvalid_0's binary_logloss: 0.51151\n", - "[14]\tvalid_0's auc: 0.805026\tvalid_0's binary_logloss: 0.510149\n", - "[15]\tvalid_0's auc: 0.805622\tvalid_0's binary_logloss: 0.508708\n", - "[16]\tvalid_0's auc: 0.806974\tvalid_0's binary_logloss: 0.507384\n", - "[17]\tvalid_0's auc: 0.807045\tvalid_0's binary_logloss: 0.506017\n", - "[18]\tvalid_0's auc: 0.807265\tvalid_0's binary_logloss: 0.504853\n", - "[19]\tvalid_0's auc: 0.807126\tvalid_0's binary_logloss: 0.503972\n", - "[20]\tvalid_0's auc: 0.806948\tvalid_0's binary_logloss: 0.502693\n", - "[21]\tvalid_0's auc: 0.807315\tvalid_0's binary_logloss: 0.502166\n", - "[22]\tvalid_0's auc: 0.808067\tvalid_0's binary_logloss: 0.500948\n", - "[23]\tvalid_0's auc: 0.808226\tvalid_0's binary_logloss: 0.49987\n", - "[24]\tvalid_0's auc: 0.808268\tvalid_0's binary_logloss: 0.498623\n", - "[25]\tvalid_0's auc: 0.808569\tvalid_0's binary_logloss: 0.497389\n", - "[26]\tvalid_0's auc: 
0.809069\tvalid_0's binary_logloss: 0.49624\n", - "[27]\tvalid_0's auc: 0.809312\tvalid_0's binary_logloss: 0.495095\n", - "[28]\tvalid_0's auc: 0.809549\tvalid_0's binary_logloss: 0.494012\n", - "[29]\tvalid_0's auc: 0.809944\tvalid_0's binary_logloss: 0.492834\n", - "[30]\tvalid_0's auc: 0.810047\tvalid_0's binary_logloss: 0.491735\n", - "[31]\tvalid_0's auc: 0.810086\tvalid_0's binary_logloss: 0.490633\n" - ] + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:14.126608Z", + "start_time": "2020-11-18T04:21:13.493653Z" + } + }, + "outputs": [], + "source": [ + "# 排序模型分组\n", + "trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)\n", + "g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", + "\n", + "if offline:\n", + " val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)\n", + " g_val = val_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()[\"label\"].values" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[32]\tvalid_0's auc: 0.810566\tvalid_0's binary_logloss: 0.489595\n", - "[33]\tvalid_0's auc: 0.810539\tvalid_0's binary_logloss: 0.488536\n", - "[34]\tvalid_0's auc: 0.810529\tvalid_0's binary_logloss: 0.487489\n", - "[35]\tvalid_0's auc: 0.810932\tvalid_0's binary_logloss: 0.486775\n", - "[36]\tvalid_0's auc: 0.810769\tvalid_0's binary_logloss: 0.48577\n", - "[37]\tvalid_0's auc: 0.811363\tvalid_0's binary_logloss: 0.485123\n", - "[38]\tvalid_0's auc: 0.811801\tvalid_0's binary_logloss: 0.484413\n", - "[39]\tvalid_0's auc: 0.811987\tvalid_0's binary_logloss: 0.483371\n", - "[40]\tvalid_0's auc: 0.812268\tvalid_0's binary_logloss: 0.482407\n", - "[41]\tvalid_0's auc: 0.813297\tvalid_0's binary_logloss: 0.481742\n", - "[42]\tvalid_0's auc: 0.813453\tvalid_0's binary_logloss: 0.481108\n", - "[43]\tvalid_0's auc: 0.813603\tvalid_0's binary_logloss: 
0.480163\n", - "[44]\tvalid_0's auc: 0.813654\tvalid_0's binary_logloss: 0.479239\n", - "[45]\tvalid_0's auc: 0.814267\tvalid_0's binary_logloss: 0.478299\n", - "[46]\tvalid_0's auc: 0.81455\tvalid_0's binary_logloss: 0.477678\n", - "[47]\tvalid_0's auc: 0.81452\tvalid_0's binary_logloss: 0.476766\n", - "[48]\tvalid_0's auc: 0.814925\tvalid_0's binary_logloss: 0.475815\n", - "[49]\tvalid_0's auc: 0.814907\tvalid_0's binary_logloss: 0.47503\n", - "[50]\tvalid_0's auc: 0.815278\tvalid_0's binary_logloss: 0.474588\n", - "[51]\tvalid_0's auc: 0.815535\tvalid_0's binary_logloss: 0.474171\n", - "[52]\tvalid_0's auc: 0.815685\tvalid_0's binary_logloss: 0.473335\n", - "[53]\tvalid_0's auc: 0.815787\tvalid_0's binary_logloss: 0.472509\n", - "[54]\tvalid_0's auc: 0.815827\tvalid_0's binary_logloss: 0.471686\n", - "[55]\tvalid_0's auc: 0.815871\tvalid_0's binary_logloss: 0.470838\n", - "[56]\tvalid_0's auc: 0.816238\tvalid_0's binary_logloss: 0.470285\n", - "[57]\tvalid_0's auc: 0.816269\tvalid_0's binary_logloss: 0.469495\n", - "[58]\tvalid_0's auc: 0.816528\tvalid_0's binary_logloss: 0.468654\n", - "[59]\tvalid_0's auc: 0.816706\tvalid_0's binary_logloss: 0.468122\n", - "[60]\tvalid_0's auc: 0.816821\tvalid_0's binary_logloss: 0.467352\n", - "[61]\tvalid_0's auc: 0.816759\tvalid_0's binary_logloss: 0.466622\n", - "[62]\tvalid_0's auc: 0.81682\tvalid_0's binary_logloss: 0.465867\n", - "[63]\tvalid_0's auc: 0.817251\tvalid_0's binary_logloss: 0.465112\n", - "[64]\tvalid_0's auc: 0.817476\tvalid_0's binary_logloss: 0.464589\n", - "[65]\tvalid_0's auc: 0.817613\tvalid_0's binary_logloss: 0.463831\n", - "[66]\tvalid_0's auc: 0.817648\tvalid_0's binary_logloss: 0.463098\n", - "[67]\tvalid_0's auc: 0.817719\tvalid_0's binary_logloss: 0.462414\n", - "[68]\tvalid_0's auc: 0.817814\tvalid_0's binary_logloss: 0.461727\n", - "[69]\tvalid_0's auc: 0.817973\tvalid_0's binary_logloss: 0.461329\n", - "[70]\tvalid_0's auc: 0.818108\tvalid_0's binary_logloss: 0.460674\n", - "[71]\tvalid_0's 
auc: 0.818347\tvalid_0's binary_logloss: 0.460222\n", - "[72]\tvalid_0's auc: 0.818456\tvalid_0's binary_logloss: 0.45977\n", - "[73]\tvalid_0's auc: 0.818727\tvalid_0's binary_logloss: 0.459157\n", - "[74]\tvalid_0's auc: 0.818988\tvalid_0's binary_logloss: 0.458437\n", - "[75]\tvalid_0's auc: 0.819144\tvalid_0's binary_logloss: 0.457808\n", - "[76]\tvalid_0's auc: 0.819259\tvalid_0's binary_logloss: 0.457159\n", - "[77]\tvalid_0's auc: 0.819343\tvalid_0's binary_logloss: 0.456512\n", - "[78]\tvalid_0's auc: 0.81954\tvalid_0's binary_logloss: 0.456045\n", - "[79]\tvalid_0's auc: 0.819687\tvalid_0's binary_logloss: 0.455416\n", - "[80]\tvalid_0's auc: 0.819958\tvalid_0's binary_logloss: 0.454765\n", - "[81]\tvalid_0's auc: 0.820115\tvalid_0's binary_logloss: 0.45436\n", - "[82]\tvalid_0's auc: 0.820536\tvalid_0's binary_logloss: 0.453965\n", - "[83]\tvalid_0's auc: 0.820649\tvalid_0's binary_logloss: 0.453383\n", - "[84]\tvalid_0's auc: 0.820663\tvalid_0's binary_logloss: 0.452804\n", - "[85]\tvalid_0's auc: 0.820809\tvalid_0's binary_logloss: 0.452167\n", - "[86]\tvalid_0's auc: 0.821024\tvalid_0's binary_logloss: 0.451735\n", - "[87]\tvalid_0's auc: 0.821124\tvalid_0's binary_logloss: 0.451167\n", - "[88]\tvalid_0's auc: 0.821243\tvalid_0's binary_logloss: 0.45061\n", - "[89]\tvalid_0's auc: 0.821404\tvalid_0's binary_logloss: 0.450215\n", - "[90]\tvalid_0's auc: 0.821488\tvalid_0's binary_logloss: 0.449656\n", - "[91]\tvalid_0's auc: 0.821538\tvalid_0's binary_logloss: 0.449107\n", - "[92]\tvalid_0's auc: 0.82172\tvalid_0's binary_logloss: 0.448752\n", - "[93]\tvalid_0's auc: 0.821809\tvalid_0's binary_logloss: 0.448188\n", - "[94]\tvalid_0's auc: 0.82184\tvalid_0's binary_logloss: 0.447659\n", - "[95]\tvalid_0's auc: 0.821971\tvalid_0's binary_logloss: 0.447108\n", - "[96]\tvalid_0's auc: 0.822086\tvalid_0's binary_logloss: 0.446596\n", - "[97]\tvalid_0's auc: 0.82247\tvalid_0's binary_logloss: 0.446244\n", - "[98]\tvalid_0's auc: 0.822951\tvalid_0's 
binary_logloss: 0.445812\n", - "[99]\tvalid_0's auc: 0.822991\tvalid_0's binary_logloss: 0.445329\n", - "[100]\tvalid_0's auc: 0.823174\tvalid_0's binary_logloss: 0.445037\n", - "Did not meet early stopping. Best iteration is:\n", - "[100]\tvalid_0's auc: 0.823174\tvalid_0's binary_logloss: 0.445037\n", - "[1]\tvalid_0's auc: 0.769525\tvalid_0's binary_logloss: 0.526256\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's auc: 0.775857\tvalid_0's binary_logloss: 0.524594\n", - "[3]\tvalid_0's auc: 0.785307\tvalid_0's binary_logloss: 0.523606\n", - "[4]\tvalid_0's auc: 0.786356\tvalid_0's binary_logloss: 0.522495\n", - "[5]\tvalid_0's auc: 0.793385\tvalid_0's binary_logloss: 0.520812\n", - "[6]\tvalid_0's auc: 0.794014\tvalid_0's binary_logloss: 0.519253\n", - "[7]\tvalid_0's auc: 0.795454\tvalid_0's binary_logloss: 0.517961\n", - "[8]\tvalid_0's auc: 0.79807\tvalid_0's binary_logloss: 0.516363\n", - "[9]\tvalid_0's auc: 0.798756\tvalid_0's binary_logloss: 0.51548\n", - "[10]\tvalid_0's auc: 0.798314\tvalid_0's binary_logloss: 0.514021\n", - "[11]\tvalid_0's auc: 0.799343\tvalid_0's binary_logloss: 0.512678\n", - "[12]\tvalid_0's auc: 0.799573\tvalid_0's binary_logloss: 0.511708\n", - "[13]\tvalid_0's auc: 0.799563\tvalid_0's binary_logloss: 0.510892\n", - "[14]\tvalid_0's auc: 0.800333\tvalid_0's binary_logloss: 0.509532\n", - "[15]\tvalid_0's auc: 0.800672\tvalid_0's binary_logloss: 0.508117\n", - "[16]\tvalid_0's auc: 0.801953\tvalid_0's binary_logloss: 0.506866\n", - "[17]\tvalid_0's auc: 0.802078\tvalid_0's binary_logloss: 0.5055\n", - "[18]\tvalid_0's auc: 0.802449\tvalid_0's binary_logloss: 0.504358\n", - "[19]\tvalid_0's auc: 0.802329\tvalid_0's binary_logloss: 0.503503\n", - "[20]\tvalid_0's auc: 0.802437\tvalid_0's binary_logloss: 0.502233\n", - "[21]\tvalid_0's auc: 0.802653\tvalid_0's binary_logloss: 0.50174\n", - "[22]\tvalid_0's auc: 0.803753\tvalid_0's binary_logloss: 0.50056\n", - "[23]\tvalid_0's auc: 
0.803956\tvalid_0's binary_logloss: 0.499496\n", - "[24]\tvalid_0's auc: 0.804231\tvalid_0's binary_logloss: 0.498283\n", - "[25]\tvalid_0's auc: 0.804554\tvalid_0's binary_logloss: 0.497059\n", - "[26]\tvalid_0's auc: 0.805133\tvalid_0's binary_logloss: 0.495963\n", - "[27]\tvalid_0's auc: 0.805333\tvalid_0's binary_logloss: 0.494842\n", - "[28]\tvalid_0's auc: 0.805644\tvalid_0's binary_logloss: 0.493771\n", - "[29]\tvalid_0's auc: 0.806029\tvalid_0's binary_logloss: 0.492598\n", - "[30]\tvalid_0's auc: 0.806321\tvalid_0's binary_logloss: 0.491474\n", - "[31]\tvalid_0's auc: 0.806201\tvalid_0's binary_logloss: 0.490419\n", - "[32]\tvalid_0's auc: 0.806671\tvalid_0's binary_logloss: 0.489393\n", - "[33]\tvalid_0's auc: 0.806899\tvalid_0's binary_logloss: 0.488331\n", - "[34]\tvalid_0's auc: 0.807105\tvalid_0's binary_logloss: 0.487277\n", - "[35]\tvalid_0's auc: 0.807257\tvalid_0's binary_logloss: 0.486592\n", - "[36]\tvalid_0's auc: 0.80729\tvalid_0's binary_logloss: 0.485607\n", - "[37]\tvalid_0's auc: 0.807752\tvalid_0's binary_logloss: 0.484951\n", - "[38]\tvalid_0's auc: 0.808191\tvalid_0's binary_logloss: 0.484269\n", - "[39]\tvalid_0's auc: 0.808417\tvalid_0's binary_logloss: 0.483242\n", - "[40]\tvalid_0's auc: 0.808761\tvalid_0's binary_logloss: 0.482291\n", - "[41]\tvalid_0's auc: 0.80965\tvalid_0's binary_logloss: 0.48164\n", - "[42]\tvalid_0's auc: 0.810065\tvalid_0's binary_logloss: 0.480962\n", - "[43]\tvalid_0's auc: 0.810209\tvalid_0's binary_logloss: 0.479995\n", - "[44]\tvalid_0's auc: 0.810091\tvalid_0's binary_logloss: 0.479077\n", - "[45]\tvalid_0's auc: 0.810573\tvalid_0's binary_logloss: 0.478185\n", - "[46]\tvalid_0's auc: 0.810924\tvalid_0's binary_logloss: 0.477558\n", - "[47]\tvalid_0's auc: 0.810951\tvalid_0's binary_logloss: 0.476662\n", - "[48]\tvalid_0's auc: 0.811101\tvalid_0's binary_logloss: 0.475745\n", - "[49]\tvalid_0's auc: 0.811269\tvalid_0's binary_logloss: 0.474951\n", - "[50]\tvalid_0's auc: 0.81173\tvalid_0's 
binary_logloss: 0.474514\n", - "[51]\tvalid_0's auc: 0.811937\tvalid_0's binary_logloss: 0.474114\n", - "[52]\tvalid_0's auc: 0.812136\tvalid_0's binary_logloss: 0.473297\n", - "[53]\tvalid_0's auc: 0.812249\tvalid_0's binary_logloss: 0.472497\n", - "[54]\tvalid_0's auc: 0.812121\tvalid_0's binary_logloss: 0.471696\n", - "[55]\tvalid_0's auc: 0.812164\tvalid_0's binary_logloss: 0.470905\n", - "[56]\tvalid_0's auc: 0.812462\tvalid_0's binary_logloss: 0.470384\n", - "[57]\tvalid_0's auc: 0.812613\tvalid_0's binary_logloss: 0.4696\n", - "[58]\tvalid_0's auc: 0.812615\tvalid_0's binary_logloss: 0.468778\n", - "[59]\tvalid_0's auc: 0.812842\tvalid_0's binary_logloss: 0.468211\n", - "[60]\tvalid_0's auc: 0.81312\tvalid_0's binary_logloss: 0.467385\n", - "[61]\tvalid_0's auc: 0.813039\tvalid_0's binary_logloss: 0.466632\n", - "[62]\tvalid_0's auc: 0.812942\tvalid_0's binary_logloss: 0.465933\n", - "[63]\tvalid_0's auc: 0.813274\tvalid_0's binary_logloss: 0.465214\n", - "[64]\tvalid_0's auc: 0.813572\tvalid_0's binary_logloss: 0.464692\n", - "[65]\tvalid_0's auc: 0.813594\tvalid_0's binary_logloss: 0.463925\n", - "[66]\tvalid_0's auc: 0.813719\tvalid_0's binary_logloss: 0.463177\n", - "[67]\tvalid_0's auc: 0.814011\tvalid_0's binary_logloss: 0.462513\n", - "[68]\tvalid_0's auc: 0.813989\tvalid_0's binary_logloss: 0.461843\n" - ] + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:16.136151Z", + "start_time": "2020-11-18T04:21:16.124444Z" + } + }, + "outputs": [], + "source": [ + "# 排序模型定义\n", + "lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", + " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", + " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16) " + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[69]\tvalid_0's auc: 0.814218\tvalid_0's binary_logloss: 0.461443\n", - 
"[70]\tvalid_0's auc: 0.814334\tvalid_0's binary_logloss: 0.460775\n", - "[71]\tvalid_0's auc: 0.814493\tvalid_0's binary_logloss: 0.460332\n", - "[72]\tvalid_0's auc: 0.814663\tvalid_0's binary_logloss: 0.459867\n", - "[73]\tvalid_0's auc: 0.814856\tvalid_0's binary_logloss: 0.459266\n", - "[74]\tvalid_0's auc: 0.815017\tvalid_0's binary_logloss: 0.458585\n", - "[75]\tvalid_0's auc: 0.815186\tvalid_0's binary_logloss: 0.457958\n", - "[76]\tvalid_0's auc: 0.815374\tvalid_0's binary_logloss: 0.457316\n", - "[77]\tvalid_0's auc: 0.81554\tvalid_0's binary_logloss: 0.45665\n", - "[78]\tvalid_0's auc: 0.81569\tvalid_0's binary_logloss: 0.456217\n", - "[79]\tvalid_0's auc: 0.815861\tvalid_0's binary_logloss: 0.455615\n", - "[80]\tvalid_0's auc: 0.816443\tvalid_0's binary_logloss: 0.454895\n", - "[81]\tvalid_0's auc: 0.816659\tvalid_0's binary_logloss: 0.454503\n", - "[82]\tvalid_0's auc: 0.817017\tvalid_0's binary_logloss: 0.454149\n", - "[83]\tvalid_0's auc: 0.817162\tvalid_0's binary_logloss: 0.453578\n", - "[84]\tvalid_0's auc: 0.817274\tvalid_0's binary_logloss: 0.452984\n", - "[85]\tvalid_0's auc: 0.817283\tvalid_0's binary_logloss: 0.452416\n", - "[86]\tvalid_0's auc: 0.817339\tvalid_0's binary_logloss: 0.452022\n", - "[87]\tvalid_0's auc: 0.817494\tvalid_0's binary_logloss: 0.45146\n", - "[88]\tvalid_0's auc: 0.817594\tvalid_0's binary_logloss: 0.450926\n", - "[89]\tvalid_0's auc: 0.817771\tvalid_0's binary_logloss: 0.450553\n", - "[90]\tvalid_0's auc: 0.81789\tvalid_0's binary_logloss: 0.449985\n", - "[91]\tvalid_0's auc: 0.817931\tvalid_0's binary_logloss: 0.449439\n", - "[92]\tvalid_0's auc: 0.818138\tvalid_0's binary_logloss: 0.449094\n", - "[93]\tvalid_0's auc: 0.818334\tvalid_0's binary_logloss: 0.448527\n", - "[94]\tvalid_0's auc: 0.818426\tvalid_0's binary_logloss: 0.447989\n", - "[95]\tvalid_0's auc: 0.818676\tvalid_0's binary_logloss: 0.447407\n", - "[96]\tvalid_0's auc: 0.818852\tvalid_0's binary_logloss: 0.446884\n", - "[97]\tvalid_0's auc: 
0.81945\tvalid_0's binary_logloss: 0.446455\n", - "[98]\tvalid_0's auc: 0.819861\tvalid_0's binary_logloss: 0.446045\n", - "[99]\tvalid_0's auc: 0.819943\tvalid_0's binary_logloss: 0.445543\n", - "[100]\tvalid_0's auc: 0.820076\tvalid_0's binary_logloss: 0.445258\n", - "Did not meet early stopping. Best iteration is:\n", - "[100]\tvalid_0's auc: 0.820076\tvalid_0's binary_logloss: 0.445258\n", - "[1]\tvalid_0's auc: 0.770032\tvalid_0's binary_logloss: 0.527241\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's auc: 0.779881\tvalid_0's binary_logloss: 0.525545\n", - "[3]\tvalid_0's auc: 0.791308\tvalid_0's binary_logloss: 0.524508\n", - "[4]\tvalid_0's auc: 0.790788\tvalid_0's binary_logloss: 0.52341\n", - "[5]\tvalid_0's auc: 0.795645\tvalid_0's binary_logloss: 0.521753\n", - "[6]\tvalid_0's auc: 0.797745\tvalid_0's binary_logloss: 0.520131\n", - "[7]\tvalid_0's auc: 0.79931\tvalid_0's binary_logloss: 0.518872\n", - "[8]\tvalid_0's auc: 0.800014\tvalid_0's binary_logloss: 0.517353\n", - "[9]\tvalid_0's auc: 0.800549\tvalid_0's binary_logloss: 0.516487\n", - "[10]\tvalid_0's auc: 0.800261\tvalid_0's binary_logloss: 0.515039\n", - "[11]\tvalid_0's auc: 0.801261\tvalid_0's binary_logloss: 0.513695\n", - "[12]\tvalid_0's auc: 0.801062\tvalid_0's binary_logloss: 0.512735\n", - "[13]\tvalid_0's auc: 0.801155\tvalid_0's binary_logloss: 0.51192\n", - "[14]\tvalid_0's auc: 0.801315\tvalid_0's binary_logloss: 0.510559\n", - "[15]\tvalid_0's auc: 0.80185\tvalid_0's binary_logloss: 0.509147\n", - "[16]\tvalid_0's auc: 0.803029\tvalid_0's binary_logloss: 0.507914\n", - "[17]\tvalid_0's auc: 0.803035\tvalid_0's binary_logloss: 0.506583\n", - "[18]\tvalid_0's auc: 0.803433\tvalid_0's binary_logloss: 0.505441\n", - "[19]\tvalid_0's auc: 0.803717\tvalid_0's binary_logloss: 0.504599\n", - "[20]\tvalid_0's auc: 0.803819\tvalid_0's binary_logloss: 0.503327\n", - "[21]\tvalid_0's auc: 0.803923\tvalid_0's binary_logloss: 0.502782\n", - 
"[22]\tvalid_0's auc: 0.804939\tvalid_0's binary_logloss: 0.501596\n", - "[23]\tvalid_0's auc: 0.804707\tvalid_0's binary_logloss: 0.500572\n", - "[24]\tvalid_0's auc: 0.804632\tvalid_0's binary_logloss: 0.499367\n", - "[25]\tvalid_0's auc: 0.804756\tvalid_0's binary_logloss: 0.498161\n", - "[26]\tvalid_0's auc: 0.805067\tvalid_0's binary_logloss: 0.497061\n", - "[27]\tvalid_0's auc: 0.805119\tvalid_0's binary_logloss: 0.495933\n", - "[28]\tvalid_0's auc: 0.805304\tvalid_0's binary_logloss: 0.494849\n", - "[29]\tvalid_0's auc: 0.805688\tvalid_0's binary_logloss: 0.493677\n", - "[30]\tvalid_0's auc: 0.805822\tvalid_0's binary_logloss: 0.492594\n", - "[31]\tvalid_0's auc: 0.805869\tvalid_0's binary_logloss: 0.49152\n", - "[32]\tvalid_0's auc: 0.807267\tvalid_0's binary_logloss: 0.490435\n", - "[33]\tvalid_0's auc: 0.807301\tvalid_0's binary_logloss: 0.489392\n", - "[34]\tvalid_0's auc: 0.80736\tvalid_0's binary_logloss: 0.488325\n", - "[35]\tvalid_0's auc: 0.807706\tvalid_0's binary_logloss: 0.487654\n", - "[36]\tvalid_0's auc: 0.807758\tvalid_0's binary_logloss: 0.486651\n", - "[37]\tvalid_0's auc: 0.808051\tvalid_0's binary_logloss: 0.486012\n", - "[38]\tvalid_0's auc: 0.808429\tvalid_0's binary_logloss: 0.485355\n", - "[39]\tvalid_0's auc: 0.808663\tvalid_0's binary_logloss: 0.484327\n", - "[40]\tvalid_0's auc: 0.809007\tvalid_0's binary_logloss: 0.483386\n", - "[41]\tvalid_0's auc: 0.809781\tvalid_0's binary_logloss: 0.482745\n", - "[42]\tvalid_0's auc: 0.810071\tvalid_0's binary_logloss: 0.482124\n", - "[43]\tvalid_0's auc: 0.810383\tvalid_0's binary_logloss: 0.481154\n", - "[44]\tvalid_0's auc: 0.810446\tvalid_0's binary_logloss: 0.480243\n", - "[45]\tvalid_0's auc: 0.811148\tvalid_0's binary_logloss: 0.479261\n", - "[46]\tvalid_0's auc: 0.811245\tvalid_0's binary_logloss: 0.478687\n", - "[47]\tvalid_0's auc: 0.811214\tvalid_0's binary_logloss: 0.477812\n", - "[48]\tvalid_0's auc: 0.811408\tvalid_0's binary_logloss: 0.47689\n", - "[49]\tvalid_0's auc: 
0.811486\tvalid_0's binary_logloss: 0.476132\n", - "[50]\tvalid_0's auc: 0.811806\tvalid_0's binary_logloss: 0.475718\n", - "[51]\tvalid_0's auc: 0.812017\tvalid_0's binary_logloss: 0.475342\n", - "[52]\tvalid_0's auc: 0.812255\tvalid_0's binary_logloss: 0.474505\n", - "[53]\tvalid_0's auc: 0.812249\tvalid_0's binary_logloss: 0.473707\n", - "[54]\tvalid_0's auc: 0.812235\tvalid_0's binary_logloss: 0.47289\n", - "[55]\tvalid_0's auc: 0.812233\tvalid_0's binary_logloss: 0.472091\n", - "[56]\tvalid_0's auc: 0.812492\tvalid_0's binary_logloss: 0.471563\n", - "[57]\tvalid_0's auc: 0.812579\tvalid_0's binary_logloss: 0.47077\n", - "[58]\tvalid_0's auc: 0.812598\tvalid_0's binary_logloss: 0.469992\n", - "[59]\tvalid_0's auc: 0.812885\tvalid_0's binary_logloss: 0.469458\n", - "[60]\tvalid_0's auc: 0.812995\tvalid_0's binary_logloss: 0.468676\n", - "[61]\tvalid_0's auc: 0.812961\tvalid_0's binary_logloss: 0.467939\n", - "[62]\tvalid_0's auc: 0.812919\tvalid_0's binary_logloss: 0.467232\n", - "[63]\tvalid_0's auc: 0.813291\tvalid_0's binary_logloss: 0.466491\n", - "[64]\tvalid_0's auc: 0.813702\tvalid_0's binary_logloss: 0.465945\n", - "[65]\tvalid_0's auc: 0.813803\tvalid_0's binary_logloss: 0.465197\n", - "[66]\tvalid_0's auc: 0.813851\tvalid_0's binary_logloss: 0.4645\n", - "[67]\tvalid_0's auc: 0.814011\tvalid_0's binary_logloss: 0.463814\n", - "[68]\tvalid_0's auc: 0.814027\tvalid_0's binary_logloss: 0.463113\n", - "[69]\tvalid_0's auc: 0.814138\tvalid_0's binary_logloss: 0.462727\n", - "[70]\tvalid_0's auc: 0.814365\tvalid_0's binary_logloss: 0.462077\n", - "[71]\tvalid_0's auc: 0.814432\tvalid_0's binary_logloss: 0.461655\n", - "[72]\tvalid_0's auc: 0.8146\tvalid_0's binary_logloss: 0.461194\n", - "[73]\tvalid_0's auc: 0.815324\tvalid_0's binary_logloss: 0.460477\n", - "[74]\tvalid_0's auc: 0.815411\tvalid_0's binary_logloss: 0.459805\n", - "[75]\tvalid_0's auc: 0.815548\tvalid_0's binary_logloss: 0.459189\n", - "[76]\tvalid_0's auc: 0.815625\tvalid_0's 
binary_logloss: 0.458525\n", - "[77]\tvalid_0's auc: 0.81562\tvalid_0's binary_logloss: 0.457905\n", - "[78]\tvalid_0's auc: 0.815786\tvalid_0's binary_logloss: 0.45747\n", - "[79]\tvalid_0's auc: 0.815834\tvalid_0's binary_logloss: 0.456884\n", - "[80]\tvalid_0's auc: 0.816475\tvalid_0's binary_logloss: 0.45617\n", - "[81]\tvalid_0's auc: 0.816677\tvalid_0's binary_logloss: 0.455787\n", - "[82]\tvalid_0's auc: 0.817255\tvalid_0's binary_logloss: 0.455358\n", - "[83]\tvalid_0's auc: 0.817383\tvalid_0's binary_logloss: 0.454775\n", - "[84]\tvalid_0's auc: 0.817509\tvalid_0's binary_logloss: 0.454176\n", - "[85]\tvalid_0's auc: 0.817572\tvalid_0's binary_logloss: 0.453609\n", - "[86]\tvalid_0's auc: 0.817721\tvalid_0's binary_logloss: 0.453213\n", - "[87]\tvalid_0's auc: 0.817992\tvalid_0's binary_logloss: 0.452586\n", - "[88]\tvalid_0's auc: 0.81808\tvalid_0's binary_logloss: 0.45204\n", - "[89]\tvalid_0's auc: 0.818202\tvalid_0's binary_logloss: 0.451643\n", - "[90]\tvalid_0's auc: 0.818336\tvalid_0's binary_logloss: 0.451081\n", - "[91]\tvalid_0's auc: 0.818347\tvalid_0's binary_logloss: 0.450531\n", - "[92]\tvalid_0's auc: 0.818558\tvalid_0's binary_logloss: 0.450179\n", - "[93]\tvalid_0's auc: 0.818743\tvalid_0's binary_logloss: 0.449647\n", - "[94]\tvalid_0's auc: 0.818789\tvalid_0's binary_logloss: 0.449133\n", - "[95]\tvalid_0's auc: 0.818849\tvalid_0's binary_logloss: 0.44862\n", - "[96]\tvalid_0's auc: 0.81913\tvalid_0's binary_logloss: 0.448072\n", - "[97]\tvalid_0's auc: 0.819526\tvalid_0's binary_logloss: 0.447713\n", - "[98]\tvalid_0's auc: 0.819971\tvalid_0's binary_logloss: 0.447296\n", - "[99]\tvalid_0's auc: 0.819972\tvalid_0's binary_logloss: 0.446814\n" - ] + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:22.965433Z", + "start_time": "2020-11-18T04:21:17.799127Z" + } + }, + "outputs": [], + "source": [ + "# 排序模型训练\n", + "if offline:\n", + " 
lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,\n", + " eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], \n", + " eval_group= [g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )\n", + "else:\n", + " lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[100]\tvalid_0's auc: 0.820086\tvalid_0's binary_logloss: 0.446533\n", - "Did not meet early stopping. Best iteration is:\n", - "[100]\tvalid_0's auc: 0.820086\tvalid_0's binary_logloss: 0.446533\n", - "[1]\tvalid_0's auc: 0.768646\tvalid_0's binary_logloss: 0.527167\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's auc: 0.779902\tvalid_0's binary_logloss: 0.525481\n", - "[3]\tvalid_0's auc: 0.789868\tvalid_0's binary_logloss: 0.524485\n", - "[4]\tvalid_0's auc: 0.791895\tvalid_0's binary_logloss: 0.523382\n", - "[5]\tvalid_0's auc: 0.795453\tvalid_0's binary_logloss: 0.521759\n", - "[6]\tvalid_0's auc: 0.796672\tvalid_0's binary_logloss: 0.520166\n", - "[7]\tvalid_0's auc: 0.798023\tvalid_0's binary_logloss: 0.518857\n", - "[8]\tvalid_0's auc: 0.799331\tvalid_0's binary_logloss: 0.517297\n", - "[9]\tvalid_0's auc: 0.800181\tvalid_0's binary_logloss: 0.516416\n", - "[10]\tvalid_0's auc: 0.800373\tvalid_0's binary_logloss: 0.514967\n", - "[11]\tvalid_0's auc: 0.801087\tvalid_0's binary_logloss: 0.513631\n", - "[12]\tvalid_0's auc: 0.801122\tvalid_0's binary_logloss: 0.512658\n", - "[13]\tvalid_0's auc: 0.801043\tvalid_0's binary_logloss: 0.511833\n", - "[14]\tvalid_0's auc: 0.801238\tvalid_0's binary_logloss: 0.510461\n", - "[15]\tvalid_0's auc: 0.801847\tvalid_0's binary_logloss: 0.509034\n", - "[16]\tvalid_0's auc: 0.803139\tvalid_0's binary_logloss: 0.507759\n", - "[17]\tvalid_0's auc: 0.803577\tvalid_0's 
binary_logloss: 0.506361\n", - "[18]\tvalid_0's auc: 0.803834\tvalid_0's binary_logloss: 0.505229\n", - "[19]\tvalid_0's auc: 0.803943\tvalid_0's binary_logloss: 0.504371\n", - "[20]\tvalid_0's auc: 0.80415\tvalid_0's binary_logloss: 0.503102\n", - "[21]\tvalid_0's auc: 0.804446\tvalid_0's binary_logloss: 0.502564\n", - "[22]\tvalid_0's auc: 0.805163\tvalid_0's binary_logloss: 0.501396\n", - "[23]\tvalid_0's auc: 0.805323\tvalid_0's binary_logloss: 0.500327\n", - "[24]\tvalid_0's auc: 0.805314\tvalid_0's binary_logloss: 0.499123\n", - "[25]\tvalid_0's auc: 0.80535\tvalid_0's binary_logloss: 0.497927\n", - "[26]\tvalid_0's auc: 0.805864\tvalid_0's binary_logloss: 0.496834\n", - "[27]\tvalid_0's auc: 0.805919\tvalid_0's binary_logloss: 0.495667\n", - "[28]\tvalid_0's auc: 0.806272\tvalid_0's binary_logloss: 0.494606\n", - "[29]\tvalid_0's auc: 0.806599\tvalid_0's binary_logloss: 0.49343\n", - "[30]\tvalid_0's auc: 0.806932\tvalid_0's binary_logloss: 0.492303\n", - "[31]\tvalid_0's auc: 0.806656\tvalid_0's binary_logloss: 0.491249\n", - "[32]\tvalid_0's auc: 0.807436\tvalid_0's binary_logloss: 0.490188\n", - "[33]\tvalid_0's auc: 0.807629\tvalid_0's binary_logloss: 0.489117\n", - "[34]\tvalid_0's auc: 0.807501\tvalid_0's binary_logloss: 0.48808\n", - "[35]\tvalid_0's auc: 0.807885\tvalid_0's binary_logloss: 0.487383\n", - "[36]\tvalid_0's auc: 0.807921\tvalid_0's binary_logloss: 0.48636\n", - "[37]\tvalid_0's auc: 0.808267\tvalid_0's binary_logloss: 0.485724\n", - "[38]\tvalid_0's auc: 0.808563\tvalid_0's binary_logloss: 0.485076\n", - "[39]\tvalid_0's auc: 0.808813\tvalid_0's binary_logloss: 0.484039\n", - "[40]\tvalid_0's auc: 0.809023\tvalid_0's binary_logloss: 0.483091\n", - "[41]\tvalid_0's auc: 0.809782\tvalid_0's binary_logloss: 0.482441\n", - "[42]\tvalid_0's auc: 0.810135\tvalid_0's binary_logloss: 0.48179\n", - "[43]\tvalid_0's auc: 0.810219\tvalid_0's binary_logloss: 0.48082\n", - "[44]\tvalid_0's auc: 0.81031\tvalid_0's binary_logloss: 0.479906\n", - 
"[45]\tvalid_0's auc: 0.810514\tvalid_0's binary_logloss: 0.479024\n", - "[46]\tvalid_0's auc: 0.810566\tvalid_0's binary_logloss: 0.478437\n", - "[47]\tvalid_0's auc: 0.810611\tvalid_0's binary_logloss: 0.477529\n", - "[48]\tvalid_0's auc: 0.810781\tvalid_0's binary_logloss: 0.476637\n", - "[49]\tvalid_0's auc: 0.81089\tvalid_0's binary_logloss: 0.475883\n", - "[50]\tvalid_0's auc: 0.811266\tvalid_0's binary_logloss: 0.475459\n", - "[51]\tvalid_0's auc: 0.811402\tvalid_0's binary_logloss: 0.475078\n", - "[52]\tvalid_0's auc: 0.811765\tvalid_0's binary_logloss: 0.474246\n", - "[53]\tvalid_0's auc: 0.811891\tvalid_0's binary_logloss: 0.473452\n", - "[54]\tvalid_0's auc: 0.811868\tvalid_0's binary_logloss: 0.47263\n", - "[55]\tvalid_0's auc: 0.81192\tvalid_0's binary_logloss: 0.471804\n", - "[56]\tvalid_0's auc: 0.812272\tvalid_0's binary_logloss: 0.471275\n", - "[57]\tvalid_0's auc: 0.812639\tvalid_0's binary_logloss: 0.470396\n", - "[58]\tvalid_0's auc: 0.812764\tvalid_0's binary_logloss: 0.469597\n", - "[59]\tvalid_0's auc: 0.813084\tvalid_0's binary_logloss: 0.469049\n", - "[60]\tvalid_0's auc: 0.813342\tvalid_0's binary_logloss: 0.468244\n", - "[61]\tvalid_0's auc: 0.813302\tvalid_0's binary_logloss: 0.467499\n", - "[62]\tvalid_0's auc: 0.813221\tvalid_0's binary_logloss: 0.466758\n", - "[63]\tvalid_0's auc: 0.813697\tvalid_0's binary_logloss: 0.466017\n", - "[64]\tvalid_0's auc: 0.813985\tvalid_0's binary_logloss: 0.465501\n", - "[65]\tvalid_0's auc: 0.81416\tvalid_0's binary_logloss: 0.464725\n", - "[66]\tvalid_0's auc: 0.814227\tvalid_0's binary_logloss: 0.46398\n", - "[67]\tvalid_0's auc: 0.814397\tvalid_0's binary_logloss: 0.463309\n", - "[68]\tvalid_0's auc: 0.814426\tvalid_0's binary_logloss: 0.462627\n", - "[69]\tvalid_0's auc: 0.814593\tvalid_0's binary_logloss: 0.462244\n", - "[70]\tvalid_0's auc: 0.814789\tvalid_0's binary_logloss: 0.461571\n", - "[71]\tvalid_0's auc: 0.814889\tvalid_0's binary_logloss: 0.461144\n", - "[72]\tvalid_0's auc: 
0.815078\tvalid_0's binary_logloss: 0.460684\n", - "[73]\tvalid_0's auc: 0.815439\tvalid_0's binary_logloss: 0.460063\n", - "[74]\tvalid_0's auc: 0.815511\tvalid_0's binary_logloss: 0.459386\n", - "[75]\tvalid_0's auc: 0.815574\tvalid_0's binary_logloss: 0.45877\n", - "[76]\tvalid_0's auc: 0.815634\tvalid_0's binary_logloss: 0.458128\n", - "[77]\tvalid_0's auc: 0.815618\tvalid_0's binary_logloss: 0.457495\n", - "[78]\tvalid_0's auc: 0.81582\tvalid_0's binary_logloss: 0.457057\n", - "[79]\tvalid_0's auc: 0.81594\tvalid_0's binary_logloss: 0.456475\n", - "[80]\tvalid_0's auc: 0.815961\tvalid_0's binary_logloss: 0.455885\n", - "[81]\tvalid_0's auc: 0.816153\tvalid_0's binary_logloss: 0.455511\n", - "[82]\tvalid_0's auc: 0.816433\tvalid_0's binary_logloss: 0.455186\n", - "[83]\tvalid_0's auc: 0.816546\tvalid_0's binary_logloss: 0.454625\n", - "[84]\tvalid_0's auc: 0.816586\tvalid_0's binary_logloss: 0.454039\n", - "[85]\tvalid_0's auc: 0.816584\tvalid_0's binary_logloss: 0.453482\n", - "[86]\tvalid_0's auc: 0.816881\tvalid_0's binary_logloss: 0.453048\n", - "[87]\tvalid_0's auc: 0.817029\tvalid_0's binary_logloss: 0.452485\n", - "[88]\tvalid_0's auc: 0.81707\tvalid_0's binary_logloss: 0.451941\n", - "[89]\tvalid_0's auc: 0.817298\tvalid_0's binary_logloss: 0.451544\n", - "[90]\tvalid_0's auc: 0.817343\tvalid_0's binary_logloss: 0.450975\n", - "[91]\tvalid_0's auc: 0.817357\tvalid_0's binary_logloss: 0.450422\n", - "[92]\tvalid_0's auc: 0.817592\tvalid_0's binary_logloss: 0.450109\n", - "[93]\tvalid_0's auc: 0.817729\tvalid_0's binary_logloss: 0.449542\n", - "[94]\tvalid_0's auc: 0.817834\tvalid_0's binary_logloss: 0.448982\n", - "[95]\tvalid_0's auc: 0.81809\tvalid_0's binary_logloss: 0.448398\n", - "[96]\tvalid_0's auc: 0.818269\tvalid_0's binary_logloss: 0.447908\n", - "[97]\tvalid_0's auc: 0.818682\tvalid_0's binary_logloss: 0.447547\n", - "[98]\tvalid_0's auc: 0.819015\tvalid_0's binary_logloss: 0.447165\n", - "[99]\tvalid_0's auc: 0.819016\tvalid_0's 
binary_logloss: 0.446669\n", - "[100]\tvalid_0's auc: 0.819127\tvalid_0's binary_logloss: 0.446397\n", - "Did not meet early stopping. Best iteration is:\n", - "[100]\tvalid_0's auc: 0.819127\tvalid_0's binary_logloss: 0.446397\n" - ] - } - ], - "source": [ - "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", - "# 这一部分与前面的单独训练和验证是分开的\n", - "def get_kfold_users(trn_df, n=5):\n", - " user_ids = trn_df['user_id'].unique()\n", - " user_set = [user_ids[i::n] for i in range(n)]\n", - " return user_set\n", - "\n", - "k_fold = 5\n", - "trn_df = trn_user_item_feats_df_rank_model\n", - "user_set = get_kfold_users(trn_df, n=k_fold)\n", - "\n", - "score_list = []\n", - "score_df = trn_df[['user_id', 'click_article_id', 'label']]\n", - "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", - "\n", - "# 五折交叉验证,并将中间结果保存用于staking\n", - "for n_fold, valid_user in enumerate(user_set):\n", - " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", - " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", - " \n", - " # 模型及参数的定义\n", - " lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", - " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", - " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10) \n", - " # 训练模型\n", - " lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], \n", - " eval_metric=['auc', ],early_stopping_rounds=50, )\n", - " \n", - " # 预测验证集结果\n", - " valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols], \n", - " num_iteration=lgb_Classfication.best_iteration_)[:,1]\n", - " \n", - " # 对输出结果进行归一化 分类模型输出的值本身就是一个概率值不需要进行归一化\n", - " # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))\n", - " \n", - " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", - " valid_idx['pred_rank'] = 
valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - " \n", - " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", - " score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", - " \n", - " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", - " if not offline:\n", - " sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols], \n", - " num_iteration=lgb_Classfication.best_iteration_)[:,1]\n", - " \n", - "score_df_ = pd.concat(score_list, axis=0)\n", - "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n", - "# 保存训练集交叉验证产生的新特征\n", - "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)\n", - " \n", - "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", - "tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold\n", - "tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))\n", - "tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])\n", - "tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - "\n", - "# 保存测试集交叉验证的新特征\n", - "tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:24:23.074237Z", - "start_time": "2020-11-18T04:24:13.812284Z" - } - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]\n", - "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", - "submit(rank_results, topk=5, 
model_name='lgb_cls')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DIN模型" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户的历史点击行为列表\n", - "这个是为后面的DIN模型服务的" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:24:30.508213Z", - "start_time": "2020-11-18T04:24:27.426372Z" - } - }, - "outputs": [], - "source": [ - "if offline:\n", - " all_data = pd.read_csv('./data_raw/train_click_log.csv')\n", - "else:\n", - " trn_data = pd.read_csv('./data_raw/train_click_log.csv')\n", - " tst_data = pd.read_csv('./data_raw/testA_click_log.csv')\n", - " all_data = trn_data.append(tst_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:25:28.082071Z", - "start_time": "2020-11-18T04:24:33.649524Z" - } - }, - "outputs": [], - "source": [ - "hist_click =all_data[['user_id', 'click_article_id']].groupby('user_id').agg({list}).reset_index()\n", - "his_behavior_df = pd.DataFrame()\n", - "his_behavior_df['user_id'] = hist_click['user_id']\n", - "his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:25:52.925866Z", - "start_time": "2020-11-18T04:25:52.863922Z" - } - }, - "outputs": [], - "source": [ - "trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()\n", - "\n", - "if offline:\n", - " val_user_item_feats_df_din_model = val_user_item_feats_df.copy()\n", - "else: \n", - " val_user_item_feats_df_din_model = None\n", - " \n", - "tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-11-18T04:26:00.070681Z", - "start_time": "2020-11-18T04:25:56.417197Z" - } - }, - "outputs": [], - "source": [ - "trn_user_item_feats_df_din_model = trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')\n", - "\n", - "if offline:\n", - " val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')\n", - "else:\n", - " val_user_item_feats_df_din_model = None\n", - "\n", - "tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### DIN模型简介\n", - "我们下面尝试使用DIN模型, DIN的全称是Deep Interest Network, 这是阿里2018年基于前面的深度学习模型无法表达用户多样化的兴趣而提出的一个模型, 它可以通过考虑【给定的候选广告】和【用户的历史行为】的相关性,来计算用户兴趣的表示向量。具体来说就是通过引入局部激活单元,通过软搜索历史行为的相关部分来关注相关的用户兴趣,并采用加权和来获得有关候选广告的用户兴趣的表示。与候选广告相关性较高的行为会获得较高的激活权重,并支配着用户兴趣。该表示向量在不同广告上有所不同,大大提高了模型的表达能力。所以该模型对于此次新闻推荐的任务也比较适合, 我们在这里通过当前的候选文章与用户历史点击文章的相关性来计算用户对于文章的兴趣。 该模型的结构如下:\n", - "\n", - "![image-20201116201646983](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201116201646983.png)\n", - "\n", - "\n", - "我们这里直接调包来使用这个模型, 关于这个模型的详细细节部分我们会在下一期的推荐系统组队学习中给出。下面说一下该模型如何具体使用:deepctr的函数原型如下:\n", - "> def DIN(dnn_feature_columns, history_feature_list, dnn_use_bn=False,\n", - "> dnn_hidden_units=(200, 80), dnn_activation='relu', att_hidden_size=(80, 40), att_activation=\"dice\",\n", - "> att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024,\n", - "> task='binary'):\n", - "> \n", - "> * dnn_feature_columns: 特征列, 包含数据所有特征的列表\n", - "> * history_feature_list: 用户历史行为列, 反应用户历史行为的特征的列表\n", - "> * dnn_use_bn: 是否使用BatchNormalization\n", - "> * dnn_hidden_units: 全连接层网络的层数和每一层神经元的个数, 一个列表或者元组\n", - "> * dnn_activation_relu: 全连接网络的激活单元类型\n", - "> * att_hidden_size: 注意力层的全连接网络的层数和每一层神经元的个数\n", - "> * att_activation: 注意力层的激活单元类型\n", - "> * 
att_weight_normalization: 是否归一化注意力得分\n", - "> * l2_reg_dnn: 全连接网络的正则化系数\n", - "> * l2_reg_embedding: embedding向量的正则化稀疏\n", - "> * dnn_dropout: 全连接网络的神经元的失活概率\n", - "> * task: 任务, 可以是分类, 也可是是回归\n", - "\n", - "在具体使用的时候, 我们必须要传入特征列和历史行为列, 但是再传入之前, 我们需要进行一下特征列的预处理。具体如下:\n", - "\n", - "1. 首先,我们要处理数据集, 得到数据, 由于我们是基于用户过去的行为去预测用户是否点击当前文章, 所以我们需要把数据的特征列划分成数值型特征, 离散型特征和历史行为特征列三部分, 对于每一部分, DIN模型的处理会有不同\n", - " 1. 对于离散型特征, 在我们的数据集中就是那些类别型的特征, 比如user_id这种, 这种类别型特征, 我们首先要经过embedding处理得到每个特征的低维稠密型表示, 既然要经过embedding, 那么我们就需要为每一列的类别特征的取值建立一个字典,并指明embedding维度, 所以在使用deepctr的DIN模型准备数据的时候, 我们需要通过SparseFeat函数指明这些类别型特征, 这个函数的传入参数就是列名, 列的唯一取值(建立字典用)和embedding维度。\n", - " 2. 对于用户历史行为特征列, 比如文章id, 文章的类别等这种, 同样的我们需要先经过embedding处理, 只不过和上面不一样的地方是,对于这种特征, 我们在得到每个特征的embedding表示之后, 还需要通过一个Attention_layer计算用户的历史行为和当前候选文章的相关性以此得到当前用户的embedding向量, 这个向量就可以基于当前的候选文章与用户过去点击过得历史文章的相似性的程度来反应用户的兴趣, 并且随着用户的不同的历史点击来变化,去动态的模拟用户兴趣的变化过程。这类特征对于每个用户都是一个历史行为序列, 对于每个用户, 历史行为序列长度会不一样, 可能有的用户点击的历史文章多,有的点击的历史文章少, 所以我们还需要把这个长度统一起来, 在为DIN模型准备数据的时候, 我们首先要通过SparseFeat函数指明这些类别型特征, 然后还需要通过VarLenSparseFeat函数再进行序列填充, 使得每个用户的历史序列一样长, 所以这个函数参数中会有个maxlen,来指明序列的最大长度是多少。\n", - " 3. 对于连续型特征列, 我们只需要用DenseFeat函数来指明列名和维度即可。\n", - "2. 
处理完特征列之后, 我们把相应的数据与列进行对应,就得到了最后的数据。\n", - "\n", - "下面根据具体的代码感受一下, 逻辑是这样, 首先我们需要写一个数据准备函数, 在这里面就是根据上面的具体步骤准备数据, 得到数据和特征列, 然后就是建立DIN模型并训练, 最后基于模型进行测试。" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:08.405211Z", - "start_time": "2020-11-18T04:26:04.887013Z" - } - }, - "outputs": [], - "source": [ - "# 导入deepctr\n", - "from deepctr.models import DIN\n", - "from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names\n", - "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", - "\n", - "from tensorflow.keras import backend as K\n", - "from tensorflow.keras.layers import *\n", - "from tensorflow.keras.models import *\n", - "from tensorflow.keras.callbacks import * \n", - "import tensorflow as tf\n", - "\n", - "import os\n", - "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", - "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:13.485712Z", - "start_time": "2020-11-18T04:26:13.476042Z" - } - }, - "outputs": [], - "source": [ - "# 数据准备函数\n", - "def get_din_feats_columns(df, dense_fea, sparse_fea, behavior_fea, his_behavior_fea, emb_dim=32, max_len=100):\n", - " \"\"\"\n", - " 数据准备函数:\n", - " df: 数据集\n", - " dense_fea: 数值型特征列\n", - " sparse_fea: 离散型特征列\n", - " behavior_fea: 用户的候选行为特征列\n", - " his_behavior_fea: 用户的历史行为特征列\n", - " embedding_dim: embedding的维度, 这里为了简单, 统一把离散型特征列采用一样的隐向量维度\n", - " max_len: 用户序列的最大长度\n", - " \"\"\"\n", - " \n", - " sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1, embedding_dim=emb_dim) for feat in sparse_fea]\n", - " \n", - " dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]\n", - " \n", - " var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['click_article_id'].nunique() + 1,\n", - " embedding_dim=emb_dim, 
embedding_name='click_article_id'), maxlen=max_len) for feat in hist_behavior_fea]\n", - " \n", - " dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns\n", - " \n", - " # 建立x, x是一个字典的形式\n", - " x = {}\n", - " for name in get_feature_names(dnn_feature_columns):\n", - " if name in his_behavior_fea:\n", - " # 这是历史行为序列\n", - " his_list = [l for l in df[name]]\n", - " x[name] = pad_sequences(his_list, maxlen=max_len, padding='post') # 二维数组\n", - " else:\n", - " x[name] = df[name].values\n", - " \n", - " return x, dnn_feature_columns" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:18.783217Z", - "start_time": "2020-11-18T04:26:18.776795Z" - } - }, - "outputs": [], - "source": [ - "# 把特征分开\n", - "sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup', \n", - " 'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']\n", - "\n", - "behavior_fea = ['click_article_id']\n", - "\n", - "hist_behavior_fea = ['hist_click_article_id']\n", - "\n", - "dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',\n", - " 'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',\n", - " 'words_hbo','words_count']" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:25.469810Z", - "start_time": "2020-11-18T04:26:24.779347Z" - } - }, - "outputs": [], - "source": [ - "# dense特征进行归一化, 神经网络训练都需要将数值进行归一化处理\n", - "mm = MinMaxScaler()\n", - "\n", - "# 下面是做一些特殊处理,当在其他的地方出现无效值的时候,不处理无法进行归一化,刚开始可以先把他注释掉,在运行了下面的代码\n", - "# 之后如果发现报错,应该先去想办法处理如何不出现inf之类的值\n", - "# trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)\n", - "# tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)\n", - "\n", - "for feat in dense_fea:\n", - 
" trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]])\n", - " \n", - " if val_user_item_feats_df_din_model is not None:\n", - " val_user_item_feats_df_din_model[feat] = mm.fit_transform(val_user_item_feats_df_din_model[[feat]])\n", - " \n", - " tst_user_item_feats_df_din_model[feat] = mm.fit_transform(tst_user_item_feats_df_din_model[[feat]])" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:36.727753Z", - "start_time": "2020-11-18T04:26:28.854705Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:28.616665Z", + "start_time": "2020-11-18T04:21:24.672280Z" + } + }, + "outputs": [], + "source": [ + "# 模型预测\n", + "tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)\n", + "\n", + "# 将这里的排序结果保存一份,用户后面的模型融合\n", + "tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_ranker_score.csv', index=False)" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:143: calling RandomNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n" - ] - } - ], - "source": [ - "# 准备训练数据\n", - "x_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", - "y_trn = trn_user_item_feats_df_din_model['label'].values\n", - "\n", - "if offline:\n", - " # 准备验证数据\n", - " x_val, dnn_feature_columns = 
get_din_feats_columns(val_user_item_feats_df_din_model, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", - " y_val = val_user_item_feats_df_din_model['label'].values\n", - " \n", - "dense_fea = [x for x in dense_fea if x != 'label']\n", - "x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:45.146318Z", - "start_time": "2020-11-18T04:26:40.423914Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:40.253692Z", + "start_time": "2020-11-18T04:21:30.546587Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]\n", + "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", + "submit(rank_results, topk=5, model_name='lgb_ranker')" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1288: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n", - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:255: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", - "Model: \"model\"\n", - 
"__________________________________________________________________________________________________\n", - "Layer (type) Output Shape Param # Connected to \n", - "==================================================================================================\n", - "user_id (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_article_id (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "category_id (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_environment (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_deviceGroup (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_os (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_country (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_region (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_referrer_type (InputLayer [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "is_cat_hab (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_user_id (Embedding) (None, 1, 32) 1600032 user_id[0][0] \n", - "__________________________________________________________________________________________________\n", - 
"sparse_seq_emb_hist_click_artic multiple 525664 click_article_id[0][0] \n", - " hist_click_article_id[0][0] \n", - " click_article_id[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_category_id (Embeddi (None, 1, 32) 7776 category_id[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_environment (E (None, 1, 32) 128 click_environment[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_deviceGroup (E (None, 1, 32) 160 click_deviceGroup[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_os (Embedding) (None, 1, 32) 288 click_os[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_country (Embed (None, 1, 32) 384 click_country[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_region (Embedd (None, 1, 32) 928 click_region[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_referrer_type (None, 1, 32) 256 click_referrer_type[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_is_cat_hab (Embeddin (None, 1, 32) 64 is_cat_hab[0][0] \n", - "__________________________________________________________________________________________________\n", - "no_mask (NoMask) (None, 1, 32) 0 sparse_emb_user_id[0][0] \n", - " sparse_seq_emb_hist_click_article\n", - " sparse_emb_category_id[0][0] \n", - " sparse_emb_click_environment[0][0\n", - " sparse_emb_click_deviceGroup[0][0\n", - " sparse_emb_click_os[0][0] \n", - " 
sparse_emb_click_country[0][0] \n", - " sparse_emb_click_region[0][0] \n", - " sparse_emb_click_referrer_type[0]\n", - " sparse_emb_is_cat_hab[0][0] \n", - "__________________________________________________________________________________________________\n", - "hist_click_article_id (InputLay [(None, 50)] 0 \n", - "__________________________________________________________________________________________________\n", - "concatenate (Concatenate) (None, 1, 320) 0 no_mask[0][0] \n", - " no_mask[1][0] \n", - " no_mask[2][0] \n", - " no_mask[3][0] \n", - " no_mask[4][0] \n", - " no_mask[5][0] \n", - " no_mask[6][0] \n", - " no_mask[7][0] \n", - " no_mask[8][0] \n", - " no_mask[9][0] \n", - "__________________________________________________________________________________________________\n", - "no_mask_1 (NoMask) (None, 1, 320) 0 concatenate[0][0] \n", - "__________________________________________________________________________________________________\n", - "attention_sequence_pooling_laye (None, 1, 32) 13961 sparse_seq_emb_hist_click_article\n", - " sparse_seq_emb_hist_click_article\n", - "__________________________________________________________________________________________________\n", - "concatenate_1 (Concatenate) (None, 1, 352) 0 no_mask_1[0][0] \n", - " attention_sequence_pooling_layer[\n", - "__________________________________________________________________________________________________\n", - "sim0 (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "time_diff0 (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "word_diff0 (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "sim_max (InputLayer) [(None, 1)] 0 \n", - 
"__________________________________________________________________________________________________\n", - "sim_min (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "sim_sum (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "sim_mean (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "score (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "rank (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_size (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "time_diff_mean (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "active_level (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "user_time_hob1 (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "user_time_hob2 (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "words_hbo (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "words_count (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "flatten (Flatten) (None, 352) 0 concatenate_1[0][0] \n", - 
"__________________________________________________________________________________________________\n", - "no_mask_3 (NoMask) (None, 1) 0 sim0[0][0] \n", - " time_diff0[0][0] \n", - " word_diff0[0][0] \n", - " sim_max[0][0] \n", - " sim_min[0][0] \n", - " sim_sum[0][0] \n", - " sim_mean[0][0] \n", - " score[0][0] \n", - " rank[0][0] \n", - " click_size[0][0] \n", - " time_diff_mean[0][0] \n", - " active_level[0][0] \n", - " user_time_hob1[0][0] \n", - " user_time_hob2[0][0] \n", - " words_hbo[0][0] \n", - " words_count[0][0] \n", - "__________________________________________________________________________________________________\n", - "no_mask_2 (NoMask) (None, 352) 0 flatten[0][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_2 (Concatenate) (None, 16) 0 no_mask_3[0][0] \n", - " no_mask_3[1][0] \n", - " no_mask_3[2][0] \n", - " no_mask_3[3][0] \n", - " no_mask_3[4][0] \n", - " no_mask_3[5][0] \n", - " no_mask_3[6][0] \n", - " no_mask_3[7][0] \n", - " no_mask_3[8][0] \n", - " no_mask_3[9][0] \n", - " no_mask_3[10][0] \n", - " no_mask_3[11][0] \n", - " no_mask_3[12][0] \n", - " no_mask_3[13][0] \n", - " no_mask_3[14][0] \n", - " no_mask_3[15][0] \n", - "__________________________________________________________________________________________________\n", - "flatten_1 (Flatten) (None, 352) 0 no_mask_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "flatten_2 (Flatten) (None, 16) 0 concatenate_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "no_mask_4 (NoMask) multiple 0 flatten_1[0][0] \n", - " flatten_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_3 (Concatenate) (None, 368) 0 no_mask_4[0][0] \n", - " no_mask_4[1][0] \n", - 
"__________________________________________________________________________________________________\n", - "dnn_1 (DNN) (None, 80) 89880 concatenate_3[0][0] \n", - "__________________________________________________________________________________________________\n", - "dense (Dense) (None, 1) 80 dnn_1[0][0] \n", - "__________________________________________________________________________________________________\n", - "prediction_layer (PredictionLay (None, 1) 1 dense[0][0] \n", - "==================================================================================================\n", - "Total params: 2,239,602\n", - "Trainable params: 2,239,362\n", - "Non-trainable params: 240\n", - "__________________________________________________________________________________________________\n" - ] - } - ], - "source": [ - "# 建立模型\n", - "model = DIN(dnn_feature_columns, behavior_fea)\n", - "\n", - "# 查看模型结构\n", - "model.summary()\n", - "\n", - "# 模型编译\n", - "model.compile('adam', 'binary_crossentropy',metrics=['binary_crossentropy', tf.keras.metrics.AUC()])" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:28:43.885773Z", - "start_time": "2020-11-18T04:26:48.746787Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:22:26.195838Z", + "start_time": "2020-11-18T04:21:46.115002Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1]\tvalid_0's ndcg@1: 0.909975\tvalid_0's ndcg@2: 0.963068\tvalid_0's ndcg@3: 0.96533\tvalid_0's ndcg@4: 0.965729\tvalid_0's ndcg@5: 0.965864\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's ndcg@1: 0.9143\tvalid_0's ndcg@2: 0.964711\tvalid_0's ndcg@3: 0.966961\tvalid_0's ndcg@4: 0.967338\tvalid_0's ndcg@5: 0.967483\n", + "[3]\tvalid_0's ndcg@1: 0.9181\tvalid_0's ndcg@2: 0.966114\tvalid_0's 
ndcg@3: 0.968289\tvalid_0's ndcg@4: 0.968773\tvalid_0's ndcg@5: 0.96887\n", + "[4]\tvalid_0's ndcg@1: 0.925575\tvalid_0's ndcg@2: 0.969093\tvalid_0's ndcg@3: 0.971193\tvalid_0's ndcg@4: 0.971603\tvalid_0's ndcg@5: 0.97169\n", + "[5]\tvalid_0's ndcg@1: 0.9267\tvalid_0's ndcg@2: 0.969635\tvalid_0's ndcg@3: 0.97166\tvalid_0's ndcg@4: 0.972037\tvalid_0's ndcg@5: 0.972133\n", + "[6]\tvalid_0's ndcg@1: 0.927\tvalid_0's ndcg@2: 0.969682\tvalid_0's ndcg@3: 0.971757\tvalid_0's ndcg@4: 0.972134\tvalid_0's ndcg@5: 0.972231\n", + "[7]\tvalid_0's ndcg@1: 0.928825\tvalid_0's ndcg@2: 0.970451\tvalid_0's ndcg@3: 0.972476\tvalid_0's ndcg@4: 0.97282\tvalid_0's ndcg@5: 0.972927\n", + "[8]\tvalid_0's ndcg@1: 0.930025\tvalid_0's ndcg@2: 0.970988\tvalid_0's ndcg@3: 0.972951\tvalid_0's ndcg@4: 0.973295\tvalid_0's ndcg@5: 0.973402\n", + "[9]\tvalid_0's ndcg@1: 0.931125\tvalid_0's ndcg@2: 0.971347\tvalid_0's ndcg@3: 0.973384\tvalid_0's ndcg@4: 0.973707\tvalid_0's ndcg@5: 0.973794\n", + "[10]\tvalid_0's ndcg@1: 0.9311\tvalid_0's ndcg@2: 0.971385\tvalid_0's ndcg@3: 0.973372\tvalid_0's ndcg@4: 0.973717\tvalid_0's ndcg@5: 0.973794\n", + "[11]\tvalid_0's ndcg@1: 0.930975\tvalid_0's ndcg@2: 0.971433\tvalid_0's ndcg@3: 0.973333\tvalid_0's ndcg@4: 0.973699\tvalid_0's ndcg@5: 0.973767\n", + "[12]\tvalid_0's ndcg@1: 0.93145\tvalid_0's ndcg@2: 0.971656\tvalid_0's ndcg@3: 0.973493\tvalid_0's ndcg@4: 0.973881\tvalid_0's ndcg@5: 0.973949\n", + "[13]\tvalid_0's ndcg@1: 0.932525\tvalid_0's ndcg@2: 0.971927\tvalid_0's ndcg@3: 0.973839\tvalid_0's ndcg@4: 0.974227\tvalid_0's ndcg@5: 0.974304\n", + "[14]\tvalid_0's ndcg@1: 0.932575\tvalid_0's ndcg@2: 0.971898\tvalid_0's ndcg@3: 0.973823\tvalid_0's ndcg@4: 0.974243\tvalid_0's ndcg@5: 0.97432\n", + "[15]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972239\tvalid_0's ndcg@3: 0.974189\tvalid_0's ndcg@4: 0.974587\tvalid_0's ndcg@5: 0.974665\n", + "[16]\tvalid_0's ndcg@1: 0.933475\tvalid_0's ndcg@2: 0.972309\tvalid_0's ndcg@3: 0.974209\tvalid_0's ndcg@4: 
0.974596\tvalid_0's ndcg@5: 0.974674\n", + "[17]\tvalid_0's ndcg@1: 0.933725\tvalid_0's ndcg@2: 0.972369\tvalid_0's ndcg@3: 0.974307\tvalid_0's ndcg@4: 0.974684\tvalid_0's ndcg@5: 0.974761\n", + "[18]\tvalid_0's ndcg@1: 0.9339\tvalid_0's ndcg@2: 0.972497\tvalid_0's ndcg@3: 0.974372\tvalid_0's ndcg@4: 0.974749\tvalid_0's ndcg@5: 0.974836\n", + "[19]\tvalid_0's ndcg@1: 0.9345\tvalid_0's ndcg@2: 0.972845\tvalid_0's ndcg@3: 0.974645\tvalid_0's ndcg@4: 0.974979\tvalid_0's ndcg@5: 0.975085\n", + "[20]\tvalid_0's ndcg@1: 0.9349\tvalid_0's ndcg@2: 0.973103\tvalid_0's ndcg@3: 0.97484\tvalid_0's ndcg@4: 0.975174\tvalid_0's ndcg@5: 0.975271\n", + "[21]\tvalid_0's ndcg@1: 0.935\tvalid_0's ndcg@2: 0.973092\tvalid_0's ndcg@3: 0.97488\tvalid_0's ndcg@4: 0.975192\tvalid_0's ndcg@5: 0.975289\n", + "[22]\tvalid_0's ndcg@1: 0.93525\tvalid_0's ndcg@2: 0.9732\tvalid_0's ndcg@3: 0.974988\tvalid_0's ndcg@4: 0.975289\tvalid_0's ndcg@5: 0.975386\n", + "[23]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 0.972949\tvalid_0's ndcg@3: 0.974824\tvalid_0's ndcg@4: 0.975136\tvalid_0's ndcg@5: 0.975223\n", + "[24]\tvalid_0's ndcg@1: 0.93545\tvalid_0's ndcg@2: 0.973274\tvalid_0's ndcg@3: 0.975087\tvalid_0's ndcg@4: 0.975388\tvalid_0's ndcg@5: 0.975475\n", + "[25]\tvalid_0's ndcg@1: 0.9356\tvalid_0's ndcg@2: 0.973345\tvalid_0's ndcg@3: 0.97512\tvalid_0's ndcg@4: 0.975443\tvalid_0's ndcg@5: 0.97553\n", + "[26]\tvalid_0's ndcg@1: 0.93525\tvalid_0's ndcg@2: 0.9732\tvalid_0's ndcg@3: 0.975\tvalid_0's ndcg@4: 0.975313\tvalid_0's ndcg@5: 0.9754\n", + "[27]\tvalid_0's ndcg@1: 0.935175\tvalid_0's ndcg@2: 0.97322\tvalid_0's ndcg@3: 0.974983\tvalid_0's ndcg@4: 0.975295\tvalid_0's ndcg@5: 0.975382\n", + "[28]\tvalid_0's ndcg@1: 0.935425\tvalid_0's ndcg@2: 0.973328\tvalid_0's ndcg@3: 0.975041\tvalid_0's ndcg@4: 0.975374\tvalid_0's ndcg@5: 0.975471\n", + "[29]\tvalid_0's ndcg@1: 0.935275\tvalid_0's ndcg@2: 0.973225\tvalid_0's ndcg@3: 0.974963\tvalid_0's ndcg@4: 0.975297\tvalid_0's ndcg@5: 0.975403\n", + 
"[30]\tvalid_0's ndcg@1: 0.9353\tvalid_0's ndcg@2: 0.973235\tvalid_0's ndcg@3: 0.97501\tvalid_0's ndcg@4: 0.975311\tvalid_0's ndcg@5: 0.975418\n", + "[31]\tvalid_0's ndcg@1: 0.9356\tvalid_0's ndcg@2: 0.973361\tvalid_0's ndcg@3: 0.975099\tvalid_0's ndcg@4: 0.975422\tvalid_0's ndcg@5: 0.975528\n", + "[32]\tvalid_0's ndcg@1: 0.9364\tvalid_0's ndcg@2: 0.973641\tvalid_0's ndcg@3: 0.975391\tvalid_0's ndcg@4: 0.975714\tvalid_0's ndcg@5: 0.97582\n", + "[33]\tvalid_0's ndcg@1: 0.9367\tvalid_0's ndcg@2: 0.973751\tvalid_0's ndcg@3: 0.975501\tvalid_0's ndcg@4: 0.975824\tvalid_0's ndcg@5: 0.975931\n", + "[34]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.973902\tvalid_0's ndcg@3: 0.975677\tvalid_0's ndcg@4: 0.975989\tvalid_0's ndcg@5: 0.976095\n", + "[35]\tvalid_0's ndcg@1: 0.9377\tvalid_0's ndcg@2: 0.974105\tvalid_0's ndcg@3: 0.975892\tvalid_0's ndcg@4: 0.976194\tvalid_0's ndcg@5: 0.9763\n", + "[36]\tvalid_0's ndcg@1: 0.938\tvalid_0's ndcg@2: 0.974184\tvalid_0's ndcg@3: 0.975984\tvalid_0's ndcg@4: 0.976296\tvalid_0's ndcg@5: 0.976402\n", + "[37]\tvalid_0's ndcg@1: 0.93845\tvalid_0's ndcg@2: 0.974366\tvalid_0's ndcg@3: 0.976166\tvalid_0's ndcg@4: 0.976467\tvalid_0's ndcg@5: 0.976574\n", + "[38]\tvalid_0's ndcg@1: 0.938925\tvalid_0's ndcg@2: 0.974557\tvalid_0's ndcg@3: 0.976332\tvalid_0's ndcg@4: 0.976655\tvalid_0's ndcg@5: 0.976751\n", + "[39]\tvalid_0's ndcg@1: 0.93865\tvalid_0's ndcg@2: 0.974471\tvalid_0's ndcg@3: 0.976234\tvalid_0's ndcg@4: 0.976557\tvalid_0's ndcg@5: 0.976653\n", + "[40]\tvalid_0's ndcg@1: 0.938325\tvalid_0's ndcg@2: 0.974335\tvalid_0's ndcg@3: 0.97611\tvalid_0's ndcg@4: 0.976433\tvalid_0's ndcg@5: 0.97653\n", + "[41]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.974669\tvalid_0's ndcg@3: 0.976431\tvalid_0's ndcg@4: 0.976743\tvalid_0's ndcg@5: 0.97683\n", + "[42]\tvalid_0's ndcg@1: 0.939375\tvalid_0's ndcg@2: 0.974833\tvalid_0's ndcg@3: 0.976546\tvalid_0's ndcg@4: 0.976858\tvalid_0's ndcg@5: 0.976945\n", + "[43]\tvalid_0's ndcg@1: 0.939625\tvalid_0's 
ndcg@2: 0.974878\tvalid_0's ndcg@3: 0.976628\tvalid_0's ndcg@4: 0.97694\tvalid_0's ndcg@5: 0.977027\n", + "[44]\tvalid_0's ndcg@1: 0.9395\tvalid_0's ndcg@2: 0.974832\tvalid_0's ndcg@3: 0.97657\tvalid_0's ndcg@4: 0.976893\tvalid_0's ndcg@5: 0.97698\n", + "[45]\tvalid_0's ndcg@1: 0.939775\tvalid_0's ndcg@2: 0.974949\tvalid_0's ndcg@3: 0.976674\tvalid_0's ndcg@4: 0.976997\tvalid_0's ndcg@5: 0.977084\n", + "[46]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.974945\tvalid_0's ndcg@3: 0.976708\tvalid_0's ndcg@4: 0.97702\tvalid_0's ndcg@5: 0.977107\n", + "[47]\tvalid_0's ndcg@1: 0.94005\tvalid_0's ndcg@2: 0.975004\tvalid_0's ndcg@3: 0.976766\tvalid_0's ndcg@4: 0.977078\tvalid_0's ndcg@5: 0.977175\n", + "[48]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975189\tvalid_0's ndcg@3: 0.976939\tvalid_0's ndcg@4: 0.97723\tvalid_0's ndcg@5: 0.977327\n", + "[49]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975189\tvalid_0's ndcg@3: 0.976939\tvalid_0's ndcg@4: 0.97723\tvalid_0's ndcg@5: 0.977327\n", + "[50]\tvalid_0's ndcg@1: 0.9405\tvalid_0's ndcg@2: 0.975264\tvalid_0's ndcg@3: 0.976989\tvalid_0's ndcg@4: 0.977291\tvalid_0's ndcg@5: 0.977368\n", + "[51]\tvalid_0's ndcg@1: 0.941125\tvalid_0's ndcg@2: 0.975526\tvalid_0's ndcg@3: 0.977226\tvalid_0's ndcg@4: 0.977528\tvalid_0's ndcg@5: 0.977605\n", + "[52]\tvalid_0's ndcg@1: 0.941\tvalid_0's ndcg@2: 0.97548\tvalid_0's ndcg@3: 0.977193\tvalid_0's ndcg@4: 0.977484\tvalid_0's ndcg@5: 0.977561\n", + "[53]\tvalid_0's ndcg@1: 0.9411\tvalid_0's ndcg@2: 0.975596\tvalid_0's ndcg@3: 0.977259\tvalid_0's ndcg@4: 0.977539\tvalid_0's ndcg@5: 0.977616\n", + "[54]\tvalid_0's ndcg@1: 0.9412\tvalid_0's ndcg@2: 0.975712\tvalid_0's ndcg@3: 0.977299\tvalid_0's ndcg@4: 0.97759\tvalid_0's ndcg@5: 0.977667\n", + "[55]\tvalid_0's ndcg@1: 0.94155\tvalid_0's ndcg@2: 0.975841\tvalid_0's ndcg@3: 0.977429\tvalid_0's ndcg@4: 0.977719\tvalid_0's ndcg@5: 0.977797\n", + "[56]\tvalid_0's ndcg@1: 0.941825\tvalid_0's ndcg@2: 0.975943\tvalid_0's ndcg@3: 
0.97753\tvalid_0's ndcg@4: 0.977821\tvalid_0's ndcg@5: 0.977898\n", + "[57]\tvalid_0's ndcg@1: 0.9416\tvalid_0's ndcg@2: 0.975891\tvalid_0's ndcg@3: 0.977429\tvalid_0's ndcg@4: 0.977741\tvalid_0's ndcg@5: 0.977818\n", + "[58]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.975969\tvalid_0's ndcg@3: 0.977494\tvalid_0's ndcg@4: 0.977795\tvalid_0's ndcg@5: 0.977873\n", + "[59]\tvalid_0's ndcg@1: 0.942025\tvalid_0's ndcg@2: 0.975985\tvalid_0's ndcg@3: 0.977547\tvalid_0's ndcg@4: 0.977881\tvalid_0's ndcg@5: 0.977958\n", + "[60]\tvalid_0's ndcg@1: 0.94205\tvalid_0's ndcg@2: 0.975994\tvalid_0's ndcg@3: 0.977569\tvalid_0's ndcg@4: 0.977892\tvalid_0's ndcg@5: 0.977969\n", + "[61]\tvalid_0's ndcg@1: 0.94205\tvalid_0's ndcg@2: 0.975947\tvalid_0's ndcg@3: 0.977559\tvalid_0's ndcg@4: 0.977882\tvalid_0's ndcg@5: 0.97796\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[62]\tvalid_0's ndcg@1: 0.942225\tvalid_0's ndcg@2: 0.976027\tvalid_0's ndcg@3: 0.97764\tvalid_0's ndcg@4: 0.977941\tvalid_0's ndcg@5: 0.978028\n", + "[63]\tvalid_0's ndcg@1: 0.942125\tvalid_0's ndcg@2: 0.976022\tvalid_0's ndcg@3: 0.977622\tvalid_0's ndcg@4: 0.977912\tvalid_0's ndcg@5: 0.977999\n", + "[64]\tvalid_0's ndcg@1: 0.942675\tvalid_0's ndcg@2: 0.976193\tvalid_0's ndcg@3: 0.977793\tvalid_0's ndcg@4: 0.978105\tvalid_0's ndcg@5: 0.978192\n", + "[65]\tvalid_0's ndcg@1: 0.942725\tvalid_0's ndcg@2: 0.976227\tvalid_0's ndcg@3: 0.977802\tvalid_0's ndcg@4: 0.978125\tvalid_0's ndcg@5: 0.978212\n", + "[66]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976132\tvalid_0's ndcg@3: 0.977695\tvalid_0's ndcg@4: 0.978018\tvalid_0's ndcg@5: 0.978105\n", + "[67]\tvalid_0's ndcg@1: 0.9424\tvalid_0's ndcg@2: 0.976092\tvalid_0's ndcg@3: 0.977679\tvalid_0's ndcg@4: 0.978002\tvalid_0's ndcg@5: 0.978089\n", + "[68]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976148\tvalid_0's ndcg@3: 0.977698\tvalid_0's ndcg@4: 0.978021\tvalid_0's ndcg@5: 0.978108\n", + "[69]\tvalid_0's ndcg@1: 0.9424\tvalid_0's 
ndcg@2: 0.976123\tvalid_0's ndcg@3: 0.977686\tvalid_0's ndcg@4: 0.978009\tvalid_0's ndcg@5: 0.978096\n", + "[70]\tvalid_0's ndcg@1: 0.942625\tvalid_0's ndcg@2: 0.976222\tvalid_0's ndcg@3: 0.977785\tvalid_0's ndcg@4: 0.978097\tvalid_0's ndcg@5: 0.978184\n", + "[71]\tvalid_0's ndcg@1: 0.942575\tvalid_0's ndcg@2: 0.976188\tvalid_0's ndcg@3: 0.977763\tvalid_0's ndcg@4: 0.978075\tvalid_0's ndcg@5: 0.978162\n", + "[72]\tvalid_0's ndcg@1: 0.9427\tvalid_0's ndcg@2: 0.976234\tvalid_0's ndcg@3: 0.977809\tvalid_0's ndcg@4: 0.978121\tvalid_0's ndcg@5: 0.978208\n", + "[73]\tvalid_0's ndcg@1: 0.9428\tvalid_0's ndcg@2: 0.976255\tvalid_0's ndcg@3: 0.977843\tvalid_0's ndcg@4: 0.978155\tvalid_0's ndcg@5: 0.978242\n", + "[74]\tvalid_0's ndcg@1: 0.94295\tvalid_0's ndcg@2: 0.97631\tvalid_0's ndcg@3: 0.977898\tvalid_0's ndcg@4: 0.97821\tvalid_0's ndcg@5: 0.978297\n", + "[75]\tvalid_0's ndcg@1: 0.943\tvalid_0's ndcg@2: 0.976329\tvalid_0's ndcg@3: 0.977941\tvalid_0's ndcg@4: 0.978232\tvalid_0's ndcg@5: 0.978319\n", + "[76]\tvalid_0's ndcg@1: 0.9433\tvalid_0's ndcg@2: 0.976471\tvalid_0's ndcg@3: 0.978059\tvalid_0's ndcg@4: 0.97836\tvalid_0's ndcg@5: 0.978437\n", + "[77]\tvalid_0's ndcg@1: 0.94315\tvalid_0's ndcg@2: 0.976416\tvalid_0's ndcg@3: 0.977991\tvalid_0's ndcg@4: 0.978314\tvalid_0's ndcg@5: 0.978381\n", + "[78]\tvalid_0's ndcg@1: 0.943675\tvalid_0's ndcg@2: 0.976657\tvalid_0's ndcg@3: 0.978194\tvalid_0's ndcg@4: 0.978517\tvalid_0's ndcg@5: 0.978585\n", + "[79]\tvalid_0's ndcg@1: 0.94365\tvalid_0's ndcg@2: 0.976663\tvalid_0's ndcg@3: 0.978188\tvalid_0's ndcg@4: 0.978501\tvalid_0's ndcg@5: 0.978578\n", + "[80]\tvalid_0's ndcg@1: 0.943725\tvalid_0's ndcg@2: 0.976628\tvalid_0's ndcg@3: 0.978203\tvalid_0's ndcg@4: 0.978515\tvalid_0's ndcg@5: 0.978593\n", + "[81]\tvalid_0's ndcg@1: 0.943975\tvalid_0's ndcg@2: 0.97672\tvalid_0's ndcg@3: 0.978295\tvalid_0's ndcg@4: 0.978607\tvalid_0's ndcg@5: 0.978685\n", + "[82]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.976822\tvalid_0's ndcg@3: 
0.978397\tvalid_0's ndcg@4: 0.97872\tvalid_0's ndcg@5: 0.978787\n", + "[83]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.976788\tvalid_0's ndcg@3: 0.978375\tvalid_0's ndcg@4: 0.978698\tvalid_0's ndcg@5: 0.978766\n", + "[84]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.97679\tvalid_0's ndcg@3: 0.97839\tvalid_0's ndcg@4: 0.978702\tvalid_0's ndcg@5: 0.97878\n", + "[85]\tvalid_0's ndcg@1: 0.9443\tvalid_0's ndcg@2: 0.976809\tvalid_0's ndcg@3: 0.978421\tvalid_0's ndcg@4: 0.978723\tvalid_0's ndcg@5: 0.9788\n", + "[86]\tvalid_0's ndcg@1: 0.944525\tvalid_0's ndcg@2: 0.976939\tvalid_0's ndcg@3: 0.978502\tvalid_0's ndcg@4: 0.978814\tvalid_0's ndcg@5: 0.978891\n", + "[87]\tvalid_0's ndcg@1: 0.944625\tvalid_0's ndcg@2: 0.976976\tvalid_0's ndcg@3: 0.978551\tvalid_0's ndcg@4: 0.978852\tvalid_0's ndcg@5: 0.97893\n", + "[88]\tvalid_0's ndcg@1: 0.944925\tvalid_0's ndcg@2: 0.977102\tvalid_0's ndcg@3: 0.978677\tvalid_0's ndcg@4: 0.978968\tvalid_0's ndcg@5: 0.979045\n", + "[89]\tvalid_0's ndcg@1: 0.945125\tvalid_0's ndcg@2: 0.977208\tvalid_0's ndcg@3: 0.978758\tvalid_0's ndcg@4: 0.979048\tvalid_0's ndcg@5: 0.979126\n", + "[90]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977135\tvalid_0's ndcg@3: 0.978735\tvalid_0's ndcg@4: 0.979026\tvalid_0's ndcg@5: 0.979104\n", + "[91]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977208\tvalid_0's ndcg@3: 0.978858\tvalid_0's ndcg@4: 0.979138\tvalid_0's ndcg@5: 0.979215\n", + "[92]\tvalid_0's ndcg@1: 0.9455\tvalid_0's ndcg@2: 0.977267\tvalid_0's ndcg@3: 0.978905\tvalid_0's ndcg@4: 0.979174\tvalid_0's ndcg@5: 0.979251\n", + "[93]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977193\tvalid_0's ndcg@3: 0.978818\tvalid_0's ndcg@4: 0.979098\tvalid_0's ndcg@5: 0.979176\n", + "[94]\tvalid_0's ndcg@1: 0.94545\tvalid_0's ndcg@2: 0.97728\tvalid_0's ndcg@3: 0.97888\tvalid_0's ndcg@4: 0.97916\tvalid_0's ndcg@5: 0.979238\n", + "[95]\tvalid_0's ndcg@1: 0.9458\tvalid_0's ndcg@2: 0.977394\tvalid_0's ndcg@3: 0.979006\tvalid_0's ndcg@4: 0.979286\tvalid_0's 
ndcg@5: 0.979364\n", + "[96]\tvalid_0's ndcg@1: 0.946075\tvalid_0's ndcg@2: 0.977527\tvalid_0's ndcg@3: 0.979114\tvalid_0's ndcg@4: 0.979394\tvalid_0's ndcg@5: 0.979472\n", + "[97]\tvalid_0's ndcg@1: 0.946475\tvalid_0's ndcg@2: 0.977659\tvalid_0's ndcg@3: 0.979259\tvalid_0's ndcg@4: 0.979539\tvalid_0's ndcg@5: 0.979616\n", + "[98]\tvalid_0's ndcg@1: 0.94675\tvalid_0's ndcg@2: 0.97776\tvalid_0's ndcg@3: 0.97936\tvalid_0's ndcg@4: 0.979651\tvalid_0's ndcg@5: 0.979719\n", + "[99]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.977831\tvalid_0's ndcg@3: 0.979419\tvalid_0's ndcg@4: 0.97971\tvalid_0's ndcg@5: 0.979777\n", + "[100]\tvalid_0's ndcg@1: 0.9468\tvalid_0's ndcg@2: 0.977794\tvalid_0's ndcg@3: 0.979369\tvalid_0's ndcg@4: 0.979671\tvalid_0's ndcg@5: 0.979739\n", + "Did not meet early stopping. Best iteration is:\n", + "[99]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.977831\tvalid_0's ndcg@3: 0.979419\tvalid_0's ndcg@4: 0.97971\tvalid_0's ndcg@5: 0.979777\n", + "[1]\tvalid_0's ndcg@1: 0.909075\tvalid_0's ndcg@2: 0.963019\tvalid_0's ndcg@3: 0.965069\tvalid_0's ndcg@4: 0.965543\tvalid_0's ndcg@5: 0.965601\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's ndcg@1: 0.9123\tvalid_0's ndcg@2: 0.964273\tvalid_0's ndcg@3: 0.966248\tvalid_0's ndcg@4: 0.966722\tvalid_0's ndcg@5: 0.966789\n", + "[3]\tvalid_0's ndcg@1: 0.915075\tvalid_0's ndcg@2: 0.965691\tvalid_0's ndcg@3: 0.967466\tvalid_0's ndcg@4: 0.967854\tvalid_0's ndcg@5: 0.967922\n", + "[4]\tvalid_0's ndcg@1: 0.91845\tvalid_0's ndcg@2: 0.967047\tvalid_0's ndcg@3: 0.968735\tvalid_0's ndcg@4: 0.969133\tvalid_0's ndcg@5: 0.969201\n", + "[5]\tvalid_0's ndcg@1: 0.92355\tvalid_0's ndcg@2: 0.968961\tvalid_0's ndcg@3: 0.970674\tvalid_0's ndcg@4: 0.97104\tvalid_0's ndcg@5: 0.971098\n", + "[6]\tvalid_0's ndcg@1: 0.9253\tvalid_0's ndcg@2: 0.969607\tvalid_0's ndcg@3: 0.971345\tvalid_0's ndcg@4: 0.971689\tvalid_0's ndcg@5: 0.971747\n", + "[7]\tvalid_0's ndcg@1: 0.926225\tvalid_0's ndcg@2: 
0.969933\tvalid_0's ndcg@3: 0.971708\tvalid_0's ndcg@4: 0.972031\tvalid_0's ndcg@5: 0.972079\n", + "[8]\tvalid_0's ndcg@1: 0.926475\tvalid_0's ndcg@2: 0.970104\tvalid_0's ndcg@3: 0.971804\tvalid_0's ndcg@4: 0.972116\tvalid_0's ndcg@5: 0.972184\n", + "[9]\tvalid_0's ndcg@1: 0.9277\tvalid_0's ndcg@2: 0.970682\tvalid_0's ndcg@3: 0.972307\tvalid_0's ndcg@4: 0.972598\tvalid_0's ndcg@5: 0.972675\n", + "[10]\tvalid_0's ndcg@1: 0.92775\tvalid_0's ndcg@2: 0.970653\tvalid_0's ndcg@3: 0.972316\tvalid_0's ndcg@4: 0.972617\tvalid_0's ndcg@5: 0.972685\n", + "[11]\tvalid_0's ndcg@1: 0.9283\tvalid_0's ndcg@2: 0.97084\tvalid_0's ndcg@3: 0.97254\tvalid_0's ndcg@4: 0.97281\tvalid_0's ndcg@5: 0.972887\n", + "[12]\tvalid_0's ndcg@1: 0.9287\tvalid_0's ndcg@2: 0.971051\tvalid_0's ndcg@3: 0.972701\tvalid_0's ndcg@4: 0.97297\tvalid_0's ndcg@5: 0.973048\n", + "[13]\tvalid_0's ndcg@1: 0.9297\tvalid_0's ndcg@2: 0.971389\tvalid_0's ndcg@3: 0.973001\tvalid_0's ndcg@4: 0.973313\tvalid_0's ndcg@5: 0.9734\n", + "[14]\tvalid_0's ndcg@1: 0.92955\tvalid_0's ndcg@2: 0.971444\tvalid_0's ndcg@3: 0.972994\tvalid_0's ndcg@4: 0.973284\tvalid_0's ndcg@5: 0.973371\n", + "[15]\tvalid_0's ndcg@1: 0.930225\tvalid_0's ndcg@2: 0.97174\tvalid_0's ndcg@3: 0.973253\tvalid_0's ndcg@4: 0.973543\tvalid_0's ndcg@5: 0.97363\n", + "[16]\tvalid_0's ndcg@1: 0.930425\tvalid_0's ndcg@2: 0.971798\tvalid_0's ndcg@3: 0.973298\tvalid_0's ndcg@4: 0.97361\tvalid_0's ndcg@5: 0.973698\n", + "[17]\tvalid_0's ndcg@1: 0.93125\tvalid_0's ndcg@2: 0.971992\tvalid_0's ndcg@3: 0.97358\tvalid_0's ndcg@4: 0.973903\tvalid_0's ndcg@5: 0.97398\n", + "[18]\tvalid_0's ndcg@1: 0.931925\tvalid_0's ndcg@2: 0.972257\tvalid_0's ndcg@3: 0.973845\tvalid_0's ndcg@4: 0.974146\tvalid_0's ndcg@5: 0.974224\n", + "[19]\tvalid_0's ndcg@1: 0.932375\tvalid_0's ndcg@2: 0.972376\tvalid_0's ndcg@3: 0.974038\tvalid_0's ndcg@4: 0.974318\tvalid_0's ndcg@5: 0.974376\n", + "[20]\tvalid_0's ndcg@1: 0.932\tvalid_0's ndcg@2: 0.972269\tvalid_0's ndcg@3: 0.973907\tvalid_0's 
ndcg@4: 0.974187\tvalid_0's ndcg@5: 0.974245\n", + "[21]\tvalid_0's ndcg@1: 0.932725\tvalid_0's ndcg@2: 0.972568\tvalid_0's ndcg@3: 0.974181\tvalid_0's ndcg@4: 0.974471\tvalid_0's ndcg@5: 0.974529\n", + "[22]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.972735\tvalid_0's ndcg@3: 0.974298\tvalid_0's ndcg@4: 0.974599\tvalid_0's ndcg@5: 0.974657\n", + "[23]\tvalid_0's ndcg@1: 0.932925\tvalid_0's ndcg@2: 0.972642\tvalid_0's ndcg@3: 0.974255\tvalid_0's ndcg@4: 0.974545\tvalid_0's ndcg@5: 0.974594\n", + "[24]\tvalid_0's ndcg@1: 0.933175\tvalid_0's ndcg@2: 0.972734\tvalid_0's ndcg@3: 0.974347\tvalid_0's ndcg@4: 0.974638\tvalid_0's ndcg@5: 0.974686\n", + "[25]\tvalid_0's ndcg@1: 0.9331\tvalid_0's ndcg@2: 0.972754\tvalid_0's ndcg@3: 0.974366\tvalid_0's ndcg@4: 0.974636\tvalid_0's ndcg@5: 0.974674\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[26]\tvalid_0's ndcg@1: 0.933275\tvalid_0's ndcg@2: 0.972787\tvalid_0's ndcg@3: 0.974424\tvalid_0's ndcg@4: 0.974694\tvalid_0's ndcg@5: 0.974732\n", + "[27]\tvalid_0's ndcg@1: 0.93325\tvalid_0's ndcg@2: 0.972809\tvalid_0's ndcg@3: 0.974434\tvalid_0's ndcg@4: 0.974703\tvalid_0's ndcg@5: 0.974732\n", + "[28]\tvalid_0's ndcg@1: 0.933625\tvalid_0's ndcg@2: 0.972932\tvalid_0's ndcg@3: 0.974557\tvalid_0's ndcg@4: 0.974826\tvalid_0's ndcg@5: 0.974855\n", + "[29]\tvalid_0's ndcg@1: 0.933725\tvalid_0's ndcg@2: 0.972937\tvalid_0's ndcg@3: 0.974587\tvalid_0's ndcg@4: 0.974856\tvalid_0's ndcg@5: 0.974885\n", + "[30]\tvalid_0's ndcg@1: 0.93355\tvalid_0's ndcg@2: 0.972873\tvalid_0's ndcg@3: 0.974523\tvalid_0's ndcg@4: 0.974792\tvalid_0's ndcg@5: 0.974821\n", + "[31]\tvalid_0's ndcg@1: 0.9342\tvalid_0's ndcg@2: 0.973065\tvalid_0's ndcg@3: 0.974753\tvalid_0's ndcg@4: 0.975022\tvalid_0's ndcg@5: 0.975051\n", + "[32]\tvalid_0's ndcg@1: 0.93435\tvalid_0's ndcg@2: 0.973152\tvalid_0's ndcg@3: 0.974815\tvalid_0's ndcg@4: 0.975084\tvalid_0's ndcg@5: 0.975113\n", + "[33]\tvalid_0's ndcg@1: 0.934475\tvalid_0's ndcg@2: 
0.97323\tvalid_0's ndcg@3: 0.974855\tvalid_0's ndcg@4: 0.975135\tvalid_0's ndcg@5: 0.975164\n", + "[34]\tvalid_0's ndcg@1: 0.9342\tvalid_0's ndcg@2: 0.973113\tvalid_0's ndcg@3: 0.974738\tvalid_0's ndcg@4: 0.975028\tvalid_0's ndcg@5: 0.975057\n", + "[35]\tvalid_0's ndcg@1: 0.93455\tvalid_0's ndcg@2: 0.973258\tvalid_0's ndcg@3: 0.97487\tvalid_0's ndcg@4: 0.975172\tvalid_0's ndcg@5: 0.975201\n", + "[36]\tvalid_0's ndcg@1: 0.9344\tvalid_0's ndcg@2: 0.973265\tvalid_0's ndcg@3: 0.974828\tvalid_0's ndcg@4: 0.975129\tvalid_0's ndcg@5: 0.975158\n", + "[37]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 0.973438\tvalid_0's ndcg@3: 0.975013\tvalid_0's ndcg@4: 0.975304\tvalid_0's ndcg@5: 0.975323\n", + "[38]\tvalid_0's ndcg@1: 0.934975\tvalid_0's ndcg@2: 0.973541\tvalid_0's ndcg@3: 0.975066\tvalid_0's ndcg@4: 0.975367\tvalid_0's ndcg@5: 0.975386\n", + "[39]\tvalid_0's ndcg@1: 0.935275\tvalid_0's ndcg@2: 0.973667\tvalid_0's ndcg@3: 0.975192\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975502\n", + "[40]\tvalid_0's ndcg@1: 0.9352\tvalid_0's ndcg@2: 0.973624\tvalid_0's ndcg@3: 0.975174\tvalid_0's ndcg@4: 0.975454\tvalid_0's ndcg@5: 0.975473\n", + "[41]\tvalid_0's ndcg@1: 0.935325\tvalid_0's ndcg@2: 0.973686\tvalid_0's ndcg@3: 0.975223\tvalid_0's ndcg@4: 0.975503\tvalid_0's ndcg@5: 0.975522\n", + "[42]\tvalid_0's ndcg@1: 0.93545\tvalid_0's ndcg@2: 0.973716\tvalid_0's ndcg@3: 0.975266\tvalid_0's ndcg@4: 0.975546\tvalid_0's ndcg@5: 0.975565\n", + "[43]\tvalid_0's ndcg@1: 0.93615\tvalid_0's ndcg@2: 0.974022\tvalid_0's ndcg@3: 0.975534\tvalid_0's ndcg@4: 0.975814\tvalid_0's ndcg@5: 0.975843\n", + "[44]\tvalid_0's ndcg@1: 0.936225\tvalid_0's ndcg@2: 0.974112\tvalid_0's ndcg@3: 0.975562\tvalid_0's ndcg@4: 0.975853\tvalid_0's ndcg@5: 0.975882\n", + "[45]\tvalid_0's ndcg@1: 0.9365\tvalid_0's ndcg@2: 0.974167\tvalid_0's ndcg@3: 0.975654\tvalid_0's ndcg@4: 0.975945\tvalid_0's ndcg@5: 0.975974\n", + "[46]\tvalid_0's ndcg@1: 0.93665\tvalid_0's ndcg@2: 0.974206\tvalid_0's ndcg@3: 
0.975694\tvalid_0's ndcg@4: 0.975995\tvalid_0's ndcg@5: 0.976024\n", + "[47]\tvalid_0's ndcg@1: 0.93685\tvalid_0's ndcg@2: 0.974311\tvalid_0's ndcg@3: 0.975786\tvalid_0's ndcg@4: 0.976077\tvalid_0's ndcg@5: 0.976106\n", + "[48]\tvalid_0's ndcg@1: 0.937025\tvalid_0's ndcg@2: 0.974408\tvalid_0's ndcg@3: 0.975845\tvalid_0's ndcg@4: 0.976147\tvalid_0's ndcg@5: 0.976185\n", + "[49]\tvalid_0's ndcg@1: 0.936975\tvalid_0's ndcg@2: 0.974342\tvalid_0's ndcg@3: 0.975829\tvalid_0's ndcg@4: 0.97612\tvalid_0's ndcg@5: 0.976159\n", + "[50]\tvalid_0's ndcg@1: 0.9371\tvalid_0's ndcg@2: 0.974388\tvalid_0's ndcg@3: 0.97585\tvalid_0's ndcg@4: 0.976152\tvalid_0's ndcg@5: 0.976191\n", + "[51]\tvalid_0's ndcg@1: 0.937025\tvalid_0's ndcg@2: 0.974329\tvalid_0's ndcg@3: 0.975841\tvalid_0's ndcg@4: 0.976121\tvalid_0's ndcg@5: 0.97616\n", + "[52]\tvalid_0's ndcg@1: 0.9377\tvalid_0's ndcg@2: 0.974578\tvalid_0's ndcg@3: 0.976078\tvalid_0's ndcg@4: 0.976369\tvalid_0's ndcg@5: 0.976407\n", + "[53]\tvalid_0's ndcg@1: 0.9378\tvalid_0's ndcg@2: 0.974615\tvalid_0's ndcg@3: 0.976115\tvalid_0's ndcg@4: 0.976405\tvalid_0's ndcg@5: 0.976444\n", + "[54]\tvalid_0's ndcg@1: 0.938\tvalid_0's ndcg@2: 0.974689\tvalid_0's ndcg@3: 0.976214\tvalid_0's ndcg@4: 0.976483\tvalid_0's ndcg@5: 0.976521\n", + "[55]\tvalid_0's ndcg@1: 0.938225\tvalid_0's ndcg@2: 0.974803\tvalid_0's ndcg@3: 0.976303\tvalid_0's ndcg@4: 0.976572\tvalid_0's ndcg@5: 0.976611\n", + "[56]\tvalid_0's ndcg@1: 0.938175\tvalid_0's ndcg@2: 0.9748\tvalid_0's ndcg@3: 0.976275\tvalid_0's ndcg@4: 0.976555\tvalid_0's ndcg@5: 0.976594\n", + "[57]\tvalid_0's ndcg@1: 0.938525\tvalid_0's ndcg@2: 0.974914\tvalid_0's ndcg@3: 0.976414\tvalid_0's ndcg@4: 0.976683\tvalid_0's ndcg@5: 0.976722\n", + "[58]\tvalid_0's ndcg@1: 0.93875\tvalid_0's ndcg@2: 0.975028\tvalid_0's ndcg@3: 0.976503\tvalid_0's ndcg@4: 0.976773\tvalid_0's ndcg@5: 0.976811\n", + "[59]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975198\tvalid_0's ndcg@3: 0.976648\tvalid_0's ndcg@4: 
0.976918\tvalid_0's ndcg@5: 0.976956\n", + "[60]\tvalid_0's ndcg@1: 0.939025\tvalid_0's ndcg@2: 0.975177\tvalid_0's ndcg@3: 0.976615\tvalid_0's ndcg@4: 0.976884\tvalid_0's ndcg@5: 0.976923\n", + "[61]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.975205\tvalid_0's ndcg@3: 0.976642\tvalid_0's ndcg@4: 0.976912\tvalid_0's ndcg@5: 0.97695\n", + "[62]\tvalid_0's ndcg@1: 0.93965\tvalid_0's ndcg@2: 0.975424\tvalid_0's ndcg@3: 0.976836\tvalid_0's ndcg@4: 0.977116\tvalid_0's ndcg@5: 0.977155\n", + "[63]\tvalid_0's ndcg@1: 0.940075\tvalid_0's ndcg@2: 0.975596\tvalid_0's ndcg@3: 0.976996\tvalid_0's ndcg@4: 0.977276\tvalid_0's ndcg@5: 0.977315\n", + "[64]\tvalid_0's ndcg@1: 0.940375\tvalid_0's ndcg@2: 0.975723\tvalid_0's ndcg@3: 0.977123\tvalid_0's ndcg@4: 0.977392\tvalid_0's ndcg@5: 0.977431\n", + "[65]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975766\tvalid_0's ndcg@3: 0.977154\tvalid_0's ndcg@4: 0.977423\tvalid_0's ndcg@5: 0.977462\n", + "[66]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.975744\tvalid_0's ndcg@3: 0.977156\tvalid_0's ndcg@4: 0.977426\tvalid_0's ndcg@5: 0.977464\n", + "[67]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.97576\tvalid_0's ndcg@3: 0.977172\tvalid_0's ndcg@4: 0.977431\tvalid_0's ndcg@5: 0.977469\n", + "[68]\tvalid_0's ndcg@1: 0.940675\tvalid_0's ndcg@2: 0.975849\tvalid_0's ndcg@3: 0.977249\tvalid_0's ndcg@4: 0.977508\tvalid_0's ndcg@5: 0.977546\n", + "[69]\tvalid_0's ndcg@1: 0.9413\tvalid_0's ndcg@2: 0.976017\tvalid_0's ndcg@3: 0.977454\tvalid_0's ndcg@4: 0.977724\tvalid_0's ndcg@5: 0.977762\n", + "[70]\tvalid_0's ndcg@1: 0.94105\tvalid_0's ndcg@2: 0.975925\tvalid_0's ndcg@3: 0.977362\tvalid_0's ndcg@4: 0.977631\tvalid_0's ndcg@5: 0.97767\n", + "[71]\tvalid_0's ndcg@1: 0.94105\tvalid_0's ndcg@2: 0.975925\tvalid_0's ndcg@3: 0.97735\tvalid_0's ndcg@4: 0.97763\tvalid_0's ndcg@5: 0.977668\n", + "[72]\tvalid_0's ndcg@1: 0.941325\tvalid_0's ndcg@2: 0.976058\tvalid_0's ndcg@3: 0.97747\tvalid_0's ndcg@4: 0.977739\tvalid_0's ndcg@5: 
0.977778\n", + "[73]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.976076\tvalid_0's ndcg@3: 0.977476\tvalid_0's ndcg@4: 0.977756\tvalid_0's ndcg@5: 0.977795\n", + "[74]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.97619\tvalid_0's ndcg@3: 0.97759\tvalid_0's ndcg@4: 0.97788\tvalid_0's ndcg@5: 0.977919\n", + "[75]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.97619\tvalid_0's ndcg@3: 0.977602\tvalid_0's ndcg@4: 0.977882\tvalid_0's ndcg@5: 0.977921\n", + "[76]\tvalid_0's ndcg@1: 0.94195\tvalid_0's ndcg@2: 0.976273\tvalid_0's ndcg@3: 0.977685\tvalid_0's ndcg@4: 0.977965\tvalid_0's ndcg@5: 0.978004\n", + "[77]\tvalid_0's ndcg@1: 0.9419\tvalid_0's ndcg@2: 0.97627\tvalid_0's ndcg@3: 0.97767\tvalid_0's ndcg@4: 0.97795\tvalid_0's ndcg@5: 0.977989\n", + "[78]\tvalid_0's ndcg@1: 0.94235\tvalid_0's ndcg@2: 0.976452\tvalid_0's ndcg@3: 0.977839\tvalid_0's ndcg@4: 0.978119\tvalid_0's ndcg@5: 0.978158\n", + "[79]\tvalid_0's ndcg@1: 0.94265\tvalid_0's ndcg@2: 0.976562\tvalid_0's ndcg@3: 0.977937\tvalid_0's ndcg@4: 0.978228\tvalid_0's ndcg@5: 0.978267\n", + "[80]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976667\tvalid_0's ndcg@3: 0.978067\tvalid_0's ndcg@4: 0.978347\tvalid_0's ndcg@5: 0.978385\n", + "[81]\tvalid_0's ndcg@1: 0.94305\tvalid_0's ndcg@2: 0.97671\tvalid_0's ndcg@3: 0.978098\tvalid_0's ndcg@4: 0.978378\tvalid_0's ndcg@5: 0.978416\n", + "[82]\tvalid_0's ndcg@1: 0.943175\tvalid_0's ndcg@2: 0.97674\tvalid_0's ndcg@3: 0.978115\tvalid_0's ndcg@4: 0.978417\tvalid_0's ndcg@5: 0.978456\n", + "[83]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976752\tvalid_0's ndcg@3: 0.97814\tvalid_0's ndcg@4: 0.978441\tvalid_0's ndcg@5: 0.97848\n", + "[84]\tvalid_0's ndcg@1: 0.943375\tvalid_0's ndcg@2: 0.976767\tvalid_0's ndcg@3: 0.978179\tvalid_0's ndcg@4: 0.978481\tvalid_0's ndcg@5: 0.97852\n", + "[85]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976721\tvalid_0's ndcg@3: 0.978146\tvalid_0's ndcg@4: 0.978437\tvalid_0's ndcg@5: 0.978475\n", + "[86]\tvalid_0's ndcg@1: 
0.9434\tvalid_0's ndcg@2: 0.976792\tvalid_0's ndcg@3: 0.978204\tvalid_0's ndcg@4: 0.978506\tvalid_0's ndcg@5: 0.978535\n", + "[87]\tvalid_0's ndcg@1: 0.943475\tvalid_0's ndcg@2: 0.976851\tvalid_0's ndcg@3: 0.978239\tvalid_0's ndcg@4: 0.97854\tvalid_0's ndcg@5: 0.978569\n", + "[88]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976882\tvalid_0's ndcg@3: 0.978282\tvalid_0's ndcg@4: 0.978572\tvalid_0's ndcg@5: 0.978611\n", + "[89]\tvalid_0's ndcg@1: 0.943775\tvalid_0's ndcg@2: 0.976915\tvalid_0's ndcg@3: 0.97834\tvalid_0's ndcg@4: 0.97863\tvalid_0's ndcg@5: 0.978669\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[90]\tvalid_0's ndcg@1: 0.943925\tvalid_0's ndcg@2: 0.976986\tvalid_0's ndcg@3: 0.978398\tvalid_0's ndcg@4: 0.978689\tvalid_0's ndcg@5: 0.978728\n", + "[91]\tvalid_0's ndcg@1: 0.943875\tvalid_0's ndcg@2: 0.976999\tvalid_0's ndcg@3: 0.978399\tvalid_0's ndcg@4: 0.978679\tvalid_0's ndcg@5: 0.978717\n", + "[92]\tvalid_0's ndcg@1: 0.94395\tvalid_0's ndcg@2: 0.977058\tvalid_0's ndcg@3: 0.978421\tvalid_0's ndcg@4: 0.978711\tvalid_0's ndcg@5: 0.97876\n", + "[93]\tvalid_0's ndcg@1: 0.944075\tvalid_0's ndcg@2: 0.977104\tvalid_0's ndcg@3: 0.978479\tvalid_0's ndcg@4: 0.978759\tvalid_0's ndcg@5: 0.978807\n", + "[94]\tvalid_0's ndcg@1: 0.944175\tvalid_0's ndcg@2: 0.977125\tvalid_0's ndcg@3: 0.978513\tvalid_0's ndcg@4: 0.978793\tvalid_0's ndcg@5: 0.978841\n", + "[95]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.977153\tvalid_0's ndcg@3: 0.97854\tvalid_0's ndcg@4: 0.97882\tvalid_0's ndcg@5: 0.978869\n", + "[96]\tvalid_0's ndcg@1: 0.944225\tvalid_0's ndcg@2: 0.977144\tvalid_0's ndcg@3: 0.978531\tvalid_0's ndcg@4: 0.978811\tvalid_0's ndcg@5: 0.97886\n", + "[97]\tvalid_0's ndcg@1: 0.94435\tvalid_0's ndcg@2: 0.977221\tvalid_0's ndcg@3: 0.978584\tvalid_0's ndcg@4: 0.978864\tvalid_0's ndcg@5: 0.978912\n", + "[98]\tvalid_0's ndcg@1: 0.944575\tvalid_0's ndcg@2: 0.977289\tvalid_0's ndcg@3: 0.978651\tvalid_0's ndcg@4: 0.978942\tvalid_0's ndcg@5: 
0.97899\n", + "[99]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.977341\tvalid_0's ndcg@3: 0.978691\tvalid_0's ndcg@4: 0.978993\tvalid_0's ndcg@5: 0.979032\n", + "[100]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977482\tvalid_0's ndcg@3: 0.978857\tvalid_0's ndcg@4: 0.979148\tvalid_0's ndcg@5: 0.979187\n", + "Did not meet early stopping. Best iteration is:\n", + "[100]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977482\tvalid_0's ndcg@3: 0.978857\tvalid_0's ndcg@4: 0.979148\tvalid_0's ndcg@5: 0.979187\n", + "[1]\tvalid_0's ndcg@1: 0.911575\tvalid_0's ndcg@2: 0.964384\tvalid_0's ndcg@3: 0.966321\tvalid_0's ndcg@4: 0.966623\tvalid_0's ndcg@5: 0.966671\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's ndcg@1: 0.9136\tvalid_0's ndcg@2: 0.965257\tvalid_0's ndcg@3: 0.967107\tvalid_0's ndcg@4: 0.967398\tvalid_0's ndcg@5: 0.967456\n", + "[3]\tvalid_0's ndcg@1: 0.917425\tvalid_0's ndcg@2: 0.966732\tvalid_0's ndcg@3: 0.968545\tvalid_0's ndcg@4: 0.968814\tvalid_0's ndcg@5: 0.968882\n", + "[4]\tvalid_0's ndcg@1: 0.9222\tvalid_0's ndcg@2: 0.968558\tvalid_0's ndcg@3: 0.970383\tvalid_0's ndcg@4: 0.970619\tvalid_0's ndcg@5: 0.970668\n", + "[5]\tvalid_0's ndcg@1: 0.925875\tvalid_0's ndcg@2: 0.969914\tvalid_0's ndcg@3: 0.971714\tvalid_0's ndcg@4: 0.971972\tvalid_0's ndcg@5: 0.972021\n", + "[6]\tvalid_0's ndcg@1: 0.926875\tvalid_0's ndcg@2: 0.970425\tvalid_0's ndcg@3: 0.972112\tvalid_0's ndcg@4: 0.972371\tvalid_0's ndcg@5: 0.972419\n", + "[7]\tvalid_0's ndcg@1: 0.927475\tvalid_0's ndcg@2: 0.970631\tvalid_0's ndcg@3: 0.972306\tvalid_0's ndcg@4: 0.972586\tvalid_0's ndcg@5: 0.972634\n", + "[8]\tvalid_0's ndcg@1: 0.93015\tvalid_0's ndcg@2: 0.971649\tvalid_0's ndcg@3: 0.973287\tvalid_0's ndcg@4: 0.973567\tvalid_0's ndcg@5: 0.973625\n", + "[9]\tvalid_0's ndcg@1: 0.9312\tvalid_0's ndcg@2: 0.972084\tvalid_0's ndcg@3: 0.973684\tvalid_0's ndcg@4: 0.973964\tvalid_0's ndcg@5: 0.974022\n", + "[10]\tvalid_0's ndcg@1: 0.93225\tvalid_0's ndcg@2: 
0.972456\tvalid_0's ndcg@3: 0.974081\tvalid_0's ndcg@4: 0.974361\tvalid_0's ndcg@5: 0.974409\n", + "[11]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.972704\tvalid_0's ndcg@3: 0.974379\tvalid_0's ndcg@4: 0.974648\tvalid_0's ndcg@5: 0.974696\n", + "[12]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972949\tvalid_0's ndcg@3: 0.974574\tvalid_0's ndcg@4: 0.974832\tvalid_0's ndcg@5: 0.974881\n", + "[13]\tvalid_0's ndcg@1: 0.93415\tvalid_0's ndcg@2: 0.97322\tvalid_0's ndcg@3: 0.97482\tvalid_0's ndcg@4: 0.975079\tvalid_0's ndcg@5: 0.975127\n", + "[14]\tvalid_0's ndcg@1: 0.9352\tvalid_0's ndcg@2: 0.973671\tvalid_0's ndcg@3: 0.975246\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975531\n", + "[15]\tvalid_0's ndcg@1: 0.9358\tvalid_0's ndcg@2: 0.973877\tvalid_0's ndcg@3: 0.975452\tvalid_0's ndcg@4: 0.975699\tvalid_0's ndcg@5: 0.975748\n", + "[16]\tvalid_0's ndcg@1: 0.935825\tvalid_0's ndcg@2: 0.973917\tvalid_0's ndcg@3: 0.975442\tvalid_0's ndcg@4: 0.975712\tvalid_0's ndcg@5: 0.97576\n", + "[17]\tvalid_0's ndcg@1: 0.936475\tvalid_0's ndcg@2: 0.97411\tvalid_0's ndcg@3: 0.975697\tvalid_0's ndcg@4: 0.975956\tvalid_0's ndcg@5: 0.975995\n", + "[18]\tvalid_0's ndcg@1: 0.936925\tvalid_0's ndcg@2: 0.974292\tvalid_0's ndcg@3: 0.975867\tvalid_0's ndcg@4: 0.976114\tvalid_0's ndcg@5: 0.976163\n", + "[19]\tvalid_0's ndcg@1: 0.937525\tvalid_0's ndcg@2: 0.974545\tvalid_0's ndcg@3: 0.976095\tvalid_0's ndcg@4: 0.976342\tvalid_0's ndcg@5: 0.976391\n", + "[20]\tvalid_0's ndcg@1: 0.937775\tvalid_0's ndcg@2: 0.974653\tvalid_0's ndcg@3: 0.976203\tvalid_0's ndcg@4: 0.976429\tvalid_0's ndcg@5: 0.976487\n", + "[21]\tvalid_0's ndcg@1: 0.938825\tvalid_0's ndcg@2: 0.975072\tvalid_0's ndcg@3: 0.976597\tvalid_0's ndcg@4: 0.976823\tvalid_0's ndcg@5: 0.976881\n", + "[22]\tvalid_0's ndcg@1: 0.93885\tvalid_0's ndcg@2: 0.975097\tvalid_0's ndcg@3: 0.976609\tvalid_0's ndcg@4: 0.976846\tvalid_0's ndcg@5: 0.976895\n", + "[23]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975246\tvalid_0's ndcg@3: 
0.976733\tvalid_0's ndcg@4: 0.976959\tvalid_0's ndcg@5: 0.977008\n", + "[24]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975246\tvalid_0's ndcg@3: 0.976721\tvalid_0's ndcg@4: 0.976947\tvalid_0's ndcg@5: 0.977005\n", + "[25]\tvalid_0's ndcg@1: 0.9396\tvalid_0's ndcg@2: 0.975421\tvalid_0's ndcg@3: 0.976909\tvalid_0's ndcg@4: 0.977124\tvalid_0's ndcg@5: 0.977182\n", + "[26]\tvalid_0's ndcg@1: 0.9393\tvalid_0's ndcg@2: 0.975342\tvalid_0's ndcg@3: 0.976804\tvalid_0's ndcg@4: 0.97702\tvalid_0's ndcg@5: 0.977078\n", + "[27]\tvalid_0's ndcg@1: 0.93925\tvalid_0's ndcg@2: 0.975323\tvalid_0's ndcg@3: 0.976798\tvalid_0's ndcg@4: 0.977014\tvalid_0's ndcg@5: 0.977062\n", + "[28]\tvalid_0's ndcg@1: 0.93925\tvalid_0's ndcg@2: 0.975308\tvalid_0's ndcg@3: 0.976783\tvalid_0's ndcg@4: 0.977009\tvalid_0's ndcg@5: 0.977057\n", + "[29]\tvalid_0's ndcg@1: 0.94\tvalid_0's ndcg@2: 0.975569\tvalid_0's ndcg@3: 0.977056\tvalid_0's ndcg@4: 0.977282\tvalid_0's ndcg@5: 0.977331\n", + "[30]\tvalid_0's ndcg@1: 0.940325\tvalid_0's ndcg@2: 0.975673\tvalid_0's ndcg@3: 0.977173\tvalid_0's ndcg@4: 0.977399\tvalid_0's ndcg@5: 0.977447\n", + "[31]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975731\tvalid_0's ndcg@3: 0.977243\tvalid_0's ndcg@4: 0.977469\tvalid_0's ndcg@5: 0.977518\n", + "[32]\tvalid_0's ndcg@1: 0.940625\tvalid_0's ndcg@2: 0.975831\tvalid_0's ndcg@3: 0.977306\tvalid_0's ndcg@4: 0.977521\tvalid_0's ndcg@5: 0.97757\n", + "[33]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975766\tvalid_0's ndcg@3: 0.977241\tvalid_0's ndcg@4: 0.977457\tvalid_0's ndcg@5: 0.977505\n", + "[34]\tvalid_0's ndcg@1: 0.940625\tvalid_0's ndcg@2: 0.975831\tvalid_0's ndcg@3: 0.977306\tvalid_0's ndcg@4: 0.977521\tvalid_0's ndcg@5: 0.97757\n", + "[35]\tvalid_0's ndcg@1: 0.940725\tvalid_0's ndcg@2: 0.975868\tvalid_0's ndcg@3: 0.977343\tvalid_0's ndcg@4: 0.977558\tvalid_0's ndcg@5: 0.977606\n", + "[36]\tvalid_0's ndcg@1: 0.94115\tvalid_0's ndcg@2: 0.976056\tvalid_0's ndcg@3: 0.977506\tvalid_0's ndcg@4: 
0.977722\tvalid_0's ndcg@5: 0.97777\n", + "[37]\tvalid_0's ndcg@1: 0.9414\tvalid_0's ndcg@2: 0.976133\tvalid_0's ndcg@3: 0.977595\tvalid_0's ndcg@4: 0.977811\tvalid_0's ndcg@5: 0.977859\n", + "[38]\tvalid_0's ndcg@1: 0.94175\tvalid_0's ndcg@2: 0.976278\tvalid_0's ndcg@3: 0.977715\tvalid_0's ndcg@4: 0.977941\tvalid_0's ndcg@5: 0.97799\n", + "[39]\tvalid_0's ndcg@1: 0.942075\tvalid_0's ndcg@2: 0.976366\tvalid_0's ndcg@3: 0.977841\tvalid_0's ndcg@4: 0.978056\tvalid_0's ndcg@5: 0.978105\n", + "[40]\tvalid_0's ndcg@1: 0.94215\tvalid_0's ndcg@2: 0.976409\tvalid_0's ndcg@3: 0.977872\tvalid_0's ndcg@4: 0.978087\tvalid_0's ndcg@5: 0.978136\n", + "[41]\tvalid_0's ndcg@1: 0.94245\tvalid_0's ndcg@2: 0.97652\tvalid_0's ndcg@3: 0.977983\tvalid_0's ndcg@4: 0.978198\tvalid_0's ndcg@5: 0.978246\n", + "[42]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976682\tvalid_0's ndcg@3: 0.97817\tvalid_0's ndcg@4: 0.978385\tvalid_0's ndcg@5: 0.978434\n", + "[43]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976682\tvalid_0's ndcg@3: 0.97817\tvalid_0's ndcg@4: 0.978385\tvalid_0's ndcg@5: 0.978434\n", + "[44]\tvalid_0's ndcg@1: 0.94285\tvalid_0's ndcg@2: 0.976636\tvalid_0's ndcg@3: 0.978111\tvalid_0's ndcg@4: 0.978337\tvalid_0's ndcg@5: 0.978386\n", + "[45]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.9768\tvalid_0's ndcg@3: 0.978262\tvalid_0's ndcg@4: 0.978488\tvalid_0's ndcg@5: 0.978537\n", + "[46]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976913\tvalid_0's ndcg@3: 0.978388\tvalid_0's ndcg@4: 0.978614\tvalid_0's ndcg@5: 0.978663\n", + "[47]\tvalid_0's ndcg@1: 0.943525\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.97836\tvalid_0's ndcg@4: 0.978576\tvalid_0's ndcg@5: 0.978634\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[48]\tvalid_0's ndcg@1: 0.943525\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.978373\tvalid_0's ndcg@4: 0.978577\tvalid_0's ndcg@5: 0.978636\n", + "[49]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976913\tvalid_0's ndcg@3: 
0.978388\tvalid_0's ndcg@4: 0.978614\tvalid_0's ndcg@5: 0.978663\n", + "[50]\tvalid_0's ndcg@1: 0.943975\tvalid_0's ndcg@2: 0.97702\tvalid_0's ndcg@3: 0.97852\tvalid_0's ndcg@4: 0.978746\tvalid_0's ndcg@5: 0.978794\n", + "[51]\tvalid_0's ndcg@1: 0.9441\tvalid_0's ndcg@2: 0.97705\tvalid_0's ndcg@3: 0.97855\tvalid_0's ndcg@4: 0.978787\tvalid_0's ndcg@5: 0.978836\n", + "[52]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.977121\tvalid_0's ndcg@3: 0.978609\tvalid_0's ndcg@4: 0.978846\tvalid_0's ndcg@5: 0.978894\n", + "[53]\tvalid_0's ndcg@1: 0.944225\tvalid_0's ndcg@2: 0.977081\tvalid_0's ndcg@3: 0.978618\tvalid_0's ndcg@4: 0.978834\tvalid_0's ndcg@5: 0.978882\n", + "[54]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.977071\tvalid_0's ndcg@3: 0.978609\tvalid_0's ndcg@4: 0.978824\tvalid_0's ndcg@5: 0.978873\n", + "[55]\tvalid_0's ndcg@1: 0.94435\tvalid_0's ndcg@2: 0.977143\tvalid_0's ndcg@3: 0.978668\tvalid_0's ndcg@4: 0.978883\tvalid_0's ndcg@5: 0.978931\n", + "[56]\tvalid_0's ndcg@1: 0.9444\tvalid_0's ndcg@2: 0.977177\tvalid_0's ndcg@3: 0.978702\tvalid_0's ndcg@4: 0.978906\tvalid_0's ndcg@5: 0.978955\n", + "[57]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.977263\tvalid_0's ndcg@3: 0.978788\tvalid_0's ndcg@4: 0.979003\tvalid_0's ndcg@5: 0.979051\n", + "[58]\tvalid_0's ndcg@1: 0.9448\tvalid_0's ndcg@2: 0.977293\tvalid_0's ndcg@3: 0.978843\tvalid_0's ndcg@4: 0.979047\tvalid_0's ndcg@5: 0.979096\n", + "[59]\tvalid_0's ndcg@1: 0.9452\tvalid_0's ndcg@2: 0.977472\tvalid_0's ndcg@3: 0.978997\tvalid_0's ndcg@4: 0.979202\tvalid_0's ndcg@5: 0.97925\n", + "[60]\tvalid_0's ndcg@1: 0.9455\tvalid_0's ndcg@2: 0.97763\tvalid_0's ndcg@3: 0.979118\tvalid_0's ndcg@4: 0.979322\tvalid_0's ndcg@5: 0.979371\n", + "[61]\tvalid_0's ndcg@1: 0.945725\tvalid_0's ndcg@2: 0.977682\tvalid_0's ndcg@3: 0.979194\tvalid_0's ndcg@4: 0.979399\tvalid_0's ndcg@5: 0.979447\n", + "[62]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977812\tvalid_0's ndcg@3: 0.979312\tvalid_0's ndcg@4: 
0.979495\tvalid_0's ndcg@5: 0.979543\n", + "[63]\tvalid_0's ndcg@1: 0.946\tvalid_0's ndcg@2: 0.977878\tvalid_0's ndcg@3: 0.97934\tvalid_0's ndcg@4: 0.979523\tvalid_0's ndcg@5: 0.979572\n", + "[64]\tvalid_0's ndcg@1: 0.946525\tvalid_0's ndcg@2: 0.978056\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 0.979714\tvalid_0's ndcg@5: 0.979762\n", + "[65]\tvalid_0's ndcg@1: 0.9467\tvalid_0's ndcg@2: 0.978105\tvalid_0's ndcg@3: 0.979592\tvalid_0's ndcg@4: 0.979775\tvalid_0's ndcg@5: 0.979823\n", + "[66]\tvalid_0's ndcg@1: 0.9465\tvalid_0's ndcg@2: 0.978046\tvalid_0's ndcg@3: 0.979534\tvalid_0's ndcg@4: 0.979706\tvalid_0's ndcg@5: 0.979755\n", + "[67]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.978127\tvalid_0's ndcg@3: 0.979614\tvalid_0's ndcg@4: 0.979776\tvalid_0's ndcg@5: 0.979824\n", + "[68]\tvalid_0's ndcg@1: 0.9467\tvalid_0's ndcg@2: 0.97812\tvalid_0's ndcg@3: 0.979608\tvalid_0's ndcg@4: 0.97978\tvalid_0's ndcg@5: 0.979828\n", + "[69]\tvalid_0's ndcg@1: 0.946875\tvalid_0's ndcg@2: 0.978216\tvalid_0's ndcg@3: 0.979679\tvalid_0's ndcg@4: 0.979851\tvalid_0's ndcg@5: 0.9799\n", + "[70]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.978194\tvalid_0's ndcg@3: 0.979682\tvalid_0's ndcg@4: 0.979854\tvalid_0's ndcg@5: 0.979902\n", + "[71]\tvalid_0's ndcg@1: 0.947025\tvalid_0's ndcg@2: 0.978209\tvalid_0's ndcg@3: 0.979721\tvalid_0's ndcg@4: 0.979893\tvalid_0's ndcg@5: 0.979942\n", + "[72]\tvalid_0's ndcg@1: 0.9472\tvalid_0's ndcg@2: 0.978273\tvalid_0's ndcg@3: 0.979773\tvalid_0's ndcg@4: 0.979956\tvalid_0's ndcg@5: 0.980005\n", + "[73]\tvalid_0's ndcg@1: 0.947475\tvalid_0's ndcg@2: 0.978391\tvalid_0's ndcg@3: 0.979878\tvalid_0's ndcg@4: 0.980061\tvalid_0's ndcg@5: 0.980109\n", + "[74]\tvalid_0's ndcg@1: 0.94715\tvalid_0's ndcg@2: 0.978271\tvalid_0's ndcg@3: 0.979758\tvalid_0's ndcg@4: 0.979941\tvalid_0's ndcg@5: 0.97999\n", + "[75]\tvalid_0's ndcg@1: 0.947275\tvalid_0's ndcg@2: 0.978333\tvalid_0's ndcg@3: 0.979808\tvalid_0's ndcg@4: 0.979991\tvalid_0's ndcg@5: 0.980039\n", + 
"[76]\tvalid_0's ndcg@1: 0.9474\tvalid_0's ndcg@2: 0.97841\tvalid_0's ndcg@3: 0.979873\tvalid_0's ndcg@4: 0.980045\tvalid_0's ndcg@5: 0.980093\n", + "[77]\tvalid_0's ndcg@1: 0.94745\tvalid_0's ndcg@2: 0.97846\tvalid_0's ndcg@3: 0.979898\tvalid_0's ndcg@4: 0.98007\tvalid_0's ndcg@5: 0.980118\n", + "[78]\tvalid_0's ndcg@1: 0.94775\tvalid_0's ndcg@2: 0.978555\tvalid_0's ndcg@3: 0.980005\tvalid_0's ndcg@4: 0.980177\tvalid_0's ndcg@5: 0.980226\n", + "[79]\tvalid_0's ndcg@1: 0.947875\tvalid_0's ndcg@2: 0.978617\tvalid_0's ndcg@3: 0.980055\tvalid_0's ndcg@4: 0.980238\tvalid_0's ndcg@5: 0.980276\n", + "[80]\tvalid_0's ndcg@1: 0.947875\tvalid_0's ndcg@2: 0.978617\tvalid_0's ndcg@3: 0.980055\tvalid_0's ndcg@4: 0.980238\tvalid_0's ndcg@5: 0.980276\n", + "[81]\tvalid_0's ndcg@1: 0.948175\tvalid_0's ndcg@2: 0.978744\tvalid_0's ndcg@3: 0.980169\tvalid_0's ndcg@4: 0.980352\tvalid_0's ndcg@5: 0.98039\n", + "[82]\tvalid_0's ndcg@1: 0.948375\tvalid_0's ndcg@2: 0.97888\tvalid_0's ndcg@3: 0.980255\tvalid_0's ndcg@4: 0.980438\tvalid_0's ndcg@5: 0.980477\n", + "[83]\tvalid_0's ndcg@1: 0.94825\tvalid_0's ndcg@2: 0.978834\tvalid_0's ndcg@3: 0.980209\tvalid_0's ndcg@4: 0.980392\tvalid_0's ndcg@5: 0.980431\n", + "[84]\tvalid_0's ndcg@1: 0.948275\tvalid_0's ndcg@2: 0.978844\tvalid_0's ndcg@3: 0.980219\tvalid_0's ndcg@4: 0.980402\tvalid_0's ndcg@5: 0.98044\n", + "[85]\tvalid_0's ndcg@1: 0.948475\tvalid_0's ndcg@2: 0.978917\tvalid_0's ndcg@3: 0.980292\tvalid_0's ndcg@4: 0.980475\tvalid_0's ndcg@5: 0.980514\n", + "[86]\tvalid_0's ndcg@1: 0.948975\tvalid_0's ndcg@2: 0.979102\tvalid_0's ndcg@3: 0.980477\tvalid_0's ndcg@4: 0.98066\tvalid_0's ndcg@5: 0.980699\n", + "[87]\tvalid_0's ndcg@1: 0.948975\tvalid_0's ndcg@2: 0.979086\tvalid_0's ndcg@3: 0.980474\tvalid_0's ndcg@4: 0.980657\tvalid_0's ndcg@5: 0.980695\n", + "[88]\tvalid_0's ndcg@1: 0.949025\tvalid_0's ndcg@2: 0.979136\tvalid_0's ndcg@3: 0.980499\tvalid_0's ndcg@4: 0.980682\tvalid_0's ndcg@5: 0.98072\n", + "[89]\tvalid_0's ndcg@1: 
0.9493\tvalid_0's ndcg@2: 0.979285\tvalid_0's ndcg@3: 0.98061\tvalid_0's ndcg@4: 0.980793\tvalid_0's ndcg@5: 0.980832\n", + "[90]\tvalid_0's ndcg@1: 0.9493\tvalid_0's ndcg@2: 0.979269\tvalid_0's ndcg@3: 0.980607\tvalid_0's ndcg@4: 0.98079\tvalid_0's ndcg@5: 0.980828\n", + "[91]\tvalid_0's ndcg@1: 0.9493\tvalid_0's ndcg@2: 0.979269\tvalid_0's ndcg@3: 0.980607\tvalid_0's ndcg@4: 0.98079\tvalid_0's ndcg@5: 0.980828\n", + "[92]\tvalid_0's ndcg@1: 0.9494\tvalid_0's ndcg@2: 0.97929\tvalid_0's ndcg@3: 0.98064\tvalid_0's ndcg@4: 0.980823\tvalid_0's ndcg@5: 0.980862\n", + "[93]\tvalid_0's ndcg@1: 0.949375\tvalid_0's ndcg@2: 0.979297\tvalid_0's ndcg@3: 0.980634\tvalid_0's ndcg@4: 0.980817\tvalid_0's ndcg@5: 0.980856\n", + "[94]\tvalid_0's ndcg@1: 0.949525\tvalid_0's ndcg@2: 0.979336\tvalid_0's ndcg@3: 0.980686\tvalid_0's ndcg@4: 0.980869\tvalid_0's ndcg@5: 0.980908\n", + "[95]\tvalid_0's ndcg@1: 0.949825\tvalid_0's ndcg@2: 0.979416\tvalid_0's ndcg@3: 0.980791\tvalid_0's ndcg@4: 0.980974\tvalid_0's ndcg@5: 0.981012\n", + "[96]\tvalid_0's ndcg@1: 0.94975\tvalid_0's ndcg@2: 0.979404\tvalid_0's ndcg@3: 0.980779\tvalid_0's ndcg@4: 0.980951\tvalid_0's ndcg@5: 0.98099\n", + "[97]\tvalid_0's ndcg@1: 0.950025\tvalid_0's ndcg@2: 0.979537\tvalid_0's ndcg@3: 0.980874\tvalid_0's ndcg@4: 0.981057\tvalid_0's ndcg@5: 0.981096\n", + "[98]\tvalid_0's ndcg@1: 0.9501\tvalid_0's ndcg@2: 0.979564\tvalid_0's ndcg@3: 0.980889\tvalid_0's ndcg@4: 0.981083\tvalid_0's ndcg@5: 0.981122\n", + "[99]\tvalid_0's ndcg@1: 0.950275\tvalid_0's ndcg@2: 0.979629\tvalid_0's ndcg@3: 0.980967\tvalid_0's ndcg@4: 0.98115\tvalid_0's ndcg@5: 0.981188\n", + "[100]\tvalid_0's ndcg@1: 0.950325\tvalid_0's ndcg@2: 0.979647\tvalid_0's ndcg@3: 0.980985\tvalid_0's ndcg@4: 0.981168\tvalid_0's ndcg@5: 0.981207\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's ndcg@1: 0.950325\tvalid_0's ndcg@2: 0.979647\tvalid_0's ndcg@3: 0.980985\tvalid_0's ndcg@4: 0.981168\tvalid_0's ndcg@5: 0.981207\n", + "[1]\tvalid_0's ndcg@1: 0.910175\tvalid_0's ndcg@2: 0.96382\tvalid_0's ndcg@3: 0.965707\tvalid_0's ndcg@4: 0.966009\tvalid_0's ndcg@5: 0.966086\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's ndcg@1: 0.91415\tvalid_0's ndcg@2: 0.965492\tvalid_0's ndcg@3: 0.967254\tvalid_0's ndcg@4: 0.967556\tvalid_0's ndcg@5: 0.967604\n", + "[3]\tvalid_0's ndcg@1: 0.916025\tvalid_0's ndcg@2: 0.966389\tvalid_0's ndcg@3: 0.967976\tvalid_0's ndcg@4: 0.968278\tvalid_0's ndcg@5: 0.968355\n", + "[4]\tvalid_0's ndcg@1: 0.919\tvalid_0's ndcg@2: 0.967392\tvalid_0's ndcg@3: 0.96903\tvalid_0's ndcg@4: 0.969364\tvalid_0's ndcg@5: 0.969431\n", + "[5]\tvalid_0's ndcg@1: 0.921125\tvalid_0's ndcg@2: 0.968192\tvalid_0's ndcg@3: 0.969855\tvalid_0's ndcg@4: 0.970156\tvalid_0's ndcg@5: 0.970224\n", + "[6]\tvalid_0's ndcg@1: 0.921675\tvalid_0's ndcg@2: 0.968411\tvalid_0's ndcg@3: 0.970111\tvalid_0's ndcg@4: 0.97037\tvalid_0's ndcg@5: 0.970437\n", + "[7]\tvalid_0's ndcg@1: 0.9237\tvalid_0's ndcg@2: 0.969332\tvalid_0's ndcg@3: 0.970882\tvalid_0's ndcg@4: 0.97113\tvalid_0's ndcg@5: 0.971217\n", + "[8]\tvalid_0's ndcg@1: 0.925775\tvalid_0's ndcg@2: 0.970129\tvalid_0's ndcg@3: 0.971642\tvalid_0's ndcg@4: 0.971922\tvalid_0's ndcg@5: 0.97199\n", + "[9]\tvalid_0's ndcg@1: 0.926775\tvalid_0's ndcg@2: 0.970435\tvalid_0's ndcg@3: 0.971985\tvalid_0's ndcg@4: 0.972276\tvalid_0's ndcg@5: 0.972334\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[10]\tvalid_0's ndcg@1: 0.9277\tvalid_0's ndcg@2: 0.970761\tvalid_0's ndcg@3: 0.972311\tvalid_0's ndcg@4: 0.972612\tvalid_0's ndcg@5: 0.97267\n", + "[11]\tvalid_0's ndcg@1: 0.928975\tvalid_0's ndcg@2: 0.97131\tvalid_0's ndcg@3: 0.972798\tvalid_0's ndcg@4: 0.973089\tvalid_0's ndcg@5: 0.973166\n", + "[12]\tvalid_0's ndcg@1: 
0.929375\tvalid_0's ndcg@2: 0.971505\tvalid_0's ndcg@3: 0.972968\tvalid_0's ndcg@4: 0.973259\tvalid_0's ndcg@5: 0.973326\n", + "[13]\tvalid_0's ndcg@1: 0.929375\tvalid_0's ndcg@2: 0.971426\tvalid_0's ndcg@3: 0.972939\tvalid_0's ndcg@4: 0.97324\tvalid_0's ndcg@5: 0.973318\n", + "[14]\tvalid_0's ndcg@1: 0.929775\tvalid_0's ndcg@2: 0.971621\tvalid_0's ndcg@3: 0.973121\tvalid_0's ndcg@4: 0.973412\tvalid_0's ndcg@5: 0.97348\n", + "[15]\tvalid_0's ndcg@1: 0.9304\tvalid_0's ndcg@2: 0.971868\tvalid_0's ndcg@3: 0.97338\tvalid_0's ndcg@4: 0.97365\tvalid_0's ndcg@5: 0.973717\n", + "[16]\tvalid_0's ndcg@1: 0.930975\tvalid_0's ndcg@2: 0.972096\tvalid_0's ndcg@3: 0.973558\tvalid_0's ndcg@4: 0.973849\tvalid_0's ndcg@5: 0.973926\n", + "[17]\tvalid_0's ndcg@1: 0.93105\tvalid_0's ndcg@2: 0.972108\tvalid_0's ndcg@3: 0.973583\tvalid_0's ndcg@4: 0.973884\tvalid_0's ndcg@5: 0.973952\n", + "[18]\tvalid_0's ndcg@1: 0.931725\tvalid_0's ndcg@2: 0.972373\tvalid_0's ndcg@3: 0.97386\tvalid_0's ndcg@4: 0.974129\tvalid_0's ndcg@5: 0.974207\n", + "[19]\tvalid_0's ndcg@1: 0.932175\tvalid_0's ndcg@2: 0.972681\tvalid_0's ndcg@3: 0.974068\tvalid_0's ndcg@4: 0.974348\tvalid_0's ndcg@5: 0.974406\n", + "[20]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.973019\tvalid_0's ndcg@3: 0.974382\tvalid_0's ndcg@4: 0.974673\tvalid_0's ndcg@5: 0.974731\n", + "[21]\tvalid_0's ndcg@1: 0.933075\tvalid_0's ndcg@2: 0.97306\tvalid_0's ndcg@3: 0.974423\tvalid_0's ndcg@4: 0.974703\tvalid_0's ndcg@5: 0.97477\n", + "[22]\tvalid_0's ndcg@1: 0.93375\tvalid_0's ndcg@2: 0.973262\tvalid_0's ndcg@3: 0.974649\tvalid_0's ndcg@4: 0.974929\tvalid_0's ndcg@5: 0.975007\n", + "[23]\tvalid_0's ndcg@1: 0.933675\tvalid_0's ndcg@2: 0.973219\tvalid_0's ndcg@3: 0.974606\tvalid_0's ndcg@4: 0.974886\tvalid_0's ndcg@5: 0.974973\n", + "[24]\tvalid_0's ndcg@1: 0.934\tvalid_0's ndcg@2: 0.97337\tvalid_0's ndcg@3: 0.974745\tvalid_0's ndcg@4: 0.975014\tvalid_0's ndcg@5: 0.975101\n", + "[25]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 
0.973674\tvalid_0's ndcg@3: 0.975062\tvalid_0's ndcg@4: 0.975342\tvalid_0's ndcg@5: 0.97541\n", + "[26]\tvalid_0's ndcg@1: 0.93495\tvalid_0's ndcg@2: 0.973721\tvalid_0's ndcg@3: 0.975096\tvalid_0's ndcg@4: 0.975365\tvalid_0's ndcg@5: 0.975452\n", + "[27]\tvalid_0's ndcg@1: 0.9358\tvalid_0's ndcg@2: 0.974082\tvalid_0's ndcg@3: 0.975444\tvalid_0's ndcg@4: 0.975713\tvalid_0's ndcg@5: 0.975781\n", + "[28]\tvalid_0's ndcg@1: 0.935325\tvalid_0's ndcg@2: 0.973875\tvalid_0's ndcg@3: 0.975275\tvalid_0's ndcg@4: 0.975512\tvalid_0's ndcg@5: 0.975599\n", + "[29]\tvalid_0's ndcg@1: 0.935925\tvalid_0's ndcg@2: 0.974159\tvalid_0's ndcg@3: 0.975522\tvalid_0's ndcg@4: 0.975759\tvalid_0's ndcg@5: 0.975836\n", + "[30]\tvalid_0's ndcg@1: 0.9362\tvalid_0's ndcg@2: 0.974214\tvalid_0's ndcg@3: 0.975589\tvalid_0's ndcg@4: 0.975847\tvalid_0's ndcg@5: 0.975924\n", + "[31]\tvalid_0's ndcg@1: 0.93625\tvalid_0's ndcg@2: 0.974216\tvalid_0's ndcg@3: 0.975629\tvalid_0's ndcg@4: 0.975876\tvalid_0's ndcg@5: 0.975944\n", + "[32]\tvalid_0's ndcg@1: 0.93665\tvalid_0's ndcg@2: 0.974427\tvalid_0's ndcg@3: 0.975814\tvalid_0's ndcg@4: 0.97603\tvalid_0's ndcg@5: 0.976107\n", + "[33]\tvalid_0's ndcg@1: 0.936775\tvalid_0's ndcg@2: 0.974505\tvalid_0's ndcg@3: 0.975855\tvalid_0's ndcg@4: 0.976081\tvalid_0's ndcg@5: 0.976158\n", + "[34]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.974643\tvalid_0's ndcg@3: 0.975993\tvalid_0's ndcg@4: 0.976219\tvalid_0's ndcg@5: 0.976296\n", + "[35]\tvalid_0's ndcg@1: 0.937675\tvalid_0's ndcg@2: 0.974805\tvalid_0's ndcg@3: 0.97618\tvalid_0's ndcg@4: 0.976406\tvalid_0's ndcg@5: 0.976484\n", + "[36]\tvalid_0's ndcg@1: 0.9382\tvalid_0's ndcg@2: 0.974983\tvalid_0's ndcg@3: 0.976371\tvalid_0's ndcg@4: 0.976597\tvalid_0's ndcg@5: 0.976674\n", + "[37]\tvalid_0's ndcg@1: 0.938175\tvalid_0's ndcg@2: 0.974974\tvalid_0's ndcg@3: 0.976349\tvalid_0's ndcg@4: 0.976586\tvalid_0's ndcg@5: 0.976663\n", + "[38]\tvalid_0's ndcg@1: 0.938675\tvalid_0's ndcg@2: 0.975143\tvalid_0's ndcg@3: 
0.976518\tvalid_0's ndcg@4: 0.976776\tvalid_0's ndcg@5: 0.976844\n", + "[39]\tvalid_0's ndcg@1: 0.938575\tvalid_0's ndcg@2: 0.975106\tvalid_0's ndcg@3: 0.976481\tvalid_0's ndcg@4: 0.976739\tvalid_0's ndcg@5: 0.976807\n", + "[40]\tvalid_0's ndcg@1: 0.938675\tvalid_0's ndcg@2: 0.97519\tvalid_0's ndcg@3: 0.976528\tvalid_0's ndcg@4: 0.976775\tvalid_0's ndcg@5: 0.976853\n", + "[41]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.975347\tvalid_0's ndcg@3: 0.976697\tvalid_0's ndcg@4: 0.976934\tvalid_0's ndcg@5: 0.977001\n", + "[42]\tvalid_0's ndcg@1: 0.939825\tvalid_0's ndcg@2: 0.975599\tvalid_0's ndcg@3: 0.976961\tvalid_0's ndcg@4: 0.977198\tvalid_0's ndcg@5: 0.977266\n", + "[43]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.975639\tvalid_0's ndcg@3: 0.976977\tvalid_0's ndcg@4: 0.977214\tvalid_0's ndcg@5: 0.977282\n", + "[44]\tvalid_0's ndcg@1: 0.9398\tvalid_0's ndcg@2: 0.975605\tvalid_0's ndcg@3: 0.976955\tvalid_0's ndcg@4: 0.977192\tvalid_0's ndcg@5: 0.97726\n", + "[45]\tvalid_0's ndcg@1: 0.9401\tvalid_0's ndcg@2: 0.9757\tvalid_0's ndcg@3: 0.977075\tvalid_0's ndcg@4: 0.977291\tvalid_0's ndcg@5: 0.977368\n", + "[46]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975845\tvalid_0's ndcg@3: 0.977183\tvalid_0's ndcg@4: 0.97742\tvalid_0's ndcg@5: 0.977497\n", + "[47]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.975854\tvalid_0's ndcg@3: 0.977204\tvalid_0's ndcg@4: 0.97743\tvalid_0's ndcg@5: 0.977508\n", + "[48]\tvalid_0's ndcg@1: 0.940575\tvalid_0's ndcg@2: 0.975923\tvalid_0's ndcg@3: 0.977273\tvalid_0's ndcg@4: 0.977488\tvalid_0's ndcg@5: 0.977556\n", + "[49]\tvalid_0's ndcg@1: 0.9407\tvalid_0's ndcg@2: 0.975922\tvalid_0's ndcg@3: 0.977297\tvalid_0's ndcg@4: 0.977501\tvalid_0's ndcg@5: 0.977588\n", + "[50]\tvalid_0's ndcg@1: 0.940725\tvalid_0's ndcg@2: 0.975947\tvalid_0's ndcg@3: 0.977322\tvalid_0's ndcg@4: 0.977505\tvalid_0's ndcg@5: 0.977592\n", + "[51]\tvalid_0's ndcg@1: 0.9406\tvalid_0's ndcg@2: 0.975837\tvalid_0's ndcg@3: 0.97725\tvalid_0's ndcg@4: 
0.977422\tvalid_0's ndcg@5: 0.977509\n", + "[52]\tvalid_0's ndcg@1: 0.941075\tvalid_0's ndcg@2: 0.975997\tvalid_0's ndcg@3: 0.977422\tvalid_0's ndcg@4: 0.977594\tvalid_0's ndcg@5: 0.977691\n", + "[53]\tvalid_0's ndcg@1: 0.940925\tvalid_0's ndcg@2: 0.975989\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.977538\tvalid_0's ndcg@5: 0.977644\n", + "[54]\tvalid_0's ndcg@1: 0.94125\tvalid_0's ndcg@2: 0.976062\tvalid_0's ndcg@3: 0.977487\tvalid_0's ndcg@4: 0.977659\tvalid_0's ndcg@5: 0.977756\n", + "[55]\tvalid_0's ndcg@1: 0.94145\tvalid_0's ndcg@2: 0.976183\tvalid_0's ndcg@3: 0.97757\tvalid_0's ndcg@4: 0.977742\tvalid_0's ndcg@5: 0.977839\n", + "[56]\tvalid_0's ndcg@1: 0.941475\tvalid_0's ndcg@2: 0.976176\tvalid_0's ndcg@3: 0.977576\tvalid_0's ndcg@4: 0.977748\tvalid_0's ndcg@5: 0.977845\n", + "[57]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.976139\tvalid_0's ndcg@3: 0.977539\tvalid_0's ndcg@4: 0.977712\tvalid_0's ndcg@5: 0.977808\n", + "[58]\tvalid_0's ndcg@1: 0.941675\tvalid_0's ndcg@2: 0.97625\tvalid_0's ndcg@3: 0.97765\tvalid_0's ndcg@4: 0.977822\tvalid_0's ndcg@5: 0.977919\n", + "[59]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.976253\tvalid_0's ndcg@3: 0.977653\tvalid_0's ndcg@4: 0.977836\tvalid_0's ndcg@5: 0.977932\n", + "[60]\tvalid_0's ndcg@1: 0.941675\tvalid_0's ndcg@2: 0.976234\tvalid_0's ndcg@3: 0.977634\tvalid_0's ndcg@4: 0.977817\tvalid_0's ndcg@5: 0.977914\n", + "[61]\tvalid_0's ndcg@1: 0.9419\tvalid_0's ndcg@2: 0.976333\tvalid_0's ndcg@3: 0.977745\tvalid_0's ndcg@4: 0.977918\tvalid_0's ndcg@5: 0.978005\n", + "[62]\tvalid_0's ndcg@1: 0.941975\tvalid_0's ndcg@2: 0.976345\tvalid_0's ndcg@3: 0.977757\tvalid_0's ndcg@4: 0.97794\tvalid_0's ndcg@5: 0.978027\n", + "[63]\tvalid_0's ndcg@1: 0.9423\tvalid_0's ndcg@2: 0.976496\tvalid_0's ndcg@3: 0.977871\tvalid_0's ndcg@4: 0.978065\tvalid_0's ndcg@5: 0.978152\n", + "[64]\tvalid_0's ndcg@1: 0.942625\tvalid_0's ndcg@2: 0.976632\tvalid_0's ndcg@3: 0.977995\tvalid_0's ndcg@4: 0.978188\tvalid_0's ndcg@5: 
0.978275\n", + "[65]\tvalid_0's ndcg@1: 0.942575\tvalid_0's ndcg@2: 0.976629\tvalid_0's ndcg@3: 0.977979\tvalid_0's ndcg@4: 0.978173\tvalid_0's ndcg@5: 0.97826\n", + "[66]\tvalid_0's ndcg@1: 0.942725\tvalid_0's ndcg@2: 0.976685\tvalid_0's ndcg@3: 0.978035\tvalid_0's ndcg@4: 0.978229\tvalid_0's ndcg@5: 0.978316\n", + "[67]\tvalid_0's ndcg@1: 0.94275\tvalid_0's ndcg@2: 0.976678\tvalid_0's ndcg@3: 0.978041\tvalid_0's ndcg@4: 0.978224\tvalid_0's ndcg@5: 0.97832\n", + "[68]\tvalid_0's ndcg@1: 0.94275\tvalid_0's ndcg@2: 0.976694\tvalid_0's ndcg@3: 0.978044\tvalid_0's ndcg@4: 0.978227\tvalid_0's ndcg@5: 0.978324\n", + "[69]\tvalid_0's ndcg@1: 0.943\tvalid_0's ndcg@2: 0.976834\tvalid_0's ndcg@3: 0.978146\tvalid_0's ndcg@4: 0.978329\tvalid_0's ndcg@5: 0.978426\n", + "[70]\tvalid_0's ndcg@1: 0.943025\tvalid_0's ndcg@2: 0.976827\tvalid_0's ndcg@3: 0.978152\tvalid_0's ndcg@4: 0.978324\tvalid_0's ndcg@5: 0.978431\n", + "[71]\tvalid_0's ndcg@1: 0.9432\tvalid_0's ndcg@2: 0.976923\tvalid_0's ndcg@3: 0.978236\tvalid_0's ndcg@4: 0.978397\tvalid_0's ndcg@5: 0.978504\n", + "[72]\tvalid_0's ndcg@1: 0.943225\tvalid_0's ndcg@2: 0.976917\tvalid_0's ndcg@3: 0.978254\tvalid_0's ndcg@4: 0.978405\tvalid_0's ndcg@5: 0.978511\n", + "[73]\tvalid_0's ndcg@1: 0.94315\tvalid_0's ndcg@2: 0.976936\tvalid_0's ndcg@3: 0.978236\tvalid_0's ndcg@4: 0.978409\tvalid_0's ndcg@5: 0.978496\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[74]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976957\tvalid_0's ndcg@3: 0.97827\tvalid_0's ndcg@4: 0.978431\tvalid_0's ndcg@5: 0.978528\n", + "[75]\tvalid_0's ndcg@1: 0.943075\tvalid_0's ndcg@2: 0.976861\tvalid_0's ndcg@3: 0.978199\tvalid_0's ndcg@4: 0.97836\tvalid_0's ndcg@5: 0.978457\n", + "[76]\tvalid_0's ndcg@1: 0.94335\tvalid_0's ndcg@2: 0.976963\tvalid_0's ndcg@3: 0.978288\tvalid_0's ndcg@4: 0.978471\tvalid_0's ndcg@5: 0.978568\n", + "[77]\tvalid_0's ndcg@1: 0.94345\tvalid_0's ndcg@2: 0.977031\tvalid_0's ndcg@3: 0.978331\tvalid_0's 
ndcg@4: 0.978514\tvalid_0's ndcg@5: 0.978611\n", + "[78]\tvalid_0's ndcg@1: 0.943475\tvalid_0's ndcg@2: 0.977088\tvalid_0's ndcg@3: 0.97835\tvalid_0's ndcg@4: 0.978533\tvalid_0's ndcg@5: 0.97863\n", + "[79]\tvalid_0's ndcg@1: 0.943625\tvalid_0's ndcg@2: 0.977096\tvalid_0's ndcg@3: 0.978396\tvalid_0's ndcg@4: 0.978579\tvalid_0's ndcg@5: 0.978676\n", + "[80]\tvalid_0's ndcg@1: 0.943825\tvalid_0's ndcg@2: 0.977154\tvalid_0's ndcg@3: 0.978479\tvalid_0's ndcg@4: 0.978651\tvalid_0's ndcg@5: 0.978748\n", + "[81]\tvalid_0's ndcg@1: 0.943775\tvalid_0's ndcg@2: 0.977135\tvalid_0's ndcg@3: 0.97846\tvalid_0's ndcg@4: 0.978633\tvalid_0's ndcg@5: 0.978729\n", + "[82]\tvalid_0's ndcg@1: 0.9443\tvalid_0's ndcg@2: 0.977361\tvalid_0's ndcg@3: 0.978673\tvalid_0's ndcg@4: 0.978845\tvalid_0's ndcg@5: 0.978933\n", + "[83]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.977324\tvalid_0's ndcg@3: 0.978624\tvalid_0's ndcg@4: 0.978796\tvalid_0's ndcg@5: 0.978893\n", + "[84]\tvalid_0's ndcg@1: 0.94405\tvalid_0's ndcg@2: 0.977253\tvalid_0's ndcg@3: 0.978565\tvalid_0's ndcg@4: 0.978737\tvalid_0's ndcg@5: 0.978834\n", + "[85]\tvalid_0's ndcg@1: 0.944175\tvalid_0's ndcg@2: 0.977283\tvalid_0's ndcg@3: 0.978633\tvalid_0's ndcg@4: 0.978795\tvalid_0's ndcg@5: 0.978882\n", + "[86]\tvalid_0's ndcg@1: 0.9445\tvalid_0's ndcg@2: 0.97745\tvalid_0's ndcg@3: 0.978763\tvalid_0's ndcg@4: 0.978924\tvalid_0's ndcg@5: 0.979011\n", + "[87]\tvalid_0's ndcg@1: 0.9445\tvalid_0's ndcg@2: 0.977419\tvalid_0's ndcg@3: 0.978756\tvalid_0's ndcg@4: 0.978918\tvalid_0's ndcg@5: 0.979005\n", + "[88]\tvalid_0's ndcg@1: 0.944825\tvalid_0's ndcg@2: 0.977554\tvalid_0's ndcg@3: 0.978867\tvalid_0's ndcg@4: 0.979039\tvalid_0's ndcg@5: 0.979126\n", + "[89]\tvalid_0's ndcg@1: 0.9454\tvalid_0's ndcg@2: 0.977767\tvalid_0's ndcg@3: 0.979079\tvalid_0's ndcg@4: 0.979262\tvalid_0's ndcg@5: 0.97934\n", + "[90]\tvalid_0's ndcg@1: 0.945375\tvalid_0's ndcg@2: 0.977773\tvalid_0's ndcg@3: 0.979073\tvalid_0's ndcg@4: 0.979256\tvalid_0's ndcg@5: 
0.979334\n", + "[91]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977792\tvalid_0's ndcg@3: 0.979092\tvalid_0's ndcg@4: 0.979275\tvalid_0's ndcg@5: 0.979352\n", + "[92]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977776\tvalid_0's ndcg@3: 0.979088\tvalid_0's ndcg@4: 0.979261\tvalid_0's ndcg@5: 0.979348\n", + "[93]\tvalid_0's ndcg@1: 0.945375\tvalid_0's ndcg@2: 0.977757\tvalid_0's ndcg@3: 0.979082\tvalid_0's ndcg@4: 0.979244\tvalid_0's ndcg@5: 0.979331\n", + "[94]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977761\tvalid_0's ndcg@3: 0.979061\tvalid_0's ndcg@4: 0.979223\tvalid_0's ndcg@5: 0.97931\n", + "[95]\tvalid_0's ndcg@1: 0.9454\tvalid_0's ndcg@2: 0.977798\tvalid_0's ndcg@3: 0.979086\tvalid_0's ndcg@4: 0.979258\tvalid_0's ndcg@5: 0.979345\n", + "[96]\tvalid_0's ndcg@1: 0.945825\tvalid_0's ndcg@2: 0.977955\tvalid_0's ndcg@3: 0.97923\tvalid_0's ndcg@4: 0.979413\tvalid_0's ndcg@5: 0.9795\n", + "[97]\tvalid_0's ndcg@1: 0.945925\tvalid_0's ndcg@2: 0.97796\tvalid_0's ndcg@3: 0.97926\tvalid_0's ndcg@4: 0.979443\tvalid_0's ndcg@5: 0.979531\n", + "[98]\tvalid_0's ndcg@1: 0.9464\tvalid_0's ndcg@2: 0.97812\tvalid_0's ndcg@3: 0.97942\tvalid_0's ndcg@4: 0.979625\tvalid_0's ndcg@5: 0.979702\n", + "[99]\tvalid_0's ndcg@1: 0.94655\tvalid_0's ndcg@2: 0.978191\tvalid_0's ndcg@3: 0.979479\tvalid_0's ndcg@4: 0.979683\tvalid_0's ndcg@5: 0.97977\n", + "[100]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.978244\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 0.979725\tvalid_0's ndcg@5: 0.979812\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.978244\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 0.979725\tvalid_0's ndcg@5: 0.979812\n", + "[1]\tvalid_0's ndcg@1: 0.910175\tvalid_0's ndcg@2: 0.963031\tvalid_0's ndcg@3: 0.965281\tvalid_0's ndcg@4: 0.965819\tvalid_0's ndcg@5: 0.965887\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's ndcg@1: 0.9141\tvalid_0's ndcg@2: 0.964748\tvalid_0's ndcg@3: 0.96681\tvalid_0's ndcg@4: 0.967316\tvalid_0's ndcg@5: 0.967394\n", + "[3]\tvalid_0's ndcg@1: 0.915925\tvalid_0's ndcg@2: 0.9655\tvalid_0's ndcg@3: 0.967575\tvalid_0's ndcg@4: 0.968028\tvalid_0's ndcg@5: 0.968105\n", + "[4]\tvalid_0's ndcg@1: 0.91915\tvalid_0's ndcg@2: 0.966943\tvalid_0's ndcg@3: 0.968968\tvalid_0's ndcg@4: 0.969334\tvalid_0's ndcg@5: 0.969373\n", + "[5]\tvalid_0's ndcg@1: 0.920625\tvalid_0's ndcg@2: 0.967598\tvalid_0's ndcg@3: 0.969498\tvalid_0's ndcg@4: 0.969896\tvalid_0's ndcg@5: 0.969944\n", + "[6]\tvalid_0's ndcg@1: 0.922625\tvalid_0's ndcg@2: 0.968336\tvalid_0's ndcg@3: 0.970261\tvalid_0's ndcg@4: 0.970659\tvalid_0's ndcg@5: 0.970688\n", + "[7]\tvalid_0's ndcg@1: 0.923625\tvalid_0's ndcg@2: 0.968768\tvalid_0's ndcg@3: 0.970656\tvalid_0's ndcg@4: 0.971043\tvalid_0's ndcg@5: 0.971072\n", + "[8]\tvalid_0's ndcg@1: 0.925825\tvalid_0's ndcg@2: 0.969612\tvalid_0's ndcg@3: 0.971462\tvalid_0's ndcg@4: 0.97186\tvalid_0's ndcg@5: 0.971879\n", + "[9]\tvalid_0's ndcg@1: 0.926475\tvalid_0's ndcg@2: 0.969899\tvalid_0's ndcg@3: 0.971711\tvalid_0's ndcg@4: 0.97211\tvalid_0's ndcg@5: 0.972129\n", + "[10]\tvalid_0's ndcg@1: 0.927775\tvalid_0's ndcg@2: 0.97041\tvalid_0's ndcg@3: 0.972185\tvalid_0's ndcg@4: 0.972594\tvalid_0's ndcg@5: 0.972614\n", + "[11]\tvalid_0's ndcg@1: 0.92885\tvalid_0's ndcg@2: 0.970838\tvalid_0's ndcg@3: 0.972588\tvalid_0's ndcg@4: 0.973008\tvalid_0's ndcg@5: 0.973028\n", + "[12]\tvalid_0's ndcg@1: 0.930325\tvalid_0's ndcg@2: 0.971367\tvalid_0's ndcg@3: 0.973129\tvalid_0's 
ndcg@4: 0.973549\tvalid_0's ndcg@5: 0.973569\n", + "[13]\tvalid_0's ndcg@1: 0.931125\tvalid_0's ndcg@2: 0.971631\tvalid_0's ndcg@3: 0.973443\tvalid_0's ndcg@4: 0.973842\tvalid_0's ndcg@5: 0.973871\n", + "[14]\tvalid_0's ndcg@1: 0.931525\tvalid_0's ndcg@2: 0.971778\tvalid_0's ndcg@3: 0.973616\tvalid_0's ndcg@4: 0.973993\tvalid_0's ndcg@5: 0.974022\n", + "[15]\tvalid_0's ndcg@1: 0.9311\tvalid_0's ndcg@2: 0.9717\tvalid_0's ndcg@3: 0.973475\tvalid_0's ndcg@4: 0.973852\tvalid_0's ndcg@5: 0.973872\n", + "[16]\tvalid_0's ndcg@1: 0.931775\tvalid_0's ndcg@2: 0.971902\tvalid_0's ndcg@3: 0.973702\tvalid_0's ndcg@4: 0.97409\tvalid_0's ndcg@5: 0.974109\n", + "[17]\tvalid_0's ndcg@1: 0.931425\tvalid_0's ndcg@2: 0.971805\tvalid_0's ndcg@3: 0.97358\tvalid_0's ndcg@4: 0.973967\tvalid_0's ndcg@5: 0.973986\n", + "[18]\tvalid_0's ndcg@1: 0.931575\tvalid_0's ndcg@2: 0.971876\tvalid_0's ndcg@3: 0.973651\tvalid_0's ndcg@4: 0.974027\tvalid_0's ndcg@5: 0.974047\n", + "[19]\tvalid_0's ndcg@1: 0.932\tvalid_0's ndcg@2: 0.97208\tvalid_0's ndcg@3: 0.973805\tvalid_0's ndcg@4: 0.974192\tvalid_0's ndcg@5: 0.974212\n", + "[20]\tvalid_0's ndcg@1: 0.932075\tvalid_0's ndcg@2: 0.972092\tvalid_0's ndcg@3: 0.973829\tvalid_0's ndcg@4: 0.974217\tvalid_0's ndcg@5: 0.974236\n", + "[21]\tvalid_0's ndcg@1: 0.932675\tvalid_0's ndcg@2: 0.972282\tvalid_0's ndcg@3: 0.974057\tvalid_0's ndcg@4: 0.974444\tvalid_0's ndcg@5: 0.974454\n", + "[22]\tvalid_0's ndcg@1: 0.932925\tvalid_0's ndcg@2: 0.972358\tvalid_0's ndcg@3: 0.974146\tvalid_0's ndcg@4: 0.974533\tvalid_0's ndcg@5: 0.974543\n", + "[23]\tvalid_0's ndcg@1: 0.93325\tvalid_0's ndcg@2: 0.972478\tvalid_0's ndcg@3: 0.974253\tvalid_0's ndcg@4: 0.974651\tvalid_0's ndcg@5: 0.974661\n", + "[24]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972539\tvalid_0's ndcg@3: 0.974351\tvalid_0's ndcg@4: 0.974739\tvalid_0's ndcg@5: 0.974749\n", + "[25]\tvalid_0's ndcg@1: 0.93475\tvalid_0's ndcg@2: 0.973\tvalid_0's ndcg@3: 0.974788\tvalid_0's ndcg@4: 0.975197\tvalid_0's ndcg@5: 
0.975206\n", + "[26]\tvalid_0's ndcg@1: 0.935075\tvalid_0's ndcg@2: 0.97312\tvalid_0's ndcg@3: 0.974895\tvalid_0's ndcg@4: 0.975315\tvalid_0's ndcg@5: 0.975325\n", + "[27]\tvalid_0's ndcg@1: 0.9349\tvalid_0's ndcg@2: 0.973103\tvalid_0's ndcg@3: 0.974865\tvalid_0's ndcg@4: 0.975264\tvalid_0's ndcg@5: 0.975273\n", + "[28]\tvalid_0's ndcg@1: 0.935075\tvalid_0's ndcg@2: 0.973152\tvalid_0's ndcg@3: 0.974939\tvalid_0's ndcg@4: 0.975327\tvalid_0's ndcg@5: 0.975336\n", + "[29]\tvalid_0's ndcg@1: 0.935475\tvalid_0's ndcg@2: 0.973315\tvalid_0's ndcg@3: 0.975128\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975492\n", + "[30]\tvalid_0's ndcg@1: 0.93595\tvalid_0's ndcg@2: 0.973522\tvalid_0's ndcg@3: 0.975297\tvalid_0's ndcg@4: 0.975663\tvalid_0's ndcg@5: 0.975673\n", + "[31]\tvalid_0's ndcg@1: 0.93595\tvalid_0's ndcg@2: 0.973506\tvalid_0's ndcg@3: 0.975281\tvalid_0's ndcg@4: 0.975658\tvalid_0's ndcg@5: 0.975668\n", + "[32]\tvalid_0's ndcg@1: 0.93675\tvalid_0's ndcg@2: 0.973833\tvalid_0's ndcg@3: 0.975595\tvalid_0's ndcg@4: 0.975961\tvalid_0's ndcg@5: 0.975971\n", + "[33]\tvalid_0's ndcg@1: 0.936475\tvalid_0's ndcg@2: 0.973763\tvalid_0's ndcg@3: 0.975488\tvalid_0's ndcg@4: 0.975865\tvalid_0's ndcg@5: 0.975874\n", + "[34]\tvalid_0's ndcg@1: 0.9367\tvalid_0's ndcg@2: 0.973893\tvalid_0's ndcg@3: 0.975568\tvalid_0's ndcg@4: 0.975956\tvalid_0's ndcg@5: 0.975966\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[35]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.974059\tvalid_0's ndcg@3: 0.975722\tvalid_0's ndcg@4: 0.97612\tvalid_0's ndcg@5: 0.97613\n", + "[36]\tvalid_0's ndcg@1: 0.9374\tvalid_0's ndcg@2: 0.974183\tvalid_0's ndcg@3: 0.975846\tvalid_0's ndcg@4: 0.976223\tvalid_0's ndcg@5: 0.976232\n", + "[37]\tvalid_0's ndcg@1: 0.9374\tvalid_0's ndcg@2: 0.974183\tvalid_0's ndcg@3: 0.975846\tvalid_0's ndcg@4: 0.976223\tvalid_0's ndcg@5: 0.976232\n", + "[38]\tvalid_0's ndcg@1: 0.938725\tvalid_0's ndcg@2: 0.974672\tvalid_0's ndcg@3: 0.97636\tvalid_0's 
ndcg@4: 0.976715\tvalid_0's ndcg@5: 0.976725\n", + "[39]\tvalid_0's ndcg@1: 0.93865\tvalid_0's ndcg@2: 0.974676\tvalid_0's ndcg@3: 0.976364\tvalid_0's ndcg@4: 0.976697\tvalid_0's ndcg@5: 0.976707\n", + "[40]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.974867\tvalid_0's ndcg@3: 0.97653\tvalid_0's ndcg@4: 0.976874\tvalid_0's ndcg@5: 0.976884\n", + "[41]\tvalid_0's ndcg@1: 0.9396\tvalid_0's ndcg@2: 0.975042\tvalid_0's ndcg@3: 0.976705\tvalid_0's ndcg@4: 0.97705\tvalid_0's ndcg@5: 0.977059\n", + "[42]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.975072\tvalid_0's ndcg@3: 0.976784\tvalid_0's ndcg@4: 0.977129\tvalid_0's ndcg@5: 0.977138\n", + "[43]\tvalid_0's ndcg@1: 0.940075\tvalid_0's ndcg@2: 0.97517\tvalid_0's ndcg@3: 0.97687\tvalid_0's ndcg@4: 0.977215\tvalid_0's ndcg@5: 0.977225\n", + "[44]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.97534\tvalid_0's ndcg@3: 0.977015\tvalid_0's ndcg@4: 0.97736\tvalid_0's ndcg@5: 0.97737\n", + "[45]\tvalid_0's ndcg@1: 0.94055\tvalid_0's ndcg@2: 0.975409\tvalid_0's ndcg@3: 0.977059\tvalid_0's ndcg@4: 0.977403\tvalid_0's ndcg@5: 0.977413\n", + "[46]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975415\tvalid_0's ndcg@3: 0.97704\tvalid_0's ndcg@4: 0.977396\tvalid_0's ndcg@5: 0.977405\n", + "[47]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975363\tvalid_0's ndcg@3: 0.977013\tvalid_0's ndcg@4: 0.977357\tvalid_0's ndcg@5: 0.977367\n", + "[48]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975388\tvalid_0's ndcg@3: 0.977025\tvalid_0's ndcg@4: 0.97737\tvalid_0's ndcg@5: 0.977379\n", + "[49]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975447\tvalid_0's ndcg@3: 0.977097\tvalid_0's ndcg@4: 0.977409\tvalid_0's ndcg@5: 0.977419\n", + "[50]\tvalid_0's ndcg@1: 0.941075\tvalid_0's ndcg@2: 0.975666\tvalid_0's ndcg@3: 0.977303\tvalid_0's ndcg@4: 0.977615\tvalid_0's ndcg@5: 0.977625\n", + "[51]\tvalid_0's ndcg@1: 0.94135\tvalid_0's ndcg@2: 0.975751\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.97771\tvalid_0's ndcg@5: 
0.97772\n", + "[52]\tvalid_0's ndcg@1: 0.9413\tvalid_0's ndcg@2: 0.975717\tvalid_0's ndcg@3: 0.977355\tvalid_0's ndcg@4: 0.977688\tvalid_0's ndcg@5: 0.977698\n", + "[53]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.975713\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.977699\tvalid_0's ndcg@5: 0.977718\n", + "[54]\tvalid_0's ndcg@1: 0.94185\tvalid_0's ndcg@2: 0.975857\tvalid_0's ndcg@3: 0.977557\tvalid_0's ndcg@4: 0.977869\tvalid_0's ndcg@5: 0.977889\n", + "[55]\tvalid_0's ndcg@1: 0.941925\tvalid_0's ndcg@2: 0.975837\tvalid_0's ndcg@3: 0.9776\tvalid_0's ndcg@4: 0.977891\tvalid_0's ndcg@5: 0.97791\n", + "[56]\tvalid_0's ndcg@1: 0.942325\tvalid_0's ndcg@2: 0.975969\tvalid_0's ndcg@3: 0.977719\tvalid_0's ndcg@4: 0.978032\tvalid_0's ndcg@5: 0.978051\n", + "[57]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976022\tvalid_0's ndcg@3: 0.977772\tvalid_0's ndcg@4: 0.978073\tvalid_0's ndcg@5: 0.978093\n", + "[58]\tvalid_0's ndcg@1: 0.9425\tvalid_0's ndcg@2: 0.976081\tvalid_0's ndcg@3: 0.977806\tvalid_0's ndcg@4: 0.978108\tvalid_0's ndcg@5: 0.978127\n", + "[59]\tvalid_0's ndcg@1: 0.9424\tvalid_0's ndcg@2: 0.976076\tvalid_0's ndcg@3: 0.977788\tvalid_0's ndcg@4: 0.978079\tvalid_0's ndcg@5: 0.978098\n", + "[60]\tvalid_0's ndcg@1: 0.942375\tvalid_0's ndcg@2: 0.976067\tvalid_0's ndcg@3: 0.977779\tvalid_0's ndcg@4: 0.97807\tvalid_0's ndcg@5: 0.978089\n", + "[61]\tvalid_0's ndcg@1: 0.942225\tvalid_0's ndcg@2: 0.976043\tvalid_0's ndcg@3: 0.97773\tvalid_0's ndcg@4: 0.978021\tvalid_0's ndcg@5: 0.97804\n", + "[62]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976117\tvalid_0's ndcg@3: 0.977792\tvalid_0's ndcg@4: 0.978093\tvalid_0's ndcg@5: 0.978112\n", + "[63]\tvalid_0's ndcg@1: 0.942675\tvalid_0's ndcg@2: 0.976193\tvalid_0's ndcg@3: 0.977881\tvalid_0's ndcg@4: 0.978182\tvalid_0's ndcg@5: 0.978201\n", + "[64]\tvalid_0's ndcg@1: 0.942925\tvalid_0's ndcg@2: 0.976254\tvalid_0's ndcg@3: 0.977966\tvalid_0's ndcg@4: 0.978268\tvalid_0's ndcg@5: 0.978287\n", + "[65]\tvalid_0's 
ndcg@1: 0.9431\tvalid_0's ndcg@2: 0.97635\tvalid_0's ndcg@3: 0.978025\tvalid_0's ndcg@4: 0.978337\tvalid_0's ndcg@5: 0.978357\n", + "[66]\tvalid_0's ndcg@1: 0.9434\tvalid_0's ndcg@2: 0.976445\tvalid_0's ndcg@3: 0.978132\tvalid_0's ndcg@4: 0.978445\tvalid_0's ndcg@5: 0.978464\n", + "[67]\tvalid_0's ndcg@1: 0.943275\tvalid_0's ndcg@2: 0.976399\tvalid_0's ndcg@3: 0.978074\tvalid_0's ndcg@4: 0.978397\tvalid_0's ndcg@5: 0.978416\n", + "[68]\tvalid_0's ndcg@1: 0.943325\tvalid_0's ndcg@2: 0.976401\tvalid_0's ndcg@3: 0.978089\tvalid_0's ndcg@4: 0.978412\tvalid_0's ndcg@5: 0.978431\n", + "[69]\tvalid_0's ndcg@1: 0.943675\tvalid_0's ndcg@2: 0.976578\tvalid_0's ndcg@3: 0.97819\tvalid_0's ndcg@4: 0.978546\tvalid_0's ndcg@5: 0.978565\n", + "[70]\tvalid_0's ndcg@1: 0.944025\tvalid_0's ndcg@2: 0.976707\tvalid_0's ndcg@3: 0.97832\tvalid_0's ndcg@4: 0.978675\tvalid_0's ndcg@5: 0.978694\n", + "[71]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.976772\tvalid_0's ndcg@3: 0.978384\tvalid_0's ndcg@4: 0.97874\tvalid_0's ndcg@5: 0.978759\n", + "[72]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.976822\tvalid_0's ndcg@3: 0.978409\tvalid_0's ndcg@4: 0.978765\tvalid_0's ndcg@5: 0.978784\n", + "[73]\tvalid_0's ndcg@1: 0.94445\tvalid_0's ndcg@2: 0.976864\tvalid_0's ndcg@3: 0.978464\tvalid_0's ndcg@4: 0.97883\tvalid_0's ndcg@5: 0.978849\n", + "[74]\tvalid_0's ndcg@1: 0.9446\tvalid_0's ndcg@2: 0.976919\tvalid_0's ndcg@3: 0.978519\tvalid_0's ndcg@4: 0.978885\tvalid_0's ndcg@5: 0.978905\n", + "[75]\tvalid_0's ndcg@1: 0.9446\tvalid_0's ndcg@2: 0.976919\tvalid_0's ndcg@3: 0.978519\tvalid_0's ndcg@4: 0.978885\tvalid_0's ndcg@5: 0.978905\n", + "[76]\tvalid_0's ndcg@1: 0.944625\tvalid_0's ndcg@2: 0.97696\tvalid_0's ndcg@3: 0.978535\tvalid_0's ndcg@4: 0.978901\tvalid_0's ndcg@5: 0.978921\n", + "[77]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.976979\tvalid_0's ndcg@3: 0.978554\tvalid_0's ndcg@4: 0.97892\tvalid_0's ndcg@5: 0.978939\n", + "[78]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 
0.976979\tvalid_0's ndcg@3: 0.978554\tvalid_0's ndcg@4: 0.97892\tvalid_0's ndcg@5: 0.978939\n", + "[79]\tvalid_0's ndcg@1: 0.944525\tvalid_0's ndcg@2: 0.976907\tvalid_0's ndcg@3: 0.978507\tvalid_0's ndcg@4: 0.978863\tvalid_0's ndcg@5: 0.978882\n", + "[80]\tvalid_0's ndcg@1: 0.94455\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.97851\tvalid_0's ndcg@4: 0.978865\tvalid_0's ndcg@5: 0.978885\n", + "[81]\tvalid_0's ndcg@1: 0.944725\tvalid_0's ndcg@2: 0.97695\tvalid_0's ndcg@3: 0.978575\tvalid_0's ndcg@4: 0.978919\tvalid_0's ndcg@5: 0.978948\n", + "[82]\tvalid_0's ndcg@1: 0.945225\tvalid_0's ndcg@2: 0.977103\tvalid_0's ndcg@3: 0.978765\tvalid_0's ndcg@4: 0.97911\tvalid_0's ndcg@5: 0.979129\n", + "[83]\tvalid_0's ndcg@1: 0.945125\tvalid_0's ndcg@2: 0.977066\tvalid_0's ndcg@3: 0.978716\tvalid_0's ndcg@4: 0.979071\tvalid_0's ndcg@5: 0.97909\n", + "[84]\tvalid_0's ndcg@1: 0.945225\tvalid_0's ndcg@2: 0.97715\tvalid_0's ndcg@3: 0.978775\tvalid_0's ndcg@4: 0.97912\tvalid_0's ndcg@5: 0.979139\n", + "[85]\tvalid_0's ndcg@1: 0.945025\tvalid_0's ndcg@2: 0.977092\tvalid_0's ndcg@3: 0.978692\tvalid_0's ndcg@4: 0.979047\tvalid_0's ndcg@5: 0.979067\n", + "[86]\tvalid_0's ndcg@1: 0.9452\tvalid_0's ndcg@2: 0.977172\tvalid_0's ndcg@3: 0.97876\tvalid_0's ndcg@4: 0.979115\tvalid_0's ndcg@5: 0.979135\n", + "[87]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977178\tvalid_0's ndcg@3: 0.97879\tvalid_0's ndcg@4: 0.979156\tvalid_0's ndcg@5: 0.979166\n", + "[88]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977178\tvalid_0's ndcg@3: 0.978815\tvalid_0's ndcg@4: 0.979149\tvalid_0's ndcg@5: 0.979168\n", + "[89]\tvalid_0's ndcg@1: 0.94555\tvalid_0's ndcg@2: 0.977333\tvalid_0's ndcg@3: 0.978933\tvalid_0's ndcg@4: 0.979267\tvalid_0's ndcg@5: 0.979277\n", + "[90]\tvalid_0's ndcg@1: 0.9459\tvalid_0's ndcg@2: 0.977462\tvalid_0's ndcg@3: 0.979062\tvalid_0's ndcg@4: 0.979396\tvalid_0's ndcg@5: 0.979406\n", + "[91]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977481\tvalid_0's ndcg@3: 
0.979081\tvalid_0's ndcg@4: 0.979414\tvalid_0's ndcg@5: 0.979424\n", + "[92]\tvalid_0's ndcg@1: 0.945875\tvalid_0's ndcg@2: 0.977437\tvalid_0's ndcg@3: 0.97905\tvalid_0's ndcg@4: 0.979384\tvalid_0's ndcg@5: 0.979393\n", + "[93]\tvalid_0's ndcg@1: 0.945875\tvalid_0's ndcg@2: 0.977421\tvalid_0's ndcg@3: 0.979046\tvalid_0's ndcg@4: 0.97938\tvalid_0's ndcg@5: 0.97939\n", + "[94]\tvalid_0's ndcg@1: 0.9459\tvalid_0's ndcg@2: 0.977431\tvalid_0's ndcg@3: 0.979068\tvalid_0's ndcg@4: 0.979391\tvalid_0's ndcg@5: 0.979401\n", + "[95]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977449\tvalid_0's ndcg@3: 0.979074\tvalid_0's ndcg@4: 0.979408\tvalid_0's ndcg@5: 0.979418\n", + "[96]\tvalid_0's ndcg@1: 0.946075\tvalid_0's ndcg@2: 0.977527\tvalid_0's ndcg@3: 0.979127\tvalid_0's ndcg@4: 0.979461\tvalid_0's ndcg@5: 0.97947\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[97]\tvalid_0's ndcg@1: 0.946375\tvalid_0's ndcg@2: 0.977622\tvalid_0's ndcg@3: 0.979222\tvalid_0's ndcg@4: 0.979577\tvalid_0's ndcg@5: 0.979577\n", + "[98]\tvalid_0's ndcg@1: 0.946625\tvalid_0's ndcg@2: 0.977714\tvalid_0's ndcg@3: 0.979339\tvalid_0's ndcg@4: 0.979673\tvalid_0's ndcg@5: 0.979673\n", + "[99]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.977739\tvalid_0's ndcg@3: 0.979352\tvalid_0's ndcg@4: 0.979685\tvalid_0's ndcg@5: 0.979685\n", + "[100]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.97778\tvalid_0's ndcg@3: 0.97938\tvalid_0's ndcg@4: 0.979703\tvalid_0's ndcg@5: 0.979703\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.97778\tvalid_0's ndcg@3: 0.97938\tvalid_0's ndcg@4: 0.979703\tvalid_0's ndcg@5: 0.979703\n" + ] + } + ], + "source": [ + "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", + "# 这一部分与前面的单独训练和验证是分开的\n", + "def get_kfold_users(trn_df, n=5):\n", + " user_ids = trn_df['user_id'].unique()\n", + " user_set = [user_ids[i::n] for i in range(n)]\n", + " return user_set\n", + "\n", + "k_fold = 5\n", + "trn_df = trn_user_item_feats_df_rank_model\n", + "user_set = get_kfold_users(trn_df, n=k_fold)\n", + "\n", + "score_list = []\n", + "score_df = trn_df[['user_id', 'click_article_id','label']]\n", + "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", + "\n", + "# 五折交叉验证,并将中间结果保存用于staking\n", + "for n_fold, valid_user in enumerate(user_set):\n", + " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", + " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", + " \n", + " # 训练集与验证集的用户分组\n", + " train_idx.sort_values(by=['user_id'], inplace=True)\n", + " g_train = train_idx.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", + " \n", + " valid_idx.sort_values(by=['user_id'], inplace=True)\n", + " g_val = valid_idx.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", + " \n", + " # 定义模型\n", + " lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", + " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", + " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16) \n", + " # 训练模型\n", + " lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,\n", + " eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], \n", + " eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )\n", + " \n", + " # 预测验证集结果\n", + " valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], 
num_iteration=lgb_ranker.best_iteration_)\n", + " \n", + " # 对输出结果进行归一化\n", + " valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))\n", + " \n", + " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", + " valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + " \n", + " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", + " score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", + " \n", + " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", + " if not offline:\n", + " sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], lgb_ranker.best_iteration_)\n", + " \n", + "score_df_ = pd.concat(score_list, axis=0)\n", + "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n", + "# 保存训练集交叉验证产生的新特征\n", + "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)\n", + " \n", + "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", + "tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold\n", + "tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))\n", + "tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])\n", + "tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + "\n", + "# 保存测试集交叉验证的新特征\n", + "tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1/2\n", - "290964/290964 [==============================] - 55s 189us/sample - loss: 0.4209 - binary_crossentropy: 0.4206 - auc: 0.7842\n", - "Epoch 2/2\n", - "290964/290964 
[==============================] - 52s 178us/sample - loss: 0.3630 - binary_crossentropy: 0.3618 - auc: 0.8478\n" - ] - } - ], - "source": [ - "# 模型训练\n", - "if offline:\n", - " history = model.fit(x_trn, y_trn, verbose=1, epochs=10, validation_data=(x_val, y_val) , batch_size=256)\n", - "else:\n", - " # 也可以使用上面的语句用自己采样出来的验证集\n", - " # history = model.fit(x_trn, y_trn, verbose=1, epochs=3, validation_split=0.3, batch_size=256)\n", - " history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:29:20.436591Z", - "start_time": "2020-11-18T04:28:58.102057Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:22:52.604397Z", + "start_time": "2020-11-18T04:22:43.253034Z" + } + }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "# 单模型生成提交结果\n", + "rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]\n", + "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", + "submit(rank_results, topk=5, model_name='lgb_ranker')" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "500000/500000 [==============================] - 20s 39us/sample\n" - ] - } - ], - "source": [ - "# 模型预测\n", - "tst_user_item_feats_df_din_model['pred_score'] = model.predict(x_tst, verbose=1, batch_size=256)\n", - "tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'din_rank_score.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:29:34.985535Z", - "start_time": "2020-11-18T04:29:26.264531Z" - } - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "rank_results = tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 
'pred_score']]\n", - "submit(rank_results, topk=5, model_name='din')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-15T06:15:49.490705Z", - "start_time": "2020-11-15T06:15:49.473794Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:38:53.760383Z", - "start_time": "2020-11-18T04:29:51.737721Z" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LGB分类模型" + ] }, - "scrolled": true - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train on 232681 samples, validate on 58283 samples\n", - "Epoch 1/2\n", - "232681/232681 [==============================] - 44s 189us/sample - loss: 0.2864 - binary_crossentropy: 0.2846 - auc: 0.9008 - val_loss: 0.2830 - val_binary_crossentropy: 0.2813 - val_auc: 0.9072\n", - "Epoch 2/2\n", - "232681/232681 [==============================] - 44s 187us/sample - loss: 0.2832 - binary_crossentropy: 0.2816 - auc: 0.9034 - val_loss: 0.2846 - val_binary_crossentropy: 0.2830 - val_auc: 0.9053\n", - "58283/58283 [==============================] - 2s 36us/sample\n", - "500000/500000 [==============================] - 19s 37us/sample\n", - "Train on 232798 samples, validate on 58166 samples\n", - "Epoch 1/2\n", - "232798/232798 [==============================] - 43s 184us/sample - loss: 0.2818 - binary_crossentropy: 0.2802 - auc: 0.9051 - val_loss: 0.2968 - val_binary_crossentropy: 0.2953 - val_auc: 0.9062\n", - "Epoch 2/2\n", - "232798/232798 [==============================] - 44s 187us/sample - loss: 0.2796 - binary_crossentropy: 0.2782 - auc: 0.9069 - val_loss: 0.2820 - val_binary_crossentropy: 0.2806 - val_auc: 0.9071\n", - "58166/58166 [==============================] - 2s 38us/sample\n", - "500000/500000 [==============================] - 18s 37us/sample\n", - "Train on 232847 samples, validate on 58117 samples\n", 
- "Epoch 1/2\n", - "232847/232847 [==============================] - 43s 185us/sample - loss: 0.2786 - binary_crossentropy: 0.2773 - auc: 0.9080 - val_loss: 0.2761 - val_binary_crossentropy: 0.2749 - val_auc: 0.9113\n", - "Epoch 2/2\n", - "232847/232847 [==============================] - 39s 166us/sample - loss: 0.2766 - binary_crossentropy: 0.2754 - auc: 0.9097 - val_loss: 0.2872 - val_binary_crossentropy: 0.2862 - val_auc: 0.9090\n", - "58117/58117 [==============================] - 2s 34us/sample\n", - "500000/500000 [==============================] - 17s 33us/sample\n", - "Train on 232716 samples, validate on 58248 samples\n", - "Epoch 1/2\n", - "232716/232716 [==============================] - 39s 169us/sample - loss: 0.2763 - binary_crossentropy: 0.2753 - auc: 0.9100 - val_loss: 0.2739 - val_binary_crossentropy: 0.2730 - val_auc: 0.9116\n", - "Epoch 2/2\n", - "232716/232716 [==============================] - 39s 168us/sample - loss: 0.2743 - binary_crossentropy: 0.2735 - auc: 0.9119 - val_loss: 0.2859 - val_binary_crossentropy: 0.2851 - val_auc: 0.9090\n", - "58248/58248 [==============================] - 2s 35us/sample\n", - "500000/500000 [==============================] - 17s 34us/sample\n", - "Train on 232814 samples, validate on 58150 samples\n", - "Epoch 1/2\n", - "232814/232814 [==============================] - 40s 170us/sample - loss: 0.2747 - binary_crossentropy: 0.2739 - auc: 0.9115 - val_loss: 0.2702 - val_binary_crossentropy: 0.2695 - val_auc: 0.9163\n", - "Epoch 2/2\n", - "232814/232814 [==============================] - 40s 170us/sample - loss: 0.2725 - binary_crossentropy: 0.2719 - auc: 0.9132 - val_loss: 0.2751 - val_binary_crossentropy: 0.2745 - val_auc: 0.9151\n", - "58150/58150 [==============================] - 2s 34us/sample\n", - "500000/500000 [==============================] - 17s 34us/sample\n" - ] - } - ], - "source": [ - "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", - "# 这一部分与前面的单独训练和验证是分开的\n", - "def get_kfold_users(trn_df, n=5):\n", - " 
user_ids = trn_df['user_id'].unique()\n", - " user_set = [user_ids[i::n] for i in range(n)]\n", - " return user_set\n", - "\n", - "k_fold = 5\n", - "trn_df = trn_user_item_feats_df_din_model\n", - "user_set = get_kfold_users(trn_df, n=k_fold)\n", - "\n", - "score_list = []\n", - "score_df = trn_df[['user_id', 'click_article_id', 'label']]\n", - "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", - "\n", - "dense_fea = [x for x in dense_fea if x != 'label']\n", - "x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", - "\n", - "# 五折交叉验证,并将中间结果保存用于staking\n", - "for n_fold, valid_user in enumerate(user_set):\n", - " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", - " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", - " \n", - " # 准备训练数据\n", - " x_trn, dnn_feature_columns = get_din_feats_columns(train_idx, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", - " y_trn = train_idx['label'].values\n", - "\n", - " # 准备验证数据\n", - " x_val, dnn_feature_columns = get_din_feats_columns(valid_idx, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", - " y_val = valid_idx['label'].values\n", - " \n", - " history = model.fit(x_trn, y_trn, verbose=1, epochs=2, validation_data=(x_val, y_val) , batch_size=256)\n", - " \n", - " # 预测验证集结果\n", - " valid_idx['pred_score'] = model.predict(x_val, verbose=1, batch_size=256) \n", - " \n", - " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", - " valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - " \n", - " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", - " score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", - " \n", - " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", - " if not offline:\n", - " sub_preds += model.predict(x_tst, verbose=1, 
batch_size=256)[:, 0] \n", - " \n", - "score_df_ = pd.concat(score_list, axis=0)\n", - "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n", - "# 保存训练集交叉验证产生的新特征\n", - "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_din_cls_feats.csv', index=False)\n", - " \n", - "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", - "tst_user_item_feats_df_din_model['pred_score'] = sub_preds / k_fold\n", - "tst_user_item_feats_df_din_model['pred_score'] = tst_user_item_feats_df_din_model['pred_score'].transform(lambda x: norm_sim(x))\n", - "tst_user_item_feats_df_din_model.sort_values(by=['user_id', 'pred_score'])\n", - "tst_user_item_feats_df_din_model['pred_rank'] = tst_user_item_feats_df_din_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - "\n", - "# 保存测试集交叉验证的新特征\n", - "tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_din_cls_feats.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 模型融合" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 加权融合" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:44:27.351996Z", - "start_time": "2020-11-18T04:44:26.561275Z" - } - }, - "outputs": [], - "source": [ - "# 读取多个模型的排序结果文件\n", - "lgb_ranker = pd.read_csv(save_path + 'lgb_ranker_score.csv')\n", - "lgb_cls = pd.read_csv(save_path + 'lgb_cls_score.csv')\n", - "din_ranker = pd.read_csv(save_path + 'din_rank_score.csv')\n", - "\n", - "# 这里也可以换成交叉验证输出的测试结果进行加权融合" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:44:31.593981Z", - "start_time": 
"2020-11-18T04:44:31.589439Z" - } - }, - "outputs": [], - "source": [ - "rank_model = {'lgb_ranker': lgb_ranker, \n", - " 'lgb_cls': lgb_cls, \n", - " 'din_ranker': din_ranker}" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:44:36.135860Z", - "start_time": "2020-11-18T04:44:36.130577Z" - } - }, - "outputs": [], - "source": [ - "def get_ensumble_predict_topk(rank_model, topk=5):\n", - " final_recall = rank_model['lgb_cls'].append(rank_model['din_ranker'])\n", - " rank_model['lgb_ranker']['pred_score'] = rank_model['lgb_ranker']['pred_score'].transform(lambda x: norm_sim(x))\n", - " \n", - " final_recall = final_recall.append(rank_model['lgb_ranker'])\n", - " final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()\n", - " \n", - " submit(final_recall, topk=topk, model_name='ensemble_fuse')" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:44:51.659270Z", - "start_time": "2020-11-18T04:44:40.445659Z" - } - }, - "outputs": [], - "source": [ - "get_ensumble_predict_topk(rank_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Staking" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:44:58.025992Z", - "start_time": "2020-11-18T04:44:56.146962Z" - } - }, - "outputs": [], - "source": [ - "# 读取多个模型的交叉验证生成的结果文件\n", - "# 训练集\n", - "trn_lgb_ranker_feats = pd.read_csv(save_path + 'trn_lgb_ranker_feats.csv')\n", - "trn_lgb_cls_feats = pd.read_csv(save_path + 'trn_lgb_cls_feats.csv')\n", - "trn_din_cls_feats = pd.read_csv(save_path + 'trn_din_cls_feats.csv')\n", - "\n", - "# 测试集\n", - "tst_lgb_ranker_feats = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')\n", - "tst_lgb_cls_feats = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')\n", - "tst_din_cls_feats = 
pd.read_csv(save_path + 'tst_din_cls_feats.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:45:07.701862Z", - "start_time": "2020-11-18T04:45:07.644335Z" - } - }, - "outputs": [], - "source": [ - "# 将多个模型输出的特征进行拼接\n", - "\n", - "finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']]\n", - "finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']]\n", - "\n", - "for idx, trn_model in enumerate([trn_lgb_ranker_feats, trn_lgb_cls_feats, trn_din_cls_feats]):\n", - " for feat in [ 'pred_score', 'pred_rank']:\n", - " col_name = feat + '_' + str(idx)\n", - " finall_trn_ranker_feats[col_name] = trn_model[feat]\n", - "\n", - "for idx, tst_model in enumerate([tst_lgb_ranker_feats, tst_lgb_cls_feats, tst_din_cls_feats]):\n", - " for feat in [ 'pred_score', 'pred_rank']:\n", - " col_name = feat + '_' + str(idx)\n", - " finall_tst_ranker_feats[col_name] = tst_model[feat]" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:45:15.044242Z", - "start_time": "2020-11-18T04:45:13.138252Z" - } - }, - "outputs": [], - "source": [ - "# 定义一个逻辑回归模型再次拟合交叉验证产生的特征对测试集进行预测\n", - "# 这里需要注意的是,在做交叉验证的时候可以构造多一些与输出预测值相关的特征,来丰富这里简单模型的特征\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1', 'pred_score_2', 'pred_rank_2']\n", - "\n", - "trn_x = finall_trn_ranker_feats[feat_cols]\n", - "trn_y = finall_trn_ranker_feats['label']\n", - "\n", - "tst_x = finall_tst_ranker_feats[feat_cols]\n", - "\n", - "# 定义模型\n", - "lr = LogisticRegression()\n", - "\n", - "# 模型训练\n", - "lr.fit(trn_x, trn_y)\n", - "\n", - "# 模型预测\n", - "finall_tst_ranker_feats['pred_score'] = lr.predict_proba(tst_x)[:, 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-11-18T04:45:29.018764Z", - "start_time": "2020-11-18T04:45:19.423130Z" + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:22:58.259730Z", + "start_time": "2020-11-18T04:22:58.254297Z" + } + }, + "outputs": [], + "source": [ + "# 模型及参数的定义\n", + "lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", + " max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", + " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10) " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:23:11.258774Z", + "start_time": "2020-11-18T04:23:00.861936Z" + } + }, + "outputs": [], + "source": [ + "# 模型训练\n", + "if offline:\n", + " lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],\n", + " eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], \n", + " eval_metric=['auc', ],early_stopping_rounds=50, )\n", + "else:\n", + " lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:23:19.591396Z", + "start_time": "2020-11-18T04:23:13.813850Z" + } + }, + "outputs": [], + "source": [ + "# 模型预测\n", + "tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]\n", + "\n", + "# 将这里的排序结果保存一份,用户后面的模型融合\n", + "tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:23:32.352931Z", + "start_time": "2020-11-18T04:23:22.346609Z" + } 
+ }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]\n", + "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", + "submit(rank_results, topk=5, model_name='lgb_cls')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:24:11.241196Z", + "start_time": "2020-11-18T04:23:41.377394Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1]\tvalid_0's auc: 0.764896\tvalid_0's binary_logloss: 0.522153\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's auc: 0.767857\tvalid_0's binary_logloss: 0.52057\n", + "[3]\tvalid_0's auc: 0.783096\tvalid_0's binary_logloss: 0.519584\n", + "[4]\tvalid_0's auc: 0.784354\tvalid_0's binary_logloss: 0.518485\n", + "[5]\tvalid_0's auc: 0.790554\tvalid_0's binary_logloss: 0.516886\n", + "[6]\tvalid_0's auc: 0.791954\tvalid_0's binary_logloss: 0.515334\n", + "[7]\tvalid_0's auc: 0.794257\tvalid_0's binary_logloss: 0.514032\n", + "[8]\tvalid_0's auc: 0.795222\tvalid_0's binary_logloss: 0.512516\n", + "[9]\tvalid_0's auc: 0.795417\tvalid_0's binary_logloss: 0.511671\n", + "[10]\tvalid_0's auc: 0.795913\tvalid_0's binary_logloss: 0.510226\n", + "[11]\tvalid_0's auc: 0.798222\tvalid_0's binary_logloss: 0.508858\n", + "[12]\tvalid_0's auc: 0.79825\tvalid_0's binary_logloss: 0.507928\n", + "[13]\tvalid_0's auc: 0.798842\tvalid_0's binary_logloss: 0.50708\n", + "[14]\tvalid_0's auc: 0.798935\tvalid_0's binary_logloss: 0.505752\n", + "[15]\tvalid_0's auc: 0.799543\tvalid_0's binary_logloss: 0.504388\n", + "[16]\tvalid_0's auc: 0.800844\tvalid_0's binary_logloss: 0.503126\n", + "[17]\tvalid_0's auc: 0.800855\tvalid_0's binary_logloss: 0.501809\n", + 
"[18]\tvalid_0's auc: 0.801653\tvalid_0's binary_logloss: 0.500676\n", + "[19]\tvalid_0's auc: 0.801518\tvalid_0's binary_logloss: 0.49987\n", + "[20]\tvalid_0's auc: 0.801662\tvalid_0's binary_logloss: 0.498625\n", + "[21]\tvalid_0's auc: 0.802093\tvalid_0's binary_logloss: 0.498113\n", + "[22]\tvalid_0's auc: 0.803071\tvalid_0's binary_logloss: 0.496933\n", + "[23]\tvalid_0's auc: 0.803222\tvalid_0's binary_logloss: 0.495864\n", + "[24]\tvalid_0's auc: 0.802927\tvalid_0's binary_logloss: 0.494691\n", + "[25]\tvalid_0's auc: 0.802581\tvalid_0's binary_logloss: 0.493543\n", + "[26]\tvalid_0's auc: 0.802965\tvalid_0's binary_logloss: 0.492444\n", + "[27]\tvalid_0's auc: 0.80298\tvalid_0's binary_logloss: 0.491336\n", + "[28]\tvalid_0's auc: 0.803226\tvalid_0's binary_logloss: 0.490275\n", + "[29]\tvalid_0's auc: 0.803436\tvalid_0's binary_logloss: 0.489126\n", + "[30]\tvalid_0's auc: 0.803796\tvalid_0's binary_logloss: 0.48802\n", + "[31]\tvalid_0's auc: 0.803601\tvalid_0's binary_logloss: 0.486988\n", + "[32]\tvalid_0's auc: 0.804416\tvalid_0's binary_logloss: 0.485972\n", + "[33]\tvalid_0's auc: 0.804529\tvalid_0's binary_logloss: 0.484939\n", + "[34]\tvalid_0's auc: 0.804534\tvalid_0's binary_logloss: 0.483927\n", + "[35]\tvalid_0's auc: 0.804819\tvalid_0's binary_logloss: 0.483271\n", + "[36]\tvalid_0's auc: 0.804774\tvalid_0's binary_logloss: 0.482273\n", + "[37]\tvalid_0's auc: 0.805237\tvalid_0's binary_logloss: 0.481639\n", + "[38]\tvalid_0's auc: 0.805546\tvalid_0's binary_logloss: 0.480959\n", + "[39]\tvalid_0's auc: 0.805598\tvalid_0's binary_logloss: 0.479955\n", + "[40]\tvalid_0's auc: 0.806011\tvalid_0's binary_logloss: 0.47903\n", + "[41]\tvalid_0's auc: 0.806664\tvalid_0's binary_logloss: 0.478439\n", + "[42]\tvalid_0's auc: 0.807021\tvalid_0's binary_logloss: 0.477798\n", + "[43]\tvalid_0's auc: 0.80726\tvalid_0's binary_logloss: 0.476829\n", + "[44]\tvalid_0's auc: 0.807157\tvalid_0's binary_logloss: 0.475976\n", + "[45]\tvalid_0's auc: 
0.807788\tvalid_0's binary_logloss: 0.475056\n", + "[46]\tvalid_0's auc: 0.80805\tvalid_0's binary_logloss: 0.474446\n", + "[47]\tvalid_0's auc: 0.808097\tvalid_0's binary_logloss: 0.473576\n", + "[48]\tvalid_0's auc: 0.80815\tvalid_0's binary_logloss: 0.472676\n", + "[49]\tvalid_0's auc: 0.808304\tvalid_0's binary_logloss: 0.471918\n", + "[50]\tvalid_0's auc: 0.808749\tvalid_0's binary_logloss: 0.471481\n", + "[51]\tvalid_0's auc: 0.808972\tvalid_0's binary_logloss: 0.471104\n", + "[52]\tvalid_0's auc: 0.809326\tvalid_0's binary_logloss: 0.470289\n", + "[53]\tvalid_0's auc: 0.809472\tvalid_0's binary_logloss: 0.469508\n", + "[54]\tvalid_0's auc: 0.809505\tvalid_0's binary_logloss: 0.46869\n", + "[55]\tvalid_0's auc: 0.809594\tvalid_0's binary_logloss: 0.467885\n", + "[56]\tvalid_0's auc: 0.809847\tvalid_0's binary_logloss: 0.467356\n", + "[57]\tvalid_0's auc: 0.810262\tvalid_0's binary_logloss: 0.466531\n", + "[58]\tvalid_0's auc: 0.810407\tvalid_0's binary_logloss: 0.46573\n", + "[59]\tvalid_0's auc: 0.810618\tvalid_0's binary_logloss: 0.465205\n", + "[60]\tvalid_0's auc: 0.81066\tvalid_0's binary_logloss: 0.464435\n", + "[61]\tvalid_0's auc: 0.810638\tvalid_0's binary_logloss: 0.463721\n", + "[62]\tvalid_0's auc: 0.810658\tvalid_0's binary_logloss: 0.462982\n", + "[63]\tvalid_0's auc: 0.811106\tvalid_0's binary_logloss: 0.462246\n", + "[64]\tvalid_0's auc: 0.811313\tvalid_0's binary_logloss: 0.461748\n", + "[65]\tvalid_0's auc: 0.811351\tvalid_0's binary_logloss: 0.461038\n", + "[66]\tvalid_0's auc: 0.811433\tvalid_0's binary_logloss: 0.460323\n", + "[67]\tvalid_0's auc: 0.81158\tvalid_0's binary_logloss: 0.459662\n", + "[68]\tvalid_0's auc: 0.811561\tvalid_0's binary_logloss: 0.458988\n", + "[69]\tvalid_0's auc: 0.811748\tvalid_0's binary_logloss: 0.458592\n", + "[70]\tvalid_0's auc: 0.811919\tvalid_0's binary_logloss: 0.457934\n", + "[71]\tvalid_0's auc: 0.812073\tvalid_0's binary_logloss: 0.457508\n", + "[72]\tvalid_0's auc: 0.812273\tvalid_0's 
binary_logloss: 0.457038\n", + "[73]\tvalid_0's auc: 0.812561\tvalid_0's binary_logloss: 0.456439\n", + "[74]\tvalid_0's auc: 0.812633\tvalid_0's binary_logloss: 0.455789\n", + "[75]\tvalid_0's auc: 0.812757\tvalid_0's binary_logloss: 0.455173\n", + "[76]\tvalid_0's auc: 0.812923\tvalid_0's binary_logloss: 0.454533\n", + "[77]\tvalid_0's auc: 0.81295\tvalid_0's binary_logloss: 0.45392\n", + "[78]\tvalid_0's auc: 0.813073\tvalid_0's binary_logloss: 0.453517\n", + "[79]\tvalid_0's auc: 0.813202\tvalid_0's binary_logloss: 0.452932\n", + "[80]\tvalid_0's auc: 0.813611\tvalid_0's binary_logloss: 0.452285\n", + "[81]\tvalid_0's auc: 0.813769\tvalid_0's binary_logloss: 0.45191\n", + "[82]\tvalid_0's auc: 0.814468\tvalid_0's binary_logloss: 0.451455\n", + "[83]\tvalid_0's auc: 0.814656\tvalid_0's binary_logloss: 0.450885\n", + "[84]\tvalid_0's auc: 0.814755\tvalid_0's binary_logloss: 0.450308\n", + "[85]\tvalid_0's auc: 0.814824\tvalid_0's binary_logloss: 0.449739\n", + "[86]\tvalid_0's auc: 0.81499\tvalid_0's binary_logloss: 0.449348\n", + "[87]\tvalid_0's auc: 0.815232\tvalid_0's binary_logloss: 0.448759\n", + "[88]\tvalid_0's auc: 0.815452\tvalid_0's binary_logloss: 0.44823\n", + "[89]\tvalid_0's auc: 0.815593\tvalid_0's binary_logloss: 0.447861\n", + "[90]\tvalid_0's auc: 0.815591\tvalid_0's binary_logloss: 0.447323\n", + "[91]\tvalid_0's auc: 0.815672\tvalid_0's binary_logloss: 0.446796\n", + "[92]\tvalid_0's auc: 0.815875\tvalid_0's binary_logloss: 0.446472\n", + "[93]\tvalid_0's auc: 0.815984\tvalid_0's binary_logloss: 0.445961\n", + "[94]\tvalid_0's auc: 0.816026\tvalid_0's binary_logloss: 0.445439\n", + "[95]\tvalid_0's auc: 0.816172\tvalid_0's binary_logloss: 0.444909\n", + "[96]\tvalid_0's auc: 0.816321\tvalid_0's binary_logloss: 0.444413\n", + "[97]\tvalid_0's auc: 0.816751\tvalid_0's binary_logloss: 0.44405\n", + "[98]\tvalid_0's auc: 0.817226\tvalid_0's binary_logloss: 0.443626\n", + "[99]\tvalid_0's auc: 0.817286\tvalid_0's binary_logloss: 0.443136\n", + 
"[100]\tvalid_0's auc: 0.817391\tvalid_0's binary_logloss: 0.442854\n", + "Did not meet early stopping. Best iteration is:\n", + "[100]\tvalid_0's auc: 0.817391\tvalid_0's binary_logloss: 0.442854\n", + "[1]\tvalid_0's auc: 0.771584\tvalid_0's binary_logloss: 0.527139\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's auc: 0.775446\tvalid_0's binary_logloss: 0.525462\n", + "[3]\tvalid_0's auc: 0.790092\tvalid_0's binary_logloss: 0.524461\n", + "[4]\tvalid_0's auc: 0.791432\tvalid_0's binary_logloss: 0.523322\n", + "[5]\tvalid_0's auc: 0.797482\tvalid_0's binary_logloss: 0.521614\n", + "[6]\tvalid_0's auc: 0.79893\tvalid_0's binary_logloss: 0.520007\n", + "[7]\tvalid_0's auc: 0.800753\tvalid_0's binary_logloss: 0.5187\n", + "[8]\tvalid_0's auc: 0.802197\tvalid_0's binary_logloss: 0.517125\n", + "[9]\tvalid_0's auc: 0.802828\tvalid_0's binary_logloss: 0.516269\n", + "[10]\tvalid_0's auc: 0.803496\tvalid_0's binary_logloss: 0.51474\n", + "[11]\tvalid_0's auc: 0.804972\tvalid_0's binary_logloss: 0.513321\n", + "[12]\tvalid_0's auc: 0.804995\tvalid_0's binary_logloss: 0.512334\n", + "[13]\tvalid_0's auc: 0.80525\tvalid_0's binary_logloss: 0.51151\n", + "[14]\tvalid_0's auc: 0.805026\tvalid_0's binary_logloss: 0.510149\n", + "[15]\tvalid_0's auc: 0.805622\tvalid_0's binary_logloss: 0.508708\n", + "[16]\tvalid_0's auc: 0.806974\tvalid_0's binary_logloss: 0.507384\n", + "[17]\tvalid_0's auc: 0.807045\tvalid_0's binary_logloss: 0.506017\n", + "[18]\tvalid_0's auc: 0.807265\tvalid_0's binary_logloss: 0.504853\n", + "[19]\tvalid_0's auc: 0.807126\tvalid_0's binary_logloss: 0.503972\n", + "[20]\tvalid_0's auc: 0.806948\tvalid_0's binary_logloss: 0.502693\n", + "[21]\tvalid_0's auc: 0.807315\tvalid_0's binary_logloss: 0.502166\n", + "[22]\tvalid_0's auc: 0.808067\tvalid_0's binary_logloss: 0.500948\n", + "[23]\tvalid_0's auc: 0.808226\tvalid_0's binary_logloss: 0.49987\n", + "[24]\tvalid_0's auc: 0.808268\tvalid_0's binary_logloss: 
0.498623\n", + "[25]\tvalid_0's auc: 0.808569\tvalid_0's binary_logloss: 0.497389\n", + "[26]\tvalid_0's auc: 0.809069\tvalid_0's binary_logloss: 0.49624\n", + "[27]\tvalid_0's auc: 0.809312\tvalid_0's binary_logloss: 0.495095\n", + "[28]\tvalid_0's auc: 0.809549\tvalid_0's binary_logloss: 0.494012\n", + "[29]\tvalid_0's auc: 0.809944\tvalid_0's binary_logloss: 0.492834\n", + "[30]\tvalid_0's auc: 0.810047\tvalid_0's binary_logloss: 0.491735\n", + "[31]\tvalid_0's auc: 0.810086\tvalid_0's binary_logloss: 0.490633\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[32]\tvalid_0's auc: 0.810566\tvalid_0's binary_logloss: 0.489595\n", + "[33]\tvalid_0's auc: 0.810539\tvalid_0's binary_logloss: 0.488536\n", + "[34]\tvalid_0's auc: 0.810529\tvalid_0's binary_logloss: 0.487489\n", + "[35]\tvalid_0's auc: 0.810932\tvalid_0's binary_logloss: 0.486775\n", + "[36]\tvalid_0's auc: 0.810769\tvalid_0's binary_logloss: 0.48577\n", + "[37]\tvalid_0's auc: 0.811363\tvalid_0's binary_logloss: 0.485123\n", + "[38]\tvalid_0's auc: 0.811801\tvalid_0's binary_logloss: 0.484413\n", + "[39]\tvalid_0's auc: 0.811987\tvalid_0's binary_logloss: 0.483371\n", + "[40]\tvalid_0's auc: 0.812268\tvalid_0's binary_logloss: 0.482407\n", + "[41]\tvalid_0's auc: 0.813297\tvalid_0's binary_logloss: 0.481742\n", + "[42]\tvalid_0's auc: 0.813453\tvalid_0's binary_logloss: 0.481108\n", + "[43]\tvalid_0's auc: 0.813603\tvalid_0's binary_logloss: 0.480163\n", + "[44]\tvalid_0's auc: 0.813654\tvalid_0's binary_logloss: 0.479239\n", + "[45]\tvalid_0's auc: 0.814267\tvalid_0's binary_logloss: 0.478299\n", + "[46]\tvalid_0's auc: 0.81455\tvalid_0's binary_logloss: 0.477678\n", + "[47]\tvalid_0's auc: 0.81452\tvalid_0's binary_logloss: 0.476766\n", + "[48]\tvalid_0's auc: 0.814925\tvalid_0's binary_logloss: 0.475815\n", + "[49]\tvalid_0's auc: 0.814907\tvalid_0's binary_logloss: 0.47503\n", + "[50]\tvalid_0's auc: 0.815278\tvalid_0's binary_logloss: 0.474588\n", + "[51]\tvalid_0's 
auc: 0.815535\tvalid_0's binary_logloss: 0.474171\n", + "[52]\tvalid_0's auc: 0.815685\tvalid_0's binary_logloss: 0.473335\n", + "[53]\tvalid_0's auc: 0.815787\tvalid_0's binary_logloss: 0.472509\n", + "[54]\tvalid_0's auc: 0.815827\tvalid_0's binary_logloss: 0.471686\n", + "[55]\tvalid_0's auc: 0.815871\tvalid_0's binary_logloss: 0.470838\n", + "[56]\tvalid_0's auc: 0.816238\tvalid_0's binary_logloss: 0.470285\n", + "[57]\tvalid_0's auc: 0.816269\tvalid_0's binary_logloss: 0.469495\n", + "[58]\tvalid_0's auc: 0.816528\tvalid_0's binary_logloss: 0.468654\n", + "[59]\tvalid_0's auc: 0.816706\tvalid_0's binary_logloss: 0.468122\n", + "[60]\tvalid_0's auc: 0.816821\tvalid_0's binary_logloss: 0.467352\n", + "[61]\tvalid_0's auc: 0.816759\tvalid_0's binary_logloss: 0.466622\n", + "[62]\tvalid_0's auc: 0.81682\tvalid_0's binary_logloss: 0.465867\n", + "[63]\tvalid_0's auc: 0.817251\tvalid_0's binary_logloss: 0.465112\n", + "[64]\tvalid_0's auc: 0.817476\tvalid_0's binary_logloss: 0.464589\n", + "[65]\tvalid_0's auc: 0.817613\tvalid_0's binary_logloss: 0.463831\n", + "[66]\tvalid_0's auc: 0.817648\tvalid_0's binary_logloss: 0.463098\n", + "[67]\tvalid_0's auc: 0.817719\tvalid_0's binary_logloss: 0.462414\n", + "[68]\tvalid_0's auc: 0.817814\tvalid_0's binary_logloss: 0.461727\n", + "[69]\tvalid_0's auc: 0.817973\tvalid_0's binary_logloss: 0.461329\n", + "[70]\tvalid_0's auc: 0.818108\tvalid_0's binary_logloss: 0.460674\n", + "[71]\tvalid_0's auc: 0.818347\tvalid_0's binary_logloss: 0.460222\n", + "[72]\tvalid_0's auc: 0.818456\tvalid_0's binary_logloss: 0.45977\n", + "[73]\tvalid_0's auc: 0.818727\tvalid_0's binary_logloss: 0.459157\n", + "[74]\tvalid_0's auc: 0.818988\tvalid_0's binary_logloss: 0.458437\n", + "[75]\tvalid_0's auc: 0.819144\tvalid_0's binary_logloss: 0.457808\n", + "[76]\tvalid_0's auc: 0.819259\tvalid_0's binary_logloss: 0.457159\n", + "[77]\tvalid_0's auc: 0.819343\tvalid_0's binary_logloss: 0.456512\n", + "[78]\tvalid_0's auc: 0.81954\tvalid_0's 
binary_logloss: 0.456045\n", + "[79]\tvalid_0's auc: 0.819687\tvalid_0's binary_logloss: 0.455416\n", + "[80]\tvalid_0's auc: 0.819958\tvalid_0's binary_logloss: 0.454765\n", + "[81]\tvalid_0's auc: 0.820115\tvalid_0's binary_logloss: 0.45436\n", + "[82]\tvalid_0's auc: 0.820536\tvalid_0's binary_logloss: 0.453965\n", + "[83]\tvalid_0's auc: 0.820649\tvalid_0's binary_logloss: 0.453383\n", + "[84]\tvalid_0's auc: 0.820663\tvalid_0's binary_logloss: 0.452804\n", + "[85]\tvalid_0's auc: 0.820809\tvalid_0's binary_logloss: 0.452167\n", + "[86]\tvalid_0's auc: 0.821024\tvalid_0's binary_logloss: 0.451735\n", + "[87]\tvalid_0's auc: 0.821124\tvalid_0's binary_logloss: 0.451167\n", + "[88]\tvalid_0's auc: 0.821243\tvalid_0's binary_logloss: 0.45061\n", + "[89]\tvalid_0's auc: 0.821404\tvalid_0's binary_logloss: 0.450215\n", + "[90]\tvalid_0's auc: 0.821488\tvalid_0's binary_logloss: 0.449656\n", + "[91]\tvalid_0's auc: 0.821538\tvalid_0's binary_logloss: 0.449107\n", + "[92]\tvalid_0's auc: 0.82172\tvalid_0's binary_logloss: 0.448752\n", + "[93]\tvalid_0's auc: 0.821809\tvalid_0's binary_logloss: 0.448188\n", + "[94]\tvalid_0's auc: 0.82184\tvalid_0's binary_logloss: 0.447659\n", + "[95]\tvalid_0's auc: 0.821971\tvalid_0's binary_logloss: 0.447108\n", + "[96]\tvalid_0's auc: 0.822086\tvalid_0's binary_logloss: 0.446596\n", + "[97]\tvalid_0's auc: 0.82247\tvalid_0's binary_logloss: 0.446244\n", + "[98]\tvalid_0's auc: 0.822951\tvalid_0's binary_logloss: 0.445812\n", + "[99]\tvalid_0's auc: 0.822991\tvalid_0's binary_logloss: 0.445329\n", + "[100]\tvalid_0's auc: 0.823174\tvalid_0's binary_logloss: 0.445037\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's auc: 0.823174\tvalid_0's binary_logloss: 0.445037\n", + "[1]\tvalid_0's auc: 0.769525\tvalid_0's binary_logloss: 0.526256\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's auc: 0.775857\tvalid_0's binary_logloss: 0.524594\n", + "[3]\tvalid_0's auc: 0.785307\tvalid_0's binary_logloss: 0.523606\n", + "[4]\tvalid_0's auc: 0.786356\tvalid_0's binary_logloss: 0.522495\n", + "[5]\tvalid_0's auc: 0.793385\tvalid_0's binary_logloss: 0.520812\n", + "[6]\tvalid_0's auc: 0.794014\tvalid_0's binary_logloss: 0.519253\n", + "[7]\tvalid_0's auc: 0.795454\tvalid_0's binary_logloss: 0.517961\n", + "[8]\tvalid_0's auc: 0.79807\tvalid_0's binary_logloss: 0.516363\n", + "[9]\tvalid_0's auc: 0.798756\tvalid_0's binary_logloss: 0.51548\n", + "[10]\tvalid_0's auc: 0.798314\tvalid_0's binary_logloss: 0.514021\n", + "[11]\tvalid_0's auc: 0.799343\tvalid_0's binary_logloss: 0.512678\n", + "[12]\tvalid_0's auc: 0.799573\tvalid_0's binary_logloss: 0.511708\n", + "[13]\tvalid_0's auc: 0.799563\tvalid_0's binary_logloss: 0.510892\n", + "[14]\tvalid_0's auc: 0.800333\tvalid_0's binary_logloss: 0.509532\n", + "[15]\tvalid_0's auc: 0.800672\tvalid_0's binary_logloss: 0.508117\n", + "[16]\tvalid_0's auc: 0.801953\tvalid_0's binary_logloss: 0.506866\n", + "[17]\tvalid_0's auc: 0.802078\tvalid_0's binary_logloss: 0.5055\n", + "[18]\tvalid_0's auc: 0.802449\tvalid_0's binary_logloss: 0.504358\n", + "[19]\tvalid_0's auc: 0.802329\tvalid_0's binary_logloss: 0.503503\n", + "[20]\tvalid_0's auc: 0.802437\tvalid_0's binary_logloss: 0.502233\n", + "[21]\tvalid_0's auc: 0.802653\tvalid_0's binary_logloss: 0.50174\n", + "[22]\tvalid_0's auc: 0.803753\tvalid_0's binary_logloss: 0.50056\n", + "[23]\tvalid_0's auc: 0.803956\tvalid_0's binary_logloss: 0.499496\n", + "[24]\tvalid_0's auc: 0.804231\tvalid_0's binary_logloss: 0.498283\n", + "[25]\tvalid_0's auc: 0.804554\tvalid_0's binary_logloss: 0.497059\n", + "[26]\tvalid_0's auc: 
0.805133\tvalid_0's binary_logloss: 0.495963\n", + "[27]\tvalid_0's auc: 0.805333\tvalid_0's binary_logloss: 0.494842\n", + "[28]\tvalid_0's auc: 0.805644\tvalid_0's binary_logloss: 0.493771\n", + "[29]\tvalid_0's auc: 0.806029\tvalid_0's binary_logloss: 0.492598\n", + "[30]\tvalid_0's auc: 0.806321\tvalid_0's binary_logloss: 0.491474\n", + "[31]\tvalid_0's auc: 0.806201\tvalid_0's binary_logloss: 0.490419\n", + "[32]\tvalid_0's auc: 0.806671\tvalid_0's binary_logloss: 0.489393\n", + "[33]\tvalid_0's auc: 0.806899\tvalid_0's binary_logloss: 0.488331\n", + "[34]\tvalid_0's auc: 0.807105\tvalid_0's binary_logloss: 0.487277\n", + "[35]\tvalid_0's auc: 0.807257\tvalid_0's binary_logloss: 0.486592\n", + "[36]\tvalid_0's auc: 0.80729\tvalid_0's binary_logloss: 0.485607\n", + "[37]\tvalid_0's auc: 0.807752\tvalid_0's binary_logloss: 0.484951\n", + "[38]\tvalid_0's auc: 0.808191\tvalid_0's binary_logloss: 0.484269\n", + "[39]\tvalid_0's auc: 0.808417\tvalid_0's binary_logloss: 0.483242\n", + "[40]\tvalid_0's auc: 0.808761\tvalid_0's binary_logloss: 0.482291\n", + "[41]\tvalid_0's auc: 0.80965\tvalid_0's binary_logloss: 0.48164\n", + "[42]\tvalid_0's auc: 0.810065\tvalid_0's binary_logloss: 0.480962\n", + "[43]\tvalid_0's auc: 0.810209\tvalid_0's binary_logloss: 0.479995\n", + "[44]\tvalid_0's auc: 0.810091\tvalid_0's binary_logloss: 0.479077\n", + "[45]\tvalid_0's auc: 0.810573\tvalid_0's binary_logloss: 0.478185\n", + "[46]\tvalid_0's auc: 0.810924\tvalid_0's binary_logloss: 0.477558\n", + "[47]\tvalid_0's auc: 0.810951\tvalid_0's binary_logloss: 0.476662\n", + "[48]\tvalid_0's auc: 0.811101\tvalid_0's binary_logloss: 0.475745\n", + "[49]\tvalid_0's auc: 0.811269\tvalid_0's binary_logloss: 0.474951\n", + "[50]\tvalid_0's auc: 0.81173\tvalid_0's binary_logloss: 0.474514\n", + "[51]\tvalid_0's auc: 0.811937\tvalid_0's binary_logloss: 0.474114\n", + "[52]\tvalid_0's auc: 0.812136\tvalid_0's binary_logloss: 0.473297\n", + "[53]\tvalid_0's auc: 0.812249\tvalid_0's 
binary_logloss: 0.472497\n", + "[54]\tvalid_0's auc: 0.812121\tvalid_0's binary_logloss: 0.471696\n", + "[55]\tvalid_0's auc: 0.812164\tvalid_0's binary_logloss: 0.470905\n", + "[56]\tvalid_0's auc: 0.812462\tvalid_0's binary_logloss: 0.470384\n", + "[57]\tvalid_0's auc: 0.812613\tvalid_0's binary_logloss: 0.4696\n", + "[58]\tvalid_0's auc: 0.812615\tvalid_0's binary_logloss: 0.468778\n", + "[59]\tvalid_0's auc: 0.812842\tvalid_0's binary_logloss: 0.468211\n", + "[60]\tvalid_0's auc: 0.81312\tvalid_0's binary_logloss: 0.467385\n", + "[61]\tvalid_0's auc: 0.813039\tvalid_0's binary_logloss: 0.466632\n", + "[62]\tvalid_0's auc: 0.812942\tvalid_0's binary_logloss: 0.465933\n", + "[63]\tvalid_0's auc: 0.813274\tvalid_0's binary_logloss: 0.465214\n", + "[64]\tvalid_0's auc: 0.813572\tvalid_0's binary_logloss: 0.464692\n", + "[65]\tvalid_0's auc: 0.813594\tvalid_0's binary_logloss: 0.463925\n", + "[66]\tvalid_0's auc: 0.813719\tvalid_0's binary_logloss: 0.463177\n", + "[67]\tvalid_0's auc: 0.814011\tvalid_0's binary_logloss: 0.462513\n", + "[68]\tvalid_0's auc: 0.813989\tvalid_0's binary_logloss: 0.461843\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[69]\tvalid_0's auc: 0.814218\tvalid_0's binary_logloss: 0.461443\n", + "[70]\tvalid_0's auc: 0.814334\tvalid_0's binary_logloss: 0.460775\n", + "[71]\tvalid_0's auc: 0.814493\tvalid_0's binary_logloss: 0.460332\n", + "[72]\tvalid_0's auc: 0.814663\tvalid_0's binary_logloss: 0.459867\n", + "[73]\tvalid_0's auc: 0.814856\tvalid_0's binary_logloss: 0.459266\n", + "[74]\tvalid_0's auc: 0.815017\tvalid_0's binary_logloss: 0.458585\n", + "[75]\tvalid_0's auc: 0.815186\tvalid_0's binary_logloss: 0.457958\n", + "[76]\tvalid_0's auc: 0.815374\tvalid_0's binary_logloss: 0.457316\n", + "[77]\tvalid_0's auc: 0.81554\tvalid_0's binary_logloss: 0.45665\n", + "[78]\tvalid_0's auc: 0.81569\tvalid_0's binary_logloss: 0.456217\n", + "[79]\tvalid_0's auc: 0.815861\tvalid_0's binary_logloss: 0.455615\n", + 
"[80]\tvalid_0's auc: 0.816443\tvalid_0's binary_logloss: 0.454895\n", + "[81]\tvalid_0's auc: 0.816659\tvalid_0's binary_logloss: 0.454503\n", + "[82]\tvalid_0's auc: 0.817017\tvalid_0's binary_logloss: 0.454149\n", + "[83]\tvalid_0's auc: 0.817162\tvalid_0's binary_logloss: 0.453578\n", + "[84]\tvalid_0's auc: 0.817274\tvalid_0's binary_logloss: 0.452984\n", + "[85]\tvalid_0's auc: 0.817283\tvalid_0's binary_logloss: 0.452416\n", + "[86]\tvalid_0's auc: 0.817339\tvalid_0's binary_logloss: 0.452022\n", + "[87]\tvalid_0's auc: 0.817494\tvalid_0's binary_logloss: 0.45146\n", + "[88]\tvalid_0's auc: 0.817594\tvalid_0's binary_logloss: 0.450926\n", + "[89]\tvalid_0's auc: 0.817771\tvalid_0's binary_logloss: 0.450553\n", + "[90]\tvalid_0's auc: 0.81789\tvalid_0's binary_logloss: 0.449985\n", + "[91]\tvalid_0's auc: 0.817931\tvalid_0's binary_logloss: 0.449439\n", + "[92]\tvalid_0's auc: 0.818138\tvalid_0's binary_logloss: 0.449094\n", + "[93]\tvalid_0's auc: 0.818334\tvalid_0's binary_logloss: 0.448527\n", + "[94]\tvalid_0's auc: 0.818426\tvalid_0's binary_logloss: 0.447989\n", + "[95]\tvalid_0's auc: 0.818676\tvalid_0's binary_logloss: 0.447407\n", + "[96]\tvalid_0's auc: 0.818852\tvalid_0's binary_logloss: 0.446884\n", + "[97]\tvalid_0's auc: 0.81945\tvalid_0's binary_logloss: 0.446455\n", + "[98]\tvalid_0's auc: 0.819861\tvalid_0's binary_logloss: 0.446045\n", + "[99]\tvalid_0's auc: 0.819943\tvalid_0's binary_logloss: 0.445543\n", + "[100]\tvalid_0's auc: 0.820076\tvalid_0's binary_logloss: 0.445258\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's auc: 0.820076\tvalid_0's binary_logloss: 0.445258\n", + "[1]\tvalid_0's auc: 0.770032\tvalid_0's binary_logloss: 0.527241\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's auc: 0.779881\tvalid_0's binary_logloss: 0.525545\n", + "[3]\tvalid_0's auc: 0.791308\tvalid_0's binary_logloss: 0.524508\n", + "[4]\tvalid_0's auc: 0.790788\tvalid_0's binary_logloss: 0.52341\n", + "[5]\tvalid_0's auc: 0.795645\tvalid_0's binary_logloss: 0.521753\n", + "[6]\tvalid_0's auc: 0.797745\tvalid_0's binary_logloss: 0.520131\n", + "[7]\tvalid_0's auc: 0.79931\tvalid_0's binary_logloss: 0.518872\n", + "[8]\tvalid_0's auc: 0.800014\tvalid_0's binary_logloss: 0.517353\n", + "[9]\tvalid_0's auc: 0.800549\tvalid_0's binary_logloss: 0.516487\n", + "[10]\tvalid_0's auc: 0.800261\tvalid_0's binary_logloss: 0.515039\n", + "[11]\tvalid_0's auc: 0.801261\tvalid_0's binary_logloss: 0.513695\n", + "[12]\tvalid_0's auc: 0.801062\tvalid_0's binary_logloss: 0.512735\n", + "[13]\tvalid_0's auc: 0.801155\tvalid_0's binary_logloss: 0.51192\n", + "[14]\tvalid_0's auc: 0.801315\tvalid_0's binary_logloss: 0.510559\n", + "[15]\tvalid_0's auc: 0.80185\tvalid_0's binary_logloss: 0.509147\n", + "[16]\tvalid_0's auc: 0.803029\tvalid_0's binary_logloss: 0.507914\n", + "[17]\tvalid_0's auc: 0.803035\tvalid_0's binary_logloss: 0.506583\n", + "[18]\tvalid_0's auc: 0.803433\tvalid_0's binary_logloss: 0.505441\n", + "[19]\tvalid_0's auc: 0.803717\tvalid_0's binary_logloss: 0.504599\n", + "[20]\tvalid_0's auc: 0.803819\tvalid_0's binary_logloss: 0.503327\n", + "[21]\tvalid_0's auc: 0.803923\tvalid_0's binary_logloss: 0.502782\n", + "[22]\tvalid_0's auc: 0.804939\tvalid_0's binary_logloss: 0.501596\n", + "[23]\tvalid_0's auc: 0.804707\tvalid_0's binary_logloss: 0.500572\n", + "[24]\tvalid_0's auc: 0.804632\tvalid_0's binary_logloss: 0.499367\n", + "[25]\tvalid_0's auc: 0.804756\tvalid_0's binary_logloss: 0.498161\n", + "[26]\tvalid_0's 
auc: 0.805067\tvalid_0's binary_logloss: 0.497061\n", + "[27]\tvalid_0's auc: 0.805119\tvalid_0's binary_logloss: 0.495933\n", + "[28]\tvalid_0's auc: 0.805304\tvalid_0's binary_logloss: 0.494849\n", + "[29]\tvalid_0's auc: 0.805688\tvalid_0's binary_logloss: 0.493677\n", + "[30]\tvalid_0's auc: 0.805822\tvalid_0's binary_logloss: 0.492594\n", + "[31]\tvalid_0's auc: 0.805869\tvalid_0's binary_logloss: 0.49152\n", + "[32]\tvalid_0's auc: 0.807267\tvalid_0's binary_logloss: 0.490435\n", + "[33]\tvalid_0's auc: 0.807301\tvalid_0's binary_logloss: 0.489392\n", + "[34]\tvalid_0's auc: 0.80736\tvalid_0's binary_logloss: 0.488325\n", + "[35]\tvalid_0's auc: 0.807706\tvalid_0's binary_logloss: 0.487654\n", + "[36]\tvalid_0's auc: 0.807758\tvalid_0's binary_logloss: 0.486651\n", + "[37]\tvalid_0's auc: 0.808051\tvalid_0's binary_logloss: 0.486012\n", + "[38]\tvalid_0's auc: 0.808429\tvalid_0's binary_logloss: 0.485355\n", + "[39]\tvalid_0's auc: 0.808663\tvalid_0's binary_logloss: 0.484327\n", + "[40]\tvalid_0's auc: 0.809007\tvalid_0's binary_logloss: 0.483386\n", + "[41]\tvalid_0's auc: 0.809781\tvalid_0's binary_logloss: 0.482745\n", + "[42]\tvalid_0's auc: 0.810071\tvalid_0's binary_logloss: 0.482124\n", + "[43]\tvalid_0's auc: 0.810383\tvalid_0's binary_logloss: 0.481154\n", + "[44]\tvalid_0's auc: 0.810446\tvalid_0's binary_logloss: 0.480243\n", + "[45]\tvalid_0's auc: 0.811148\tvalid_0's binary_logloss: 0.479261\n", + "[46]\tvalid_0's auc: 0.811245\tvalid_0's binary_logloss: 0.478687\n", + "[47]\tvalid_0's auc: 0.811214\tvalid_0's binary_logloss: 0.477812\n", + "[48]\tvalid_0's auc: 0.811408\tvalid_0's binary_logloss: 0.47689\n", + "[49]\tvalid_0's auc: 0.811486\tvalid_0's binary_logloss: 0.476132\n", + "[50]\tvalid_0's auc: 0.811806\tvalid_0's binary_logloss: 0.475718\n", + "[51]\tvalid_0's auc: 0.812017\tvalid_0's binary_logloss: 0.475342\n", + "[52]\tvalid_0's auc: 0.812255\tvalid_0's binary_logloss: 0.474505\n", + "[53]\tvalid_0's auc: 0.812249\tvalid_0's 
binary_logloss: 0.473707\n", + "[54]\tvalid_0's auc: 0.812235\tvalid_0's binary_logloss: 0.47289\n", + "[55]\tvalid_0's auc: 0.812233\tvalid_0's binary_logloss: 0.472091\n", + "[56]\tvalid_0's auc: 0.812492\tvalid_0's binary_logloss: 0.471563\n", + "[57]\tvalid_0's auc: 0.812579\tvalid_0's binary_logloss: 0.47077\n", + "[58]\tvalid_0's auc: 0.812598\tvalid_0's binary_logloss: 0.469992\n", + "[59]\tvalid_0's auc: 0.812885\tvalid_0's binary_logloss: 0.469458\n", + "[60]\tvalid_0's auc: 0.812995\tvalid_0's binary_logloss: 0.468676\n", + "[61]\tvalid_0's auc: 0.812961\tvalid_0's binary_logloss: 0.467939\n", + "[62]\tvalid_0's auc: 0.812919\tvalid_0's binary_logloss: 0.467232\n", + "[63]\tvalid_0's auc: 0.813291\tvalid_0's binary_logloss: 0.466491\n", + "[64]\tvalid_0's auc: 0.813702\tvalid_0's binary_logloss: 0.465945\n", + "[65]\tvalid_0's auc: 0.813803\tvalid_0's binary_logloss: 0.465197\n", + "[66]\tvalid_0's auc: 0.813851\tvalid_0's binary_logloss: 0.4645\n", + "[67]\tvalid_0's auc: 0.814011\tvalid_0's binary_logloss: 0.463814\n", + "[68]\tvalid_0's auc: 0.814027\tvalid_0's binary_logloss: 0.463113\n", + "[69]\tvalid_0's auc: 0.814138\tvalid_0's binary_logloss: 0.462727\n", + "[70]\tvalid_0's auc: 0.814365\tvalid_0's binary_logloss: 0.462077\n", + "[71]\tvalid_0's auc: 0.814432\tvalid_0's binary_logloss: 0.461655\n", + "[72]\tvalid_0's auc: 0.8146\tvalid_0's binary_logloss: 0.461194\n", + "[73]\tvalid_0's auc: 0.815324\tvalid_0's binary_logloss: 0.460477\n", + "[74]\tvalid_0's auc: 0.815411\tvalid_0's binary_logloss: 0.459805\n", + "[75]\tvalid_0's auc: 0.815548\tvalid_0's binary_logloss: 0.459189\n", + "[76]\tvalid_0's auc: 0.815625\tvalid_0's binary_logloss: 0.458525\n", + "[77]\tvalid_0's auc: 0.81562\tvalid_0's binary_logloss: 0.457905\n", + "[78]\tvalid_0's auc: 0.815786\tvalid_0's binary_logloss: 0.45747\n", + "[79]\tvalid_0's auc: 0.815834\tvalid_0's binary_logloss: 0.456884\n", + "[80]\tvalid_0's auc: 0.816475\tvalid_0's binary_logloss: 0.45617\n", + 
"[81]\tvalid_0's auc: 0.816677\tvalid_0's binary_logloss: 0.455787\n", + "[82]\tvalid_0's auc: 0.817255\tvalid_0's binary_logloss: 0.455358\n", + "[83]\tvalid_0's auc: 0.817383\tvalid_0's binary_logloss: 0.454775\n", + "[84]\tvalid_0's auc: 0.817509\tvalid_0's binary_logloss: 0.454176\n", + "[85]\tvalid_0's auc: 0.817572\tvalid_0's binary_logloss: 0.453609\n", + "[86]\tvalid_0's auc: 0.817721\tvalid_0's binary_logloss: 0.453213\n", + "[87]\tvalid_0's auc: 0.817992\tvalid_0's binary_logloss: 0.452586\n", + "[88]\tvalid_0's auc: 0.81808\tvalid_0's binary_logloss: 0.45204\n", + "[89]\tvalid_0's auc: 0.818202\tvalid_0's binary_logloss: 0.451643\n", + "[90]\tvalid_0's auc: 0.818336\tvalid_0's binary_logloss: 0.451081\n", + "[91]\tvalid_0's auc: 0.818347\tvalid_0's binary_logloss: 0.450531\n", + "[92]\tvalid_0's auc: 0.818558\tvalid_0's binary_logloss: 0.450179\n", + "[93]\tvalid_0's auc: 0.818743\tvalid_0's binary_logloss: 0.449647\n", + "[94]\tvalid_0's auc: 0.818789\tvalid_0's binary_logloss: 0.449133\n", + "[95]\tvalid_0's auc: 0.818849\tvalid_0's binary_logloss: 0.44862\n", + "[96]\tvalid_0's auc: 0.81913\tvalid_0's binary_logloss: 0.448072\n", + "[97]\tvalid_0's auc: 0.819526\tvalid_0's binary_logloss: 0.447713\n", + "[98]\tvalid_0's auc: 0.819971\tvalid_0's binary_logloss: 0.447296\n", + "[99]\tvalid_0's auc: 0.819972\tvalid_0's binary_logloss: 0.446814\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[100]\tvalid_0's auc: 0.820086\tvalid_0's binary_logloss: 0.446533\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's auc: 0.820086\tvalid_0's binary_logloss: 0.446533\n", + "[1]\tvalid_0's auc: 0.768646\tvalid_0's binary_logloss: 0.527167\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's auc: 0.779902\tvalid_0's binary_logloss: 0.525481\n", + "[3]\tvalid_0's auc: 0.789868\tvalid_0's binary_logloss: 0.524485\n", + "[4]\tvalid_0's auc: 0.791895\tvalid_0's binary_logloss: 0.523382\n", + "[5]\tvalid_0's auc: 0.795453\tvalid_0's binary_logloss: 0.521759\n", + "[6]\tvalid_0's auc: 0.796672\tvalid_0's binary_logloss: 0.520166\n", + "[7]\tvalid_0's auc: 0.798023\tvalid_0's binary_logloss: 0.518857\n", + "[8]\tvalid_0's auc: 0.799331\tvalid_0's binary_logloss: 0.517297\n", + "[9]\tvalid_0's auc: 0.800181\tvalid_0's binary_logloss: 0.516416\n", + "[10]\tvalid_0's auc: 0.800373\tvalid_0's binary_logloss: 0.514967\n", + "[11]\tvalid_0's auc: 0.801087\tvalid_0's binary_logloss: 0.513631\n", + "[12]\tvalid_0's auc: 0.801122\tvalid_0's binary_logloss: 0.512658\n", + "[13]\tvalid_0's auc: 0.801043\tvalid_0's binary_logloss: 0.511833\n", + "[14]\tvalid_0's auc: 0.801238\tvalid_0's binary_logloss: 0.510461\n", + "[15]\tvalid_0's auc: 0.801847\tvalid_0's binary_logloss: 0.509034\n", + "[16]\tvalid_0's auc: 0.803139\tvalid_0's binary_logloss: 0.507759\n", + "[17]\tvalid_0's auc: 0.803577\tvalid_0's binary_logloss: 0.506361\n", + "[18]\tvalid_0's auc: 0.803834\tvalid_0's binary_logloss: 0.505229\n", + "[19]\tvalid_0's auc: 0.803943\tvalid_0's binary_logloss: 0.504371\n", + "[20]\tvalid_0's auc: 0.80415\tvalid_0's binary_logloss: 0.503102\n", + "[21]\tvalid_0's auc: 0.804446\tvalid_0's binary_logloss: 0.502564\n", + "[22]\tvalid_0's auc: 0.805163\tvalid_0's binary_logloss: 0.501396\n", + "[23]\tvalid_0's auc: 0.805323\tvalid_0's binary_logloss: 0.500327\n", + "[24]\tvalid_0's auc: 0.805314\tvalid_0's binary_logloss: 0.499123\n", + "[25]\tvalid_0's auc: 0.80535\tvalid_0's binary_logloss: 0.497927\n", + "[26]\tvalid_0's 
auc: 0.805864\tvalid_0's binary_logloss: 0.496834\n", + "[27]\tvalid_0's auc: 0.805919\tvalid_0's binary_logloss: 0.495667\n", + "[28]\tvalid_0's auc: 0.806272\tvalid_0's binary_logloss: 0.494606\n", + "[29]\tvalid_0's auc: 0.806599\tvalid_0's binary_logloss: 0.49343\n", + "[30]\tvalid_0's auc: 0.806932\tvalid_0's binary_logloss: 0.492303\n", + "[31]\tvalid_0's auc: 0.806656\tvalid_0's binary_logloss: 0.491249\n", + "[32]\tvalid_0's auc: 0.807436\tvalid_0's binary_logloss: 0.490188\n", + "[33]\tvalid_0's auc: 0.807629\tvalid_0's binary_logloss: 0.489117\n", + "[34]\tvalid_0's auc: 0.807501\tvalid_0's binary_logloss: 0.48808\n", + "[35]\tvalid_0's auc: 0.807885\tvalid_0's binary_logloss: 0.487383\n", + "[36]\tvalid_0's auc: 0.807921\tvalid_0's binary_logloss: 0.48636\n", + "[37]\tvalid_0's auc: 0.808267\tvalid_0's binary_logloss: 0.485724\n", + "[38]\tvalid_0's auc: 0.808563\tvalid_0's binary_logloss: 0.485076\n", + "[39]\tvalid_0's auc: 0.808813\tvalid_0's binary_logloss: 0.484039\n", + "[40]\tvalid_0's auc: 0.809023\tvalid_0's binary_logloss: 0.483091\n", + "[41]\tvalid_0's auc: 0.809782\tvalid_0's binary_logloss: 0.482441\n", + "[42]\tvalid_0's auc: 0.810135\tvalid_0's binary_logloss: 0.48179\n", + "[43]\tvalid_0's auc: 0.810219\tvalid_0's binary_logloss: 0.48082\n", + "[44]\tvalid_0's auc: 0.81031\tvalid_0's binary_logloss: 0.479906\n", + "[45]\tvalid_0's auc: 0.810514\tvalid_0's binary_logloss: 0.479024\n", + "[46]\tvalid_0's auc: 0.810566\tvalid_0's binary_logloss: 0.478437\n", + "[47]\tvalid_0's auc: 0.810611\tvalid_0's binary_logloss: 0.477529\n", + "[48]\tvalid_0's auc: 0.810781\tvalid_0's binary_logloss: 0.476637\n", + "[49]\tvalid_0's auc: 0.81089\tvalid_0's binary_logloss: 0.475883\n", + "[50]\tvalid_0's auc: 0.811266\tvalid_0's binary_logloss: 0.475459\n", + "[51]\tvalid_0's auc: 0.811402\tvalid_0's binary_logloss: 0.475078\n", + "[52]\tvalid_0's auc: 0.811765\tvalid_0's binary_logloss: 0.474246\n", + "[53]\tvalid_0's auc: 0.811891\tvalid_0's 
binary_logloss: 0.473452\n", + "[54]\tvalid_0's auc: 0.811868\tvalid_0's binary_logloss: 0.47263\n", + "[55]\tvalid_0's auc: 0.81192\tvalid_0's binary_logloss: 0.471804\n", + "[56]\tvalid_0's auc: 0.812272\tvalid_0's binary_logloss: 0.471275\n", + "[57]\tvalid_0's auc: 0.812639\tvalid_0's binary_logloss: 0.470396\n", + "[58]\tvalid_0's auc: 0.812764\tvalid_0's binary_logloss: 0.469597\n", + "[59]\tvalid_0's auc: 0.813084\tvalid_0's binary_logloss: 0.469049\n", + "[60]\tvalid_0's auc: 0.813342\tvalid_0's binary_logloss: 0.468244\n", + "[61]\tvalid_0's auc: 0.813302\tvalid_0's binary_logloss: 0.467499\n", + "[62]\tvalid_0's auc: 0.813221\tvalid_0's binary_logloss: 0.466758\n", + "[63]\tvalid_0's auc: 0.813697\tvalid_0's binary_logloss: 0.466017\n", + "[64]\tvalid_0's auc: 0.813985\tvalid_0's binary_logloss: 0.465501\n", + "[65]\tvalid_0's auc: 0.81416\tvalid_0's binary_logloss: 0.464725\n", + "[66]\tvalid_0's auc: 0.814227\tvalid_0's binary_logloss: 0.46398\n", + "[67]\tvalid_0's auc: 0.814397\tvalid_0's binary_logloss: 0.463309\n", + "[68]\tvalid_0's auc: 0.814426\tvalid_0's binary_logloss: 0.462627\n", + "[69]\tvalid_0's auc: 0.814593\tvalid_0's binary_logloss: 0.462244\n", + "[70]\tvalid_0's auc: 0.814789\tvalid_0's binary_logloss: 0.461571\n", + "[71]\tvalid_0's auc: 0.814889\tvalid_0's binary_logloss: 0.461144\n", + "[72]\tvalid_0's auc: 0.815078\tvalid_0's binary_logloss: 0.460684\n", + "[73]\tvalid_0's auc: 0.815439\tvalid_0's binary_logloss: 0.460063\n", + "[74]\tvalid_0's auc: 0.815511\tvalid_0's binary_logloss: 0.459386\n", + "[75]\tvalid_0's auc: 0.815574\tvalid_0's binary_logloss: 0.45877\n", + "[76]\tvalid_0's auc: 0.815634\tvalid_0's binary_logloss: 0.458128\n", + "[77]\tvalid_0's auc: 0.815618\tvalid_0's binary_logloss: 0.457495\n", + "[78]\tvalid_0's auc: 0.81582\tvalid_0's binary_logloss: 0.457057\n", + "[79]\tvalid_0's auc: 0.81594\tvalid_0's binary_logloss: 0.456475\n", + "[80]\tvalid_0's auc: 0.815961\tvalid_0's binary_logloss: 0.455885\n", + 
"[81]\tvalid_0's auc: 0.816153\tvalid_0's binary_logloss: 0.455511\n", + "[82]\tvalid_0's auc: 0.816433\tvalid_0's binary_logloss: 0.455186\n", + "[83]\tvalid_0's auc: 0.816546\tvalid_0's binary_logloss: 0.454625\n", + "[84]\tvalid_0's auc: 0.816586\tvalid_0's binary_logloss: 0.454039\n", + "[85]\tvalid_0's auc: 0.816584\tvalid_0's binary_logloss: 0.453482\n", + "[86]\tvalid_0's auc: 0.816881\tvalid_0's binary_logloss: 0.453048\n", + "[87]\tvalid_0's auc: 0.817029\tvalid_0's binary_logloss: 0.452485\n", + "[88]\tvalid_0's auc: 0.81707\tvalid_0's binary_logloss: 0.451941\n", + "[89]\tvalid_0's auc: 0.817298\tvalid_0's binary_logloss: 0.451544\n", + "[90]\tvalid_0's auc: 0.817343\tvalid_0's binary_logloss: 0.450975\n", + "[91]\tvalid_0's auc: 0.817357\tvalid_0's binary_logloss: 0.450422\n", + "[92]\tvalid_0's auc: 0.817592\tvalid_0's binary_logloss: 0.450109\n", + "[93]\tvalid_0's auc: 0.817729\tvalid_0's binary_logloss: 0.449542\n", + "[94]\tvalid_0's auc: 0.817834\tvalid_0's binary_logloss: 0.448982\n", + "[95]\tvalid_0's auc: 0.81809\tvalid_0's binary_logloss: 0.448398\n", + "[96]\tvalid_0's auc: 0.818269\tvalid_0's binary_logloss: 0.447908\n", + "[97]\tvalid_0's auc: 0.818682\tvalid_0's binary_logloss: 0.447547\n", + "[98]\tvalid_0's auc: 0.819015\tvalid_0's binary_logloss: 0.447165\n", + "[99]\tvalid_0's auc: 0.819016\tvalid_0's binary_logloss: 0.446669\n", + "[100]\tvalid_0's auc: 0.819127\tvalid_0's binary_logloss: 0.446397\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's auc: 0.819127\tvalid_0's binary_logloss: 0.446397\n" + ] + } + ], + "source": [ + "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", + "# 这一部分与前面的单独训练和验证是分开的\n", + "def get_kfold_users(trn_df, n=5):\n", + " user_ids = trn_df['user_id'].unique()\n", + " user_set = [user_ids[i::n] for i in range(n)]\n", + " return user_set\n", + "\n", + "k_fold = 5\n", + "trn_df = trn_user_item_feats_df_rank_model\n", + "user_set = get_kfold_users(trn_df, n=k_fold)\n", + "\n", + "score_list = []\n", + "score_df = trn_df[['user_id', 'click_article_id', 'label']]\n", + "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", + "\n", + "# 五折交叉验证,并将中间结果保存用于staking\n", + "for n_fold, valid_user in enumerate(user_set):\n", + " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", + " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", + " \n", + " # 模型及参数的定义\n", + " lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", + " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", + " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10) \n", + " # 训练模型\n", + " lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], \n", + " eval_metric=['auc', ],early_stopping_rounds=50, )\n", + " \n", + " # 预测验证集结果\n", + " valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols], \n", + " num_iteration=lgb_Classfication.best_iteration_)[:,1]\n", + " \n", + " # 对输出结果进行归一化 分类模型输出的值本身就是一个概率值不需要进行归一化\n", + " # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))\n", + " \n", + " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", + " valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + " \n", + " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", + " 
score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", + " \n", + " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", + " if not offline:\n", + " sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols], \n", + " num_iteration=lgb_Classfication.best_iteration_)[:,1]\n", + " \n", + "score_df_ = pd.concat(score_list, axis=0)\n", + "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n", + "# 保存训练集交叉验证产生的新特征\n", + "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)\n", + " \n", + "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", + "tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold\n", + "tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))\n", + "tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])\n", + "tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + "\n", + "# 保存测试集交叉验证的新特征\n", + "tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:24:23.074237Z", + "start_time": "2020-11-18T04:24:13.812284Z" + } + }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]\n", + "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", + "submit(rank_results, topk=5, model_name='lgb_cls')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { 
+ "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DIN模型" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户的历史点击行为列表\n", + "这个是为后面的DIN模型服务的" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:24:30.508213Z", + "start_time": "2020-11-18T04:24:27.426372Z" + } + }, + "outputs": [], + "source": [ + "if offline:\n", + " all_data = pd.read_csv('./data_raw/train_click_log.csv')\n", + "else:\n", + " trn_data = pd.read_csv('./data_raw/train_click_log.csv')\n", + " tst_data = pd.read_csv('./data_raw/testA_click_log.csv')\n", + " all_data = trn_data.append(tst_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:25:28.082071Z", + "start_time": "2020-11-18T04:24:33.649524Z" + } + }, + "outputs": [], + "source": [ + "hist_click =all_data[['user_id', 'click_article_id']].groupby('user_id').agg({list}).reset_index()\n", + "his_behavior_df = pd.DataFrame()\n", + "his_behavior_df['user_id'] = hist_click['user_id']\n", + "his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:25:52.925866Z", + "start_time": "2020-11-18T04:25:52.863922Z" + } + }, + "outputs": [], + "source": [ + "trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()\n", + "\n", + "if offline:\n", + " val_user_item_feats_df_din_model = val_user_item_feats_df.copy()\n", + "else: \n", + " val_user_item_feats_df_din_model = None\n", + " \n", + "tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:00.070681Z", + "start_time": "2020-11-18T04:25:56.417197Z" + } + }, + "outputs": [], + "source": [ + "trn_user_item_feats_df_din_model = 
trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')\n", + "\n", + "if offline:\n", + " val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')\n", + "else:\n", + " val_user_item_feats_df_din_model = None\n", + "\n", + "tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DIN模型简介\n", + "我们下面尝试使用DIN模型, DIN的全称是Deep Interest Network, 这是阿里2018年基于前面的深度学习模型无法表达用户多样化的兴趣而提出的一个模型, 它可以通过考虑【给定的候选广告】和【用户的历史行为】的相关性,来计算用户兴趣的表示向量。具体来说就是通过引入局部激活单元,通过软搜索历史行为的相关部分来关注相关的用户兴趣,并采用加权和来获得有关候选广告的用户兴趣的表示。与候选广告相关性较高的行为会获得较高的激活权重,并支配着用户兴趣。该表示向量在不同广告上有所不同,大大提高了模型的表达能力。所以该模型对于此次新闻推荐的任务也比较适合, 我们在这里通过当前的候选文章与用户历史点击文章的相关性来计算用户对于文章的兴趣。 该模型的结构如下:\n", + "\n", + "![image-20201116201646983](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201116201646983.png)\n", + "\n", + "\n", + "我们这里直接调包来使用这个模型, 关于这个模型的详细细节部分我们会在下一期的推荐系统组队学习中给出。下面说一下该模型如何具体使用:deepctr的函数原型如下:\n", + "> def DIN(dnn_feature_columns, history_feature_list, dnn_use_bn=False,\n", + "> dnn_hidden_units=(200, 80), dnn_activation='relu', att_hidden_size=(80, 40), att_activation=\"dice\",\n", + "> att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024,\n", + "> task='binary'):\n", + "> \n", + "> * dnn_feature_columns: 特征列, 包含数据所有特征的列表\n", + "> * history_feature_list: 用户历史行为列, 反映用户历史行为的特征的列表\n", + "> * dnn_use_bn: 是否使用BatchNormalization\n", + "> * dnn_hidden_units: 全连接层网络的层数和每一层神经元的个数, 一个列表或者元组\n", + "> * dnn_activation: 全连接网络的激活单元类型\n", + "> * att_hidden_size: 注意力层的全连接网络的层数和每一层神经元的个数\n", + "> * att_activation: 注意力层的激活单元类型\n", + "> * att_weight_normalization: 是否归一化注意力得分\n", + "> * l2_reg_dnn: 全连接网络的正则化系数\n", + "> * l2_reg_embedding: embedding向量的正则化系数\n", + "> * dnn_dropout: 全连接网络的神经元的失活概率\n", + 
"> * task: 任务, 可以是分类, 也可以是回归\n", + "\n", + "在具体使用的时候, 我们必须要传入特征列和历史行为列, 但是在传入之前, 我们需要进行一下特征列的预处理。具体如下:\n", + "\n", + "1. 首先,我们要处理数据集, 得到数据, 由于我们是基于用户过去的行为去预测用户是否点击当前文章, 所以我们需要把数据的特征列划分成数值型特征, 离散型特征和历史行为特征列三部分, 对于每一部分, DIN模型的处理会有不同\n", + " 1. 对于离散型特征, 在我们的数据集中就是那些类别型的特征, 比如user_id这种, 这种类别型特征, 我们首先要经过embedding处理得到每个特征的低维稠密型表示, 既然要经过embedding, 那么我们就需要为每一列的类别特征的取值建立一个字典,并指明embedding维度, 所以在使用deepctr的DIN模型准备数据的时候, 我们需要通过SparseFeat函数指明这些类别型特征, 这个函数的传入参数就是列名, 列的唯一取值个数(建立字典用)和embedding维度。\n", + " 2. 对于用户历史行为特征列, 比如文章id, 文章的类别等这种, 同样的我们需要先经过embedding处理, 只不过和上面不一样的地方是,对于这种特征, 我们在得到每个特征的embedding表示之后, 还需要通过一个Attention_layer计算用户的历史行为和当前候选文章的相关性以此得到当前用户的embedding向量, 这个向量就可以基于当前的候选文章与用户过去点击过的历史文章的相似性的程度来反映用户的兴趣, 并且随着用户的不同的历史点击来变化,去动态地模拟用户兴趣的变化过程。这类特征对于每个用户都是一个历史行为序列, 对于每个用户, 历史行为序列长度会不一样, 可能有的用户点击的历史文章多,有的点击的历史文章少, 所以我们还需要把这个长度统一起来, 在为DIN模型准备数据的时候, 我们首先要通过SparseFeat函数指明这些类别型特征, 然后还需要通过VarLenSparseFeat函数再进行序列填充, 使得每个用户的历史序列一样长, 所以这个函数参数中会有个maxlen,来指明序列的最大长度是多少。\n", + " 3. 对于连续型特征列, 我们只需要用DenseFeat函数来指明列名和维度即可。\n", + "2. 
处理完特征列之后, 我们把相应的数据与列进行对应,就得到了最后的数据。\n", + "\n", + "下面根据具体的代码感受一下, 逻辑是这样, 首先我们需要写一个数据准备函数, 在这里面就是根据上面的具体步骤准备数据, 得到数据和特征列, 然后就是建立DIN模型并训练, 最后基于模型进行测试。" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:08.405211Z", + "start_time": "2020-11-18T04:26:04.887013Z" + } + }, + "outputs": [], + "source": [ + "# 导入deepctr\n", + "from deepctr.models import DIN\n", + "from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "\n", + "from tensorflow.keras import backend as K\n", + "from tensorflow.keras.layers import *\n", + "from tensorflow.keras.models import *\n", + "from tensorflow.keras.callbacks import * \n", + "import tensorflow as tf\n", + "\n", + "import os\n", + "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:13.485712Z", + "start_time": "2020-11-18T04:26:13.476042Z" + } + }, + "outputs": [], + "source": [ + "# 数据准备函数\n", + "def get_din_feats_columns(df, dense_fea, sparse_fea, behavior_fea, his_behavior_fea, emb_dim=32, max_len=100):\n", + " \"\"\"\n", + " 数据准备函数:\n", + " df: 数据集\n", + " dense_fea: 数值型特征列\n", + " sparse_fea: 离散型特征列\n", + " behavior_fea: 用户的候选行为特征列\n", + " his_behavior_fea: 用户的历史行为特征列\n", + " embedding_dim: embedding的维度, 这里为了简单, 统一把离散型特征列采用一样的隐向量维度\n", + " max_len: 用户序列的最大长度\n", + " \"\"\"\n", + " \n", + " sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1, embedding_dim=emb_dim) for feat in sparse_fea]\n", + " \n", + " dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]\n", + " \n", + " var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['click_article_id'].nunique() + 1,\n", + " embedding_dim=emb_dim, 
embedding_name='click_article_id'), maxlen=max_len) for feat in hist_behavior_fea]\n", + " \n", + " dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns\n", + " \n", + " # 建立x, x是一个字典的形式\n", + " x = {}\n", + " for name in get_feature_names(dnn_feature_columns):\n", + " if name in his_behavior_fea:\n", + " # 这是历史行为序列\n", + " his_list = [l for l in df[name]]\n", + " x[name] = pad_sequences(his_list, maxlen=max_len, padding='post') # 二维数组\n", + " else:\n", + " x[name] = df[name].values\n", + " \n", + " return x, dnn_feature_columns" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:18.783217Z", + "start_time": "2020-11-18T04:26:18.776795Z" + } + }, + "outputs": [], + "source": [ + "# 把特征分开\n", + "sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup', \n", + " 'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']\n", + "\n", + "behavior_fea = ['click_article_id']\n", + "\n", + "hist_behavior_fea = ['hist_click_article_id']\n", + "\n", + "dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',\n", + " 'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',\n", + " 'words_hbo','words_count']" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:25.469810Z", + "start_time": "2020-11-18T04:26:24.779347Z" + } + }, + "outputs": [], + "source": [ + "# dense特征进行归一化, 神经网络训练都需要将数值进行归一化处理\n", + "mm = MinMaxScaler()\n", + "\n", + "# 下面是做一些特殊处理,当在其他的地方出现无效值的时候,不处理无法进行归一化,刚开始可以先把他注释掉,在运行了下面的代码\n", + "# 之后如果发现报错,应该先去想办法处理如何不出现inf之类的值\n", + "# trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)\n", + "# tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)\n", + "\n", + "for feat in dense_fea:\n", + 
" trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]])\n", + " \n", + " if val_user_item_feats_df_din_model is not None:\n", + " val_user_item_feats_df_din_model[feat] = mm.fit_transform(val_user_item_feats_df_din_model[[feat]])\n", + " \n", + " tst_user_item_feats_df_din_model[feat] = mm.fit_transform(tst_user_item_feats_df_din_model[[feat]])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:36.727753Z", + "start_time": "2020-11-18T04:26:28.854705Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:143: calling RandomNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n" + ] + } + ], + "source": [ + "# 准备训练数据\n", + "x_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", + "y_trn = trn_user_item_feats_df_din_model['label'].values\n", + "\n", + "if offline:\n", + " # 准备验证数据\n", + " x_val, dnn_feature_columns = get_din_feats_columns(val_user_item_feats_df_din_model, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", + " y_val = val_user_item_feats_df_din_model['label'].values\n", + " \n", + "dense_fea = [x for x in dense_fea if x != 'label']\n", + "x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:45.146318Z", + 
"start_time": "2020-11-18T04:26:40.423914Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1288: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:255: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", + "Model: \"model\"\n", + "__________________________________________________________________________________________________\n", + "Layer (type) Output Shape Param # Connected to \n", + "==================================================================================================\n", + "user_id (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_article_id (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "category_id (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_environment (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_deviceGroup (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_os (InputLayer) [(None, 1)] 0 \n", + 
"__________________________________________________________________________________________________\n", + "click_country (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_region (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_referrer_type (InputLayer [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "is_cat_hab (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_user_id (Embedding) (None, 1, 32) 1600032 user_id[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_seq_emb_hist_click_artic multiple 525664 click_article_id[0][0] \n", + " hist_click_article_id[0][0] \n", + " click_article_id[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_category_id (Embeddi (None, 1, 32) 7776 category_id[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_click_environment (E (None, 1, 32) 128 click_environment[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_click_deviceGroup (E (None, 1, 32) 160 click_deviceGroup[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_click_os (Embedding) (None, 1, 32) 288 click_os[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_click_country (Embed (None, 1, 32) 384 click_country[0][0] \n", + 
"__________________________________________________________________________________________________\n", + "sparse_emb_click_region (Embedd (None, 1, 32) 928 click_region[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_click_referrer_type (None, 1, 32) 256 click_referrer_type[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_is_cat_hab (Embeddin (None, 1, 32) 64 is_cat_hab[0][0] \n", + "__________________________________________________________________________________________________\n", + "no_mask (NoMask) (None, 1, 32) 0 sparse_emb_user_id[0][0] \n", + " sparse_seq_emb_hist_click_article\n", + " sparse_emb_category_id[0][0] \n", + " sparse_emb_click_environment[0][0\n", + " sparse_emb_click_deviceGroup[0][0\n", + " sparse_emb_click_os[0][0] \n", + " sparse_emb_click_country[0][0] \n", + " sparse_emb_click_region[0][0] \n", + " sparse_emb_click_referrer_type[0]\n", + " sparse_emb_is_cat_hab[0][0] \n", + "__________________________________________________________________________________________________\n", + "hist_click_article_id (InputLay [(None, 50)] 0 \n", + "__________________________________________________________________________________________________\n", + "concatenate (Concatenate) (None, 1, 320) 0 no_mask[0][0] \n", + " no_mask[1][0] \n", + " no_mask[2][0] \n", + " no_mask[3][0] \n", + " no_mask[4][0] \n", + " no_mask[5][0] \n", + " no_mask[6][0] \n", + " no_mask[7][0] \n", + " no_mask[8][0] \n", + " no_mask[9][0] \n", + "__________________________________________________________________________________________________\n", + "no_mask_1 (NoMask) (None, 1, 320) 0 concatenate[0][0] \n", + "__________________________________________________________________________________________________\n", + "attention_sequence_pooling_laye (None, 1, 32) 13961 sparse_seq_emb_hist_click_article\n", + " 
sparse_seq_emb_hist_click_article\n", + "__________________________________________________________________________________________________\n", + "concatenate_1 (Concatenate) (None, 1, 352) 0 no_mask_1[0][0] \n", + " attention_sequence_pooling_layer[\n", + "__________________________________________________________________________________________________\n", + "sim0 (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "time_diff0 (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "word_diff0 (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "sim_max (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "sim_min (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "sim_sum (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "sim_mean (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "score (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "rank (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_size (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "time_diff_mean (InputLayer) [(None, 1)] 0 \n", + 
"__________________________________________________________________________________________________\n", + "active_level (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "user_time_hob1 (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "user_time_hob2 (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "words_hbo (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "words_count (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "flatten (Flatten) (None, 352) 0 concatenate_1[0][0] \n", + "__________________________________________________________________________________________________\n", + "no_mask_3 (NoMask) (None, 1) 0 sim0[0][0] \n", + " time_diff0[0][0] \n", + " word_diff0[0][0] \n", + " sim_max[0][0] \n", + " sim_min[0][0] \n", + " sim_sum[0][0] \n", + " sim_mean[0][0] \n", + " score[0][0] \n", + " rank[0][0] \n", + " click_size[0][0] \n", + " time_diff_mean[0][0] \n", + " active_level[0][0] \n", + " user_time_hob1[0][0] \n", + " user_time_hob2[0][0] \n", + " words_hbo[0][0] \n", + " words_count[0][0] \n", + "__________________________________________________________________________________________________\n", + "no_mask_2 (NoMask) (None, 352) 0 flatten[0][0] \n", + "__________________________________________________________________________________________________\n", + "concatenate_2 (Concatenate) (None, 16) 0 no_mask_3[0][0] \n", + " no_mask_3[1][0] \n", + " no_mask_3[2][0] \n", + " no_mask_3[3][0] \n", + " no_mask_3[4][0] \n", + " no_mask_3[5][0] \n", + " no_mask_3[6][0] \n", + " no_mask_3[7][0] \n", + " no_mask_3[8][0] 
\n", + " no_mask_3[9][0] \n", + " no_mask_3[10][0] \n", + " no_mask_3[11][0] \n", + " no_mask_3[12][0] \n", + " no_mask_3[13][0] \n", + " no_mask_3[14][0] \n", + " no_mask_3[15][0] \n", + "__________________________________________________________________________________________________\n", + "flatten_1 (Flatten) (None, 352) 0 no_mask_2[0][0] \n", + "__________________________________________________________________________________________________\n", + "flatten_2 (Flatten) (None, 16) 0 concatenate_2[0][0] \n", + "__________________________________________________________________________________________________\n", + "no_mask_4 (NoMask) multiple 0 flatten_1[0][0] \n", + " flatten_2[0][0] \n", + "__________________________________________________________________________________________________\n", + "concatenate_3 (Concatenate) (None, 368) 0 no_mask_4[0][0] \n", + " no_mask_4[1][0] \n", + "__________________________________________________________________________________________________\n", + "dnn_1 (DNN) (None, 80) 89880 concatenate_3[0][0] \n", + "__________________________________________________________________________________________________\n", + "dense (Dense) (None, 1) 80 dnn_1[0][0] \n", + "__________________________________________________________________________________________________\n", + "prediction_layer (PredictionLay (None, 1) 1 dense[0][0] \n", + "==================================================================================================\n", + "Total params: 2,239,602\n", + "Trainable params: 2,239,362\n", + "Non-trainable params: 240\n", + "__________________________________________________________________________________________________\n" + ] + } + ], + "source": [ + "# 建立模型\n", + "model = DIN(dnn_feature_columns, behavior_fea)\n", + "\n", + "# 查看模型结构\n", + "model.summary()\n", + "\n", + "# 模型编译\n", + "model.compile('adam', 'binary_crossentropy',metrics=['binary_crossentropy', tf.keras.metrics.AUC()])" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:28:43.885773Z", + "start_time": "2020-11-18T04:26:48.746787Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/2\n", + "290964/290964 [==============================] - 55s 189us/sample - loss: 0.4209 - binary_crossentropy: 0.4206 - auc: 0.7842\n", + "Epoch 2/2\n", + "290964/290964 [==============================] - 52s 178us/sample - loss: 0.3630 - binary_crossentropy: 0.3618 - auc: 0.8478\n" + ] + } + ], + "source": [ + "# 模型训练\n", + "if offline:\n", + " history = model.fit(x_trn, y_trn, verbose=1, epochs=10, validation_data=(x_val, y_val) , batch_size=256)\n", + "else:\n", + " # 也可以使用上面的语句用自己采样出来的验证集\n", + " # history = model.fit(x_trn, y_trn, verbose=1, epochs=3, validation_split=0.3, batch_size=256)\n", + " history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:29:20.436591Z", + "start_time": "2020-11-18T04:28:58.102057Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "500000/500000 [==============================] - 20s 39us/sample\n" + ] + } + ], + "source": [ + "# 模型预测\n", + "tst_user_item_feats_df_din_model['pred_score'] = model.predict(x_tst, verbose=1, batch_size=256)\n", + "tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'din_rank_score.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:29:34.985535Z", + "start_time": "2020-11-18T04:29:26.264531Z" + } + }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "rank_results = tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']]\n", + "submit(rank_results, topk=5, model_name='din')" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-15T06:15:49.490705Z", + "start_time": "2020-11-15T06:15:49.473794Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:38:53.760383Z", + "start_time": "2020-11-18T04:29:51.737721Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train on 232681 samples, validate on 58283 samples\n", + "Epoch 1/2\n", + "232681/232681 [==============================] - 44s 189us/sample - loss: 0.2864 - binary_crossentropy: 0.2846 - auc: 0.9008 - val_loss: 0.2830 - val_binary_crossentropy: 0.2813 - val_auc: 0.9072\n", + "Epoch 2/2\n", + "232681/232681 [==============================] - 44s 187us/sample - loss: 0.2832 - binary_crossentropy: 0.2816 - auc: 0.9034 - val_loss: 0.2846 - val_binary_crossentropy: 0.2830 - val_auc: 0.9053\n", + "58283/58283 [==============================] - 2s 36us/sample\n", + "500000/500000 [==============================] - 19s 37us/sample\n", + "Train on 232798 samples, validate on 58166 samples\n", + "Epoch 1/2\n", + "232798/232798 [==============================] - 43s 184us/sample - loss: 0.2818 - binary_crossentropy: 0.2802 - auc: 0.9051 - val_loss: 0.2968 - val_binary_crossentropy: 0.2953 - val_auc: 0.9062\n", + "Epoch 2/2\n", + "232798/232798 [==============================] - 44s 187us/sample - loss: 0.2796 - binary_crossentropy: 0.2782 - auc: 0.9069 - val_loss: 0.2820 - val_binary_crossentropy: 0.2806 - val_auc: 0.9071\n", + "58166/58166 [==============================] - 2s 38us/sample\n", + "500000/500000 [==============================] - 18s 37us/sample\n", + "Train on 232847 samples, validate on 58117 samples\n", + "Epoch 1/2\n", + "232847/232847 [==============================] - 43s 185us/sample - loss: 0.2786 - binary_crossentropy: 0.2773 - auc: 0.9080 - val_loss: 0.2761 - 
val_binary_crossentropy: 0.2749 - val_auc: 0.9113\n", + "Epoch 2/2\n", + "232847/232847 [==============================] - 39s 166us/sample - loss: 0.2766 - binary_crossentropy: 0.2754 - auc: 0.9097 - val_loss: 0.2872 - val_binary_crossentropy: 0.2862 - val_auc: 0.9090\n", + "58117/58117 [==============================] - 2s 34us/sample\n", + "500000/500000 [==============================] - 17s 33us/sample\n", + "Train on 232716 samples, validate on 58248 samples\n", + "Epoch 1/2\n", + "232716/232716 [==============================] - 39s 169us/sample - loss: 0.2763 - binary_crossentropy: 0.2753 - auc: 0.9100 - val_loss: 0.2739 - val_binary_crossentropy: 0.2730 - val_auc: 0.9116\n", + "Epoch 2/2\n", + "232716/232716 [==============================] - 39s 168us/sample - loss: 0.2743 - binary_crossentropy: 0.2735 - auc: 0.9119 - val_loss: 0.2859 - val_binary_crossentropy: 0.2851 - val_auc: 0.9090\n", + "58248/58248 [==============================] - 2s 35us/sample\n", + "500000/500000 [==============================] - 17s 34us/sample\n", + "Train on 232814 samples, validate on 58150 samples\n", + "Epoch 1/2\n", + "232814/232814 [==============================] - 40s 170us/sample - loss: 0.2747 - binary_crossentropy: 0.2739 - auc: 0.9115 - val_loss: 0.2702 - val_binary_crossentropy: 0.2695 - val_auc: 0.9163\n", + "Epoch 2/2\n", + "232814/232814 [==============================] - 40s 170us/sample - loss: 0.2725 - binary_crossentropy: 0.2719 - auc: 0.9132 - val_loss: 0.2751 - val_binary_crossentropy: 0.2745 - val_auc: 0.9151\n", + "58150/58150 [==============================] - 2s 34us/sample\n", + "500000/500000 [==============================] - 17s 34us/sample\n" + ] + } + ], + "source": [ + "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", + "# 这一部分与前面的单独训练和验证是分开的\n", + "def get_kfold_users(trn_df, n=5):\n", + " user_ids = trn_df['user_id'].unique()\n", + " user_set = [user_ids[i::n] for i in range(n)]\n", + " return user_set\n", + "\n", + "k_fold = 5\n", + "trn_df = 
trn_user_item_feats_df_din_model\n", + "user_set = get_kfold_users(trn_df, n=k_fold)\n", + "\n", + "score_list = []\n", + "score_df = trn_df[['user_id', 'click_article_id', 'label']]\n", + "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", + "\n", + "dense_fea = [x for x in dense_fea if x != 'label']\n", + "x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", + "\n", + "# 五折交叉验证,并将中间结果保存用于staking\n", + "for n_fold, valid_user in enumerate(user_set):\n", + " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", + " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", + " \n", + " # 准备训练数据\n", + " x_trn, dnn_feature_columns = get_din_feats_columns(train_idx, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", + " y_trn = train_idx['label'].values\n", + "\n", + " # 准备验证数据\n", + " x_val, dnn_feature_columns = get_din_feats_columns(valid_idx, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", + " y_val = valid_idx['label'].values\n", + " \n", + " history = model.fit(x_trn, y_trn, verbose=1, epochs=2, validation_data=(x_val, y_val) , batch_size=256)\n", + " \n", + " # 预测验证集结果\n", + " valid_idx['pred_score'] = model.predict(x_val, verbose=1, batch_size=256) \n", + " \n", + " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", + " valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + " \n", + " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", + " score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", + " \n", + " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", + " if not offline:\n", + " sub_preds += model.predict(x_tst, verbose=1, batch_size=256)[:, 0] \n", + " \n", + "score_df_ = pd.concat(score_list, axis=0)\n", + "score_df = score_df.merge(score_df_, how='left', on=['user_id', 
'click_article_id'])\n", + "# 保存训练集交叉验证产生的新特征\n", + "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_din_cls_feats.csv', index=False)\n", + " \n", + "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", + "tst_user_item_feats_df_din_model['pred_score'] = sub_preds / k_fold\n", + "tst_user_item_feats_df_din_model['pred_score'] = tst_user_item_feats_df_din_model['pred_score'].transform(lambda x: norm_sim(x))\n", + "tst_user_item_feats_df_din_model.sort_values(by=['user_id', 'pred_score'])\n", + "tst_user_item_feats_df_din_model['pred_rank'] = tst_user_item_feats_df_din_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + "\n", + "# 保存测试集交叉验证的新特征\n", + "tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_din_cls_feats.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 模型融合" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 加权融合" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:44:27.351996Z", + "start_time": "2020-11-18T04:44:26.561275Z" + } + }, + "outputs": [], + "source": [ + "# 读取多个模型的排序结果文件\n", + "lgb_ranker = pd.read_csv(save_path + 'lgb_ranker_score.csv')\n", + "lgb_cls = pd.read_csv(save_path + 'lgb_cls_score.csv')\n", + "din_ranker = pd.read_csv(save_path + 'din_rank_score.csv')\n", + "\n", + "# 这里也可以换成交叉验证输出的测试结果进行加权融合" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:44:31.593981Z", + "start_time": "2020-11-18T04:44:31.589439Z" + } + }, + "outputs": [], + "source": [ + "rank_model = {'lgb_ranker': lgb_ranker, \n", + " 'lgb_cls': lgb_cls, \n", + " 
'din_ranker': din_ranker}" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:44:36.135860Z", + "start_time": "2020-11-18T04:44:36.130577Z" + } + }, + "outputs": [], + "source": [ + "def get_ensumble_predict_topk(rank_model, topk=5):\n", + " final_recall = rank_model['lgb_cls'].append(rank_model['din_ranker'])\n", + " rank_model['lgb_ranker']['pred_score'] = rank_model['lgb_ranker']['pred_score'].transform(lambda x: norm_sim(x))\n", + " \n", + " final_recall = final_recall.append(rank_model['lgb_ranker'])\n", + " final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()\n", + " \n", + " submit(final_recall, topk=topk, model_name='ensemble_fuse')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:44:51.659270Z", + "start_time": "2020-11-18T04:44:40.445659Z" + } + }, + "outputs": [], + "source": [ + "get_ensumble_predict_topk(rank_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Staking" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:44:58.025992Z", + "start_time": "2020-11-18T04:44:56.146962Z" + } + }, + "outputs": [], + "source": [ + "# 读取多个模型的交叉验证生成的结果文件\n", + "# 训练集\n", + "trn_lgb_ranker_feats = pd.read_csv(save_path + 'trn_lgb_ranker_feats.csv')\n", + "trn_lgb_cls_feats = pd.read_csv(save_path + 'trn_lgb_cls_feats.csv')\n", + "trn_din_cls_feats = pd.read_csv(save_path + 'trn_din_cls_feats.csv')\n", + "\n", + "# 测试集\n", + "tst_lgb_ranker_feats = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')\n", + "tst_lgb_cls_feats = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')\n", + "tst_din_cls_feats = pd.read_csv(save_path + 'tst_din_cls_feats.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": 
"2020-11-18T04:45:07.701862Z", + "start_time": "2020-11-18T04:45:07.644335Z" + } + }, + "outputs": [], + "source": [ + "# 将多个模型输出的特征进行拼接\n", + "\n", + "finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']]\n", + "finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']]\n", + "\n", + "for idx, trn_model in enumerate([trn_lgb_ranker_feats, trn_lgb_cls_feats, trn_din_cls_feats]):\n", + " for feat in [ 'pred_score', 'pred_rank']:\n", + " col_name = feat + '_' + str(idx)\n", + " finall_trn_ranker_feats[col_name] = trn_model[feat]\n", + "\n", + "for idx, tst_model in enumerate([tst_lgb_ranker_feats, tst_lgb_cls_feats, tst_din_cls_feats]):\n", + " for feat in [ 'pred_score', 'pred_rank']:\n", + " col_name = feat + '_' + str(idx)\n", + " finall_tst_ranker_feats[col_name] = tst_model[feat]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:45:15.044242Z", + "start_time": "2020-11-18T04:45:13.138252Z" + } + }, + "outputs": [], + "source": [ + "# 定义一个逻辑回归模型再次拟合交叉验证产生的特征对测试集进行预测\n", + "# 这里需要注意的是,在做交叉验证的时候可以构造多一些与输出预测值相关的特征,来丰富这里简单模型的特征\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1', 'pred_score_2', 'pred_rank_2']\n", + "\n", + "trn_x = finall_trn_ranker_feats[feat_cols]\n", + "trn_y = finall_trn_ranker_feats['label']\n", + "\n", + "tst_x = finall_tst_ranker_feats[feat_cols]\n", + "\n", + "# 定义模型\n", + "lr = LogisticRegression()\n", + "\n", + "# 模型训练\n", + "lr.fit(trn_x, trn_y)\n", + "\n", + "# 模型预测\n", + "finall_tst_ranker_feats['pred_score'] = lr.predict_proba(tst_x)[:, 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:45:29.018764Z", + "start_time": "2020-11-18T04:45:19.423130Z" + } + }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "rank_results = 
finall_tst_ranker_feats[['user_id', 'click_article_id', 'pred_score']]\n", + "submit(rank_results, topk=5, model_name='ensumble_staking')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 总结\n", + "本章主要学习了三个排序模型,包括LGB的Rank, LGB的Classifier还有深度学习的DIN模型, 当然,对于这三个模型的原理部分,我们并没有给出详细的介绍, 请大家课下自己探索原理,也欢迎大家把自己的探索与所学分享出来,我们一块学习和进步。最后,我们进行了简单的模型融合策略,包括简单的加权和Stacking。\n", + "\n", + "关于Datawhale: Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", + "\n", + "![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" + ] } - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "rank_results = finall_tst_ranker_feats[['user_id', 'click_article_id', 'pred_score']]\n", - "submit(rank_results, topk=5, model_name='ensumble_staking')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 总结\n", - "本章主要学习了三个排序模型,包括LGB的Rank, LGB的Classifier还有深度学习的DIN模型, 当然,对于这三个模型的原理部分,我们并没有给出详细的介绍, 请大家课下自己探索原理,也欢迎大家把自己的探索与所学分享出来,我们一块学习和进步。最后,我们进行了简单的模型融合策略,包括简单的加权和Stacking。\n", - "\n", - "关于Datawhale: Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", - "\n", - "![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "latex_envs": { - "LaTeX_envs_menu_present": true, - "autoclose": false, - "autocomplete": true, - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 1, - "hotkeys": { - "equation": "Ctrl-E", - "itemize": "Ctrl-I" - }, - "labels_anchors": false, - "latex_user_defs": false, - "report_style_numbering": false, - "user_envs_cfg": false - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "170px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + 
"report_style_numbering": false, + "user_envs_cfg": false + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "170px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git "a/docs/ch03/ch3.1/jupyter/\346\225\260\346\215\256\345\210\206\346\236\220.ipynb" "b/docs/ch03/ch3.1/jupyter/\346\225\260\346\215\256\345\210\206\346\236\220.ipynb" index c9cbc0c37..6bc2d7d2b 100644 --- "a/docs/ch03/ch3.1/jupyter/\346\225\260\346\215\256\345\210\206\346\236\220.ipynb" +++ "b/docs/ch03/ch3.1/jupyter/\346\225\260\346\215\256\345\210\206\346\236\220.ipynb" @@ -1,3980 +1,3980 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 数据分析\n", - "\n", - "数据分析的价值主要在于熟悉了解整个数据集的基本情况包括每个文件里有哪些数据,具体的文件中的每个字段表示什么实际含义,以及数据集中特征之间的相关性,在推荐场景下主要就是分析用户本身的基本属性,文章基本属性,以及用户和文章交互的一些分布,这些都有利于后面的召回策略的选择,以及特征工程。\n", - "\n", - 
"**建议:当特征工程和模型调参已经很难继续上分了,可以回来在重新从新的角度去分析这些数据,或许可以找到上分的灵感**\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 导包" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:13:59.322486Z", - "start_time": "2020-11-13T15:13:55.601445Z" - } - }, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "plt.rc('font', family='SimHei', size=13)\n", - "\n", - "import os,gc,re,warnings,sys\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取数据" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:14:18.918041Z", - "start_time": "2020-11-13T15:14:02.568798Z" - } - }, - "outputs": [], - "source": [ - "# path = './data/' # 自定义的路径\n", - "path = './' # 天池平台路径\n", - "\n", - "#####train\n", - "trn_click = pd.read_csv(path+'train_click_log.csv')\n", - "#trn_click = pd.read_csv(path+'train_click_log.csv', names=['user_id','item_id','click_time','click_environment','click_deviceGroup','click_os','click_country','click_region','click_referrer_type'])\n", - "item_df = pd.read_csv(path+'articles.csv')\n", - "item_df = item_df.rename(columns={'article_id': 'click_article_id'}) #重命名,方便后续match\n", - "item_emb_df = pd.read_csv(path+'articles_emb.csv')\n", - "\n", - "#####test\n", - "tst_click = pd.read_csv(path+'testA_click_log.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 数据预处理\n", - "计算用户点击rank和点击次数" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:14:31.746748Z", - "start_time": "2020-11-13T15:14:31.409643Z" - } - }, - "outputs": [], - "source": [ - "# 对每个用户的点击时间戳进行排序\n", - "trn_click['rank'] = 
trn_click.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)\n", - "tst_click['rank'] = tst_click.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:15:04.503079Z", - "start_time": "2020-11-13T15:15:04.394329Z" - } - }, - "outputs": [], - "source": [ - "#计算用户点击文章的次数,并添加新的一列count\n", - "trn_click['click_cnts'] = trn_click.groupby(['user_id'])['click_timestamp'].transform('count')\n", - "tst_click['click_cnts'] = tst_click.groupby(['user_id'])['click_timestamp'].transform('count')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 数据浏览" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户点击日志文件_训练集" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:16:07.764776Z", - "start_time": "2020-11-13T15:16:07.536342Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
019999916041715070295701904117113111112811506942089000173
11999995408150702957147841171131101141506994257000118
219999950823150702960147841171131911991507013614000213
319999815777015070295322004117125540402811506983935000201
41999989661315070296718314117125539402091506938444000185
\n", - "
" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 数据分析\n", + "\n", + "数据分析的价值主要在于熟悉了解整个数据集的基本情况包括每个文件里有哪些数据,具体的文件中的每个字段表示什么实际含义,以及数据集中特征之间的相关性,在推荐场景下主要就是分析用户本身的基本属性,文章基本属性,以及用户和文章交互的一些分布,这些都有利于后面的召回策略的选择,以及特征工程。\n", + "\n", + "**建议:当特征工程和模型调参已经很难继续上分了,可以回来在重新从新的角度去分析这些数据,或许可以找到上分的灵感**\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 导包" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:13:59.322486Z", + "start_time": "2020-11-13T15:13:55.601445Z" + } + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "plt.rc('font', family='SimHei', size=13)\n", + "\n", + "import os,gc,re,warnings,sys\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取数据" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:14:18.918041Z", + "start_time": "2020-11-13T15:14:02.568798Z" + } + }, + "outputs": [], + "source": [ + "# path = './data/' # 自定义的路径\n", + "path = './' # 天池平台路径\n", + "\n", + "#####train\n", + "trn_click = pd.read_csv(path+'train_click_log.csv')\n", + "#trn_click = pd.read_csv(path+'train_click_log.csv', names=['user_id','item_id','click_time','click_environment','click_deviceGroup','click_os','click_country','click_region','click_referrer_type'])\n", + "item_df = pd.read_csv(path+'articles.csv')\n", + "item_df = item_df.rename(columns={'article_id': 'click_article_id'}) #重命名,方便后续match\n", + "item_emb_df = pd.read_csv(path+'articles_emb.csv')\n", + "\n", + "#####test\n", + "tst_click = pd.read_csv(path+'testA_click_log.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据预处理\n", + "计算用户点击rank和点击次数" + ] + }, + { + "cell_type": 
"code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:14:31.746748Z", + "start_time": "2020-11-13T15:14:31.409643Z" + } + }, + "outputs": [], + "source": [ + "# 对每个用户的点击时间戳进行排序\n", + "trn_click['rank'] = trn_click.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)\n", + "tst_click['rank'] = tst_click.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:15:04.503079Z", + "start_time": "2020-11-13T15:15:04.394329Z" + } + }, + "outputs": [], + "source": [ + "#计算用户点击文章的次数,并添加新的一列count\n", + "trn_click['click_cnts'] = trn_click.groupby(['user_id'])['click_timestamp'].transform('count')\n", + "tst_click['click_cnts'] = tst_click.groupby(['user_id'])['click_timestamp'].transform('count')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据浏览" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户点击日志文件_训练集" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:16:07.764776Z", + "start_time": "2020-11-13T15:16:07.536342Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
019999916041715070295701904117113111112811506942089000173
11999995408150702957147841171131101141506994257000118
219999950823150702960147841171131911991507013614000213
319999815777015070295322004117125540402811506983935000201
41999989661315070296718314117125539402091506938444000185
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id click_timestamp click_environment \\\n", + "0 199999 160417 1507029570190 4 \n", + "1 199999 5408 1507029571478 4 \n", + "2 199999 50823 1507029601478 4 \n", + "3 199998 157770 1507029532200 4 \n", + "4 199998 96613 1507029671831 4 \n", + "\n", + " click_deviceGroup click_os click_country click_region \\\n", + "0 1 17 1 13 \n", + "1 1 17 1 13 \n", + "2 1 17 1 13 \n", + "3 1 17 1 25 \n", + "4 1 17 1 25 \n", + "\n", + " click_referrer_type rank click_cnts category_id created_at_ts \\\n", + "0 1 11 11 281 1506942089000 \n", + "1 1 10 11 4 1506994257000 \n", + "2 1 9 11 99 1507013614000 \n", + "3 5 40 40 281 1506983935000 \n", + "4 5 39 40 209 1506938444000 \n", + "\n", + " words_count \n", + "0 173 \n", + "1 118 \n", + "2 213 \n", + "3 201 \n", + "4 185 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " user_id click_article_id click_timestamp click_environment \\\n", - "0 199999 160417 1507029570190 4 \n", - "1 199999 5408 1507029571478 4 \n", - "2 199999 50823 1507029601478 4 \n", - "3 199998 157770 1507029532200 4 \n", - "4 199998 96613 1507029671831 4 \n", - "\n", - " click_deviceGroup click_os click_country click_region \\\n", - "0 1 17 1 13 \n", - "1 1 17 1 13 \n", - "2 1 17 1 13 \n", - "3 1 17 1 25 \n", - "4 1 17 1 25 \n", - "\n", - " click_referrer_type rank click_cnts category_id created_at_ts \\\n", - "0 1 11 11 281 1506942089000 \n", - "1 1 10 11 4 1506994257000 \n", - "2 1 9 11 99 1507013614000 \n", - "3 5 40 40 281 1506983935000 \n", - "4 5 39 40 209 1506938444000 \n", - "\n", - " words_count \n", - "0 173 \n", - "1 118 \n", - "2 213 \n", - "3 201 \n", - "4 185 " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_click = trn_click.merge(item_df, how='left', on=['click_article_id'])\n", - "trn_click.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": 
[ - "#### train_click_log.csv文件数据中每个字段的含义\n", - "\n", - "1. user_id: 用户的唯一标识\n", - "2. click_article_id: 用户点击的文章唯一标识\n", - "3. click_timestamp: 用户点击文章时的时间戳\n", - "4. click_environment: 用户点击文章的环境\n", - "5. click_deviceGroup: 用户点击文章的设备组\n", - "6. click_os: 用户点击文章时的操作系统\n", - "7. click_country: 用户点击文章时的所在的国家\n", - "8. click_region: 用户点击文章时所在的区域\n", - "9. click_referrer_type: 用户点击文章时,文章的来源" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:16:18.536902Z", - "start_time": "2020-11-13T15:16:18.424203Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Int64Index: 1112623 entries, 0 to 1112622\n", - "Data columns (total 14 columns):\n", - "user_id 1112623 non-null int64\n", - "click_article_id 1112623 non-null int64\n", - "click_timestamp 1112623 non-null int64\n", - "click_environment 1112623 non-null int64\n", - "click_deviceGroup 1112623 non-null int64\n", - "click_os 1112623 non-null int64\n", - "click_country 1112623 non-null int64\n", - "click_region 1112623 non-null int64\n", - "click_referrer_type 1112623 non-null int64\n", - "rank 1112623 non-null int64\n", - "click_cnts 1112623 non-null int64\n", - "category_id 1112623 non-null int64\n", - "created_at_ts 1112623 non-null int64\n", - "words_count 1112623 non-null int64\n", - "dtypes: int64(14)\n", - "memory usage: 127.3 MB\n" - ] - } - ], - "source": [ - "#用户点击日志信息\n", - "trn_click.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
count1.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+06
mean1.221198e+051.951541e+051.507588e+123.947786e+001.815981e+001.301976e+011.310776e+001.813587e+011.910063e+007.118518e+001.323704e+013.056176e+021.506598e+122.011981e+02
std5.540349e+049.292286e+043.363466e+083.276715e-011.035170e+006.967844e+001.618264e+007.105832e+001.220012e+001.016095e+011.631503e+011.155791e+028.343066e+095.223881e+01
min0.000000e+003.000000e+001.507030e+121.000000e+001.000000e+002.000000e+001.000000e+001.000000e+001.000000e+001.000000e+002.000000e+001.000000e+001.166573e+120.000000e+00
25%7.934700e+041.239090e+051.507297e+124.000000e+001.000000e+002.000000e+001.000000e+001.300000e+011.000000e+002.000000e+004.000000e+002.500000e+021.507220e+121.700000e+02
50%1.309670e+052.038900e+051.507596e+124.000000e+001.000000e+001.700000e+011.000000e+002.100000e+012.000000e+004.000000e+008.000000e+003.280000e+021.507553e+121.970000e+02
75%1.704010e+052.777120e+051.507841e+124.000000e+003.000000e+001.700000e+011.000000e+002.500000e+012.000000e+008.000000e+001.600000e+014.100000e+021.507756e+122.280000e+02
max1.999990e+053.640460e+051.510603e+124.000000e+005.000000e+002.000000e+011.100000e+012.800000e+017.000000e+002.410000e+022.410000e+024.600000e+021.510666e+126.690000e+03
\n", - "
" + "source": [ + "trn_click = trn_click.merge(item_df, how='left', on=['click_article_id'])\n", + "trn_click.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### train_click_log.csv文件数据中每个字段的含义\n", + "\n", + "1. user_id: 用户的唯一标识\n", + "2. click_article_id: 用户点击的文章唯一标识\n", + "3. click_timestamp: 用户点击文章时的时间戳\n", + "4. click_environment: 用户点击文章的环境\n", + "5. click_deviceGroup: 用户点击文章的设备组\n", + "6. click_os: 用户点击文章时的操作系统\n", + "7. click_country: 用户点击文章时的所在的国家\n", + "8. click_region: 用户点击文章时所在的区域\n", + "9. click_referrer_type: 用户点击文章时,文章的来源" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:16:18.536902Z", + "start_time": "2020-11-13T15:16:18.424203Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 1112623 entries, 0 to 1112622\n", + "Data columns (total 14 columns):\n", + "user_id 1112623 non-null int64\n", + "click_article_id 1112623 non-null int64\n", + "click_timestamp 1112623 non-null int64\n", + "click_environment 1112623 non-null int64\n", + "click_deviceGroup 1112623 non-null int64\n", + "click_os 1112623 non-null int64\n", + "click_country 1112623 non-null int64\n", + "click_region 1112623 non-null int64\n", + "click_referrer_type 1112623 non-null int64\n", + "rank 1112623 non-null int64\n", + "click_cnts 1112623 non-null int64\n", + "category_id 1112623 non-null int64\n", + "created_at_ts 1112623 non-null int64\n", + "words_count 1112623 non-null int64\n", + "dtypes: int64(14)\n", + "memory usage: 127.3 MB\n" + ] + } ], - "text/plain": [ - " user_id click_article_id click_timestamp click_environment \\\n", - "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", - "mean 1.221198e+05 1.951541e+05 1.507588e+12 3.947786e+00 \n", - "std 5.540349e+04 9.292286e+04 3.363466e+08 3.276715e-01 \n", - "min 0.000000e+00 3.000000e+00 1.507030e+12 1.000000e+00 \n", - "25% 7.934700e+04 1.239090e+05 
1.507297e+12 4.000000e+00 \n", - "50% 1.309670e+05 2.038900e+05 1.507596e+12 4.000000e+00 \n", - "75% 1.704010e+05 2.777120e+05 1.507841e+12 4.000000e+00 \n", - "max 1.999990e+05 3.640460e+05 1.510603e+12 4.000000e+00 \n", - "\n", - " click_deviceGroup click_os click_country click_region \\\n", - "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", - "mean 1.815981e+00 1.301976e+01 1.310776e+00 1.813587e+01 \n", - "std 1.035170e+00 6.967844e+00 1.618264e+00 7.105832e+00 \n", - "min 1.000000e+00 2.000000e+00 1.000000e+00 1.000000e+00 \n", - "25% 1.000000e+00 2.000000e+00 1.000000e+00 1.300000e+01 \n", - "50% 1.000000e+00 1.700000e+01 1.000000e+00 2.100000e+01 \n", - "75% 3.000000e+00 1.700000e+01 1.000000e+00 2.500000e+01 \n", - "max 5.000000e+00 2.000000e+01 1.100000e+01 2.800000e+01 \n", - "\n", - " click_referrer_type rank click_cnts category_id \\\n", - "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", - "mean 1.910063e+00 7.118518e+00 1.323704e+01 3.056176e+02 \n", - "std 1.220012e+00 1.016095e+01 1.631503e+01 1.155791e+02 \n", - "min 1.000000e+00 1.000000e+00 2.000000e+00 1.000000e+00 \n", - "25% 1.000000e+00 2.000000e+00 4.000000e+00 2.500000e+02 \n", - "50% 2.000000e+00 4.000000e+00 8.000000e+00 3.280000e+02 \n", - "75% 2.000000e+00 8.000000e+00 1.600000e+01 4.100000e+02 \n", - "max 7.000000e+00 2.410000e+02 2.410000e+02 4.600000e+02 \n", - "\n", - " created_at_ts words_count \n", - "count 1.112623e+06 1.112623e+06 \n", - "mean 1.506598e+12 2.011981e+02 \n", - "std 8.343066e+09 5.223881e+01 \n", - "min 1.166573e+12 0.000000e+00 \n", - "25% 1.507220e+12 1.700000e+02 \n", - "50% 1.507553e+12 1.970000e+02 \n", - "75% 1.507756e+12 2.280000e+02 \n", - "max 1.510666e+12 6.690000e+03 " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_click.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": 
[ - "200000" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#训练集中的用户数量为20w\n", - "trn_click.user_id.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T16:03:01.378461Z", - "start_time": "2020-11-13T16:03:01.300712Z" - } - }, - "outputs": [ + "source": [ + "#用户点击日志信息\n", + "trn_click.info()" + ] + }, { - "data": { - "text/plain": [ - "2" + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
count1.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+06
mean1.221198e+051.951541e+051.507588e+123.947786e+001.815981e+001.301976e+011.310776e+001.813587e+011.910063e+007.118518e+001.323704e+013.056176e+021.506598e+122.011981e+02
std5.540349e+049.292286e+043.363466e+083.276715e-011.035170e+006.967844e+001.618264e+007.105832e+001.220012e+001.016095e+011.631503e+011.155791e+028.343066e+095.223881e+01
min0.000000e+003.000000e+001.507030e+121.000000e+001.000000e+002.000000e+001.000000e+001.000000e+001.000000e+001.000000e+002.000000e+001.000000e+001.166573e+120.000000e+00
25%7.934700e+041.239090e+051.507297e+124.000000e+001.000000e+002.000000e+001.000000e+001.300000e+011.000000e+002.000000e+004.000000e+002.500000e+021.507220e+121.700000e+02
50%1.309670e+052.038900e+051.507596e+124.000000e+001.000000e+001.700000e+011.000000e+002.100000e+012.000000e+004.000000e+008.000000e+003.280000e+021.507553e+121.970000e+02
75%1.704010e+052.777120e+051.507841e+124.000000e+003.000000e+001.700000e+011.000000e+002.500000e+012.000000e+008.000000e+001.600000e+014.100000e+021.507756e+122.280000e+02
max1.999990e+053.640460e+051.510603e+124.000000e+005.000000e+002.000000e+011.100000e+012.800000e+017.000000e+002.410000e+022.410000e+024.600000e+021.510666e+126.690000e+03
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id click_timestamp click_environment \\\n", + "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", + "mean 1.221198e+05 1.951541e+05 1.507588e+12 3.947786e+00 \n", + "std 5.540349e+04 9.292286e+04 3.363466e+08 3.276715e-01 \n", + "min 0.000000e+00 3.000000e+00 1.507030e+12 1.000000e+00 \n", + "25% 7.934700e+04 1.239090e+05 1.507297e+12 4.000000e+00 \n", + "50% 1.309670e+05 2.038900e+05 1.507596e+12 4.000000e+00 \n", + "75% 1.704010e+05 2.777120e+05 1.507841e+12 4.000000e+00 \n", + "max 1.999990e+05 3.640460e+05 1.510603e+12 4.000000e+00 \n", + "\n", + " click_deviceGroup click_os click_country click_region \\\n", + "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", + "mean 1.815981e+00 1.301976e+01 1.310776e+00 1.813587e+01 \n", + "std 1.035170e+00 6.967844e+00 1.618264e+00 7.105832e+00 \n", + "min 1.000000e+00 2.000000e+00 1.000000e+00 1.000000e+00 \n", + "25% 1.000000e+00 2.000000e+00 1.000000e+00 1.300000e+01 \n", + "50% 1.000000e+00 1.700000e+01 1.000000e+00 2.100000e+01 \n", + "75% 3.000000e+00 1.700000e+01 1.000000e+00 2.500000e+01 \n", + "max 5.000000e+00 2.000000e+01 1.100000e+01 2.800000e+01 \n", + "\n", + " click_referrer_type rank click_cnts category_id \\\n", + "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", + "mean 1.910063e+00 7.118518e+00 1.323704e+01 3.056176e+02 \n", + "std 1.220012e+00 1.016095e+01 1.631503e+01 1.155791e+02 \n", + "min 1.000000e+00 1.000000e+00 2.000000e+00 1.000000e+00 \n", + "25% 1.000000e+00 2.000000e+00 4.000000e+00 2.500000e+02 \n", + "50% 2.000000e+00 4.000000e+00 8.000000e+00 3.280000e+02 \n", + "75% 2.000000e+00 8.000000e+00 1.600000e+01 4.100000e+02 \n", + "max 7.000000e+00 2.410000e+02 2.410000e+02 4.600000e+02 \n", + "\n", + " created_at_ts words_count \n", + "count 1.112623e+06 1.112623e+06 \n", + "mean 1.506598e+12 2.011981e+02 \n", + "std 8.343066e+09 5.223881e+01 \n", + "min 1.166573e+12 0.000000e+00 \n", + "25% 1.507220e+12 
1.700000e+02 \n", + "50% 1.507553e+12 1.970000e+02 \n", + "75% 1.507756e+12 2.280000e+02 \n", + "max 1.510666e+12 6.690000e+03 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trn_click.describe()" ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_click.groupby('user_id')['click_article_id'].count().min() # 训练集里面每个用户至少点击了两篇文章" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### 画直方图大体看一下基本的属性分布" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "findfont: Font family ['SimHei'] not found. Falling back to DejaVu Sans.\n", - "findfont: Font family ['SimHei'] not found. Falling back to DejaVu Sans.\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure()\n", - "plt.figure(figsize=(15, 20))\n", - "i = 1\n", - "for col in ['click_article_id', 'click_timestamp', 'click_environment', 'click_deviceGroup', 'click_os', 'click_country', \n", - " 'click_region', 'click_referrer_type', 'rank', 'click_cnts']:\n", - " plot_envs = plt.subplot(5, 2, i)\n", - " i += 1\n", - " v = trn_click[col].value_counts().reset_index()[:10]\n", - " fig = sns.barplot(x=v['index'], y=v[col])\n", - " for item in fig.get_xticklabels():\n", - " item.set_rotation(90)\n", - " plt.title(col)\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "注:此处click_cnts直方图表示的是每篇文章对应用户的点击次数累计图\n", - "\n", - "也可以以用户角度分析,画出每个用户点击文章次数的直方图" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4 1084627\n", - "2 25894\n", - "1 2102\n", - "Name: click_environment, dtype: int64" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_click['click_environment'].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从点击环境click_environment来看,仅有2102次(占0.19%)点击环境为1;仅有25894次(占2.3%)点击环境为2;剩余(占97.6%)点击环境为4。" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1 678187\n", - "3 395558\n", - "4 38731\n", - "5 141\n", - "2 6\n", - "Name: click_deviceGroup, dtype: int64" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_click['click_deviceGroup'].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从点击设备组click_deviceGroup来看,设备1占大部分(61%),设备3占36%。" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 测试集用户点击日志" 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
024999916097415069591428204117113219192811506912747000259
124999916041715069591728204117113218192811506942089000173
2249998160974150695905606641121132552811506912747000259
3249998202557150695908606641121132453271506938401000219
4249997183665150695908861341171155773011500895686000256
\n", - "
" + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200000" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " user_id click_article_id click_timestamp click_environment \\\n", - "0 249999 160974 1506959142820 4 \n", - "1 249999 160417 1506959172820 4 \n", - "2 249998 160974 1506959056066 4 \n", - "3 249998 202557 1506959086066 4 \n", - "4 249997 183665 1506959088613 4 \n", - "\n", - " click_deviceGroup click_os click_country click_region \\\n", - "0 1 17 1 13 \n", - "1 1 17 1 13 \n", - "2 1 12 1 13 \n", - "3 1 12 1 13 \n", - "4 1 17 1 15 \n", - "\n", - " click_referrer_type rank click_cnts category_id created_at_ts \\\n", - "0 2 19 19 281 1506912747000 \n", - "1 2 18 19 281 1506942089000 \n", - "2 2 5 5 281 1506912747000 \n", - "3 2 4 5 327 1506938401000 \n", - "4 5 7 7 301 1500895686000 \n", - "\n", - " words_count \n", - "0 259 \n", - "1 173 \n", - "2 259 \n", - "3 219 \n", - "4 256 " - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tst_click = tst_click.merge(item_df, how='left', on=['click_article_id'])\n", - "tst_click.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
count518010.000000518010.0000005.180100e+05518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.0000005.180100e+05518010.000000
mean227342.428169193803.7925501.507387e+123.9473001.73828513.6284671.34820918.2502501.81961415.52178530.043586305.3249611.506883e+12210.966331
std14613.90718888279.3881773.706127e+080.3239161.0208586.6255641.7035247.0607981.08265733.95770256.868021110.4115135.816668e+0983.040065
min200000.000000137.0000001.506959e+121.0000001.0000002.0000001.0000001.0000001.0000001.0000001.0000001.0000001.265812e+120.000000
25%214926.000000128551.0000001.507026e+124.0000001.00000012.0000001.00000013.0000001.0000004.00000010.000000252.0000001.506970e+12176.000000
50%229109.000000199197.0000001.507308e+124.0000001.00000017.0000001.00000021.0000002.0000008.00000019.000000323.0000001.507249e+12199.000000
75%240182.000000272143.0000001.507666e+124.0000003.00000017.0000001.00000025.0000002.00000018.00000035.000000399.0000001.507630e+12232.000000
max249999.000000364043.0000001.508832e+124.0000005.00000020.00000011.00000028.0000007.000000938.000000938.000000460.0000001.509949e+123082.000000
\n", - "
" + "source": [ + "#训练集中的用户数量为20w\n", + "trn_click.user_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T16:03:01.378461Z", + "start_time": "2020-11-13T16:03:01.300712Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " user_id click_article_id click_timestamp click_environment \\\n", - "count 518010.000000 518010.000000 5.180100e+05 518010.000000 \n", - "mean 227342.428169 193803.792550 1.507387e+12 3.947300 \n", - "std 14613.907188 88279.388177 3.706127e+08 0.323916 \n", - "min 200000.000000 137.000000 1.506959e+12 1.000000 \n", - "25% 214926.000000 128551.000000 1.507026e+12 4.000000 \n", - "50% 229109.000000 199197.000000 1.507308e+12 4.000000 \n", - "75% 240182.000000 272143.000000 1.507666e+12 4.000000 \n", - "max 249999.000000 364043.000000 1.508832e+12 4.000000 \n", - "\n", - " click_deviceGroup click_os click_country click_region \\\n", - "count 518010.000000 518010.000000 518010.000000 518010.000000 \n", - "mean 1.738285 13.628467 1.348209 18.250250 \n", - "std 1.020858 6.625564 1.703524 7.060798 \n", - "min 1.000000 2.000000 1.000000 1.000000 \n", - "25% 1.000000 12.000000 1.000000 13.000000 \n", - "50% 1.000000 17.000000 1.000000 21.000000 \n", - "75% 3.000000 17.000000 1.000000 25.000000 \n", - "max 5.000000 20.000000 11.000000 28.000000 \n", - "\n", - " click_referrer_type rank click_cnts category_id \\\n", - "count 518010.000000 518010.000000 518010.000000 518010.000000 \n", - "mean 1.819614 15.521785 30.043586 305.324961 \n", - "std 1.082657 33.957702 56.868021 110.411513 \n", - "min 1.000000 1.000000 1.000000 1.000000 \n", - "25% 1.000000 4.000000 10.000000 252.000000 \n", - "50% 2.000000 8.000000 19.000000 323.000000 \n", - "75% 2.000000 18.000000 35.000000 399.000000 \n", - "max 7.000000 938.000000 938.000000 460.000000 \n", - "\n", - 
" created_at_ts words_count \n", - "count 5.180100e+05 518010.000000 \n", - "mean 1.506883e+12 210.966331 \n", - "std 5.816668e+09 83.040065 \n", - "min 1.265812e+12 0.000000 \n", - "25% 1.506970e+12 176.000000 \n", - "50% 1.507249e+12 199.000000 \n", - "75% 1.507630e+12 232.000000 \n", - "max 1.509949e+12 3082.000000 " - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tst_click.describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "我们可以看出训练集和测试集的用户是完全不一样的\n", - "\n", - "训练集的用户ID由0 ~ 199999,而测试集A的用户ID由200000 ~ 249999。\n", - "\n", - "因此,也就是我们在训练时,需要把测试集的数据也包括在内,称为全量数据。\n", - "\n", - "!!!!!!!!!!!!!!!后续将对训练集和测试集合并分析!!!!!!!!!!!" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "50000" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#测试集中的用户数量为5w\n", - "tst_click.user_id.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:56:07.717463Z", - "start_time": "2020-11-13T15:56:07.693494Z" - } - }, - "outputs": [ + "source": [ + "trn_click.groupby('user_id')['click_article_id'].count().min() # 训练集里面每个用户至少点击了两篇文章" + ] + }, { - "data": { - "text/plain": [ - "1" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 画直方图大体看一下基本的属性分布" ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tst_click.groupby('user_id')['click_article_id'].count().min() # 注意测试集里面有只点击过一次文章的用户" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 新闻文章信息数据表" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:20:34.183761Z", - "start_time": "2020-11-13T15:20:34.164770Z" - } - }, - "outputs": [ - { - "data": { - 
"text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
click_article_idcategory_idcreated_at_tswords_count
0001513144419000168
1111405341936000189
2211408667706000250
3311408468313000230
4411407071171000162
3640423640424601434034118000144
3640433640434601434148472000463
3640443640444601457974279000177
3640453640454601515964737000126
3640463640464601505811330000479
\n", - "
" + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "findfont: Font family ['SimHei'] not found. Falling back to DejaVu Sans.\n", + "findfont: Font family ['SimHei'] not found. Falling back to DejaVu Sans.\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " click_article_id category_id created_at_ts words_count\n", - "0 0 0 1513144419000 168\n", - "1 1 1 1405341936000 189\n", - "2 2 1 1408667706000 250\n", - "3 3 1 1408468313000 230\n", - "4 4 1 1407071171000 162\n", - "364042 364042 460 1434034118000 144\n", - "364043 364043 460 1434148472000 463\n", - "364044 364044 460 1457974279000 177\n", - "364045 364045 460 1515964737000 126\n", - "364046 364046 460 1505811330000 479" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#新闻文章数据集浏览\n", - "item_df.head().append(item_df.tail())" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:28:13.084501Z", - "start_time": "2020-11-13T15:28:13.062561Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "176 3485\n", - "182 3480\n", - "179 3463\n", - "178 3458\n", - "174 3456\n", - "183 3432\n", - "184 3427\n", - "173 3414\n", - "180 3403\n", - "177 3391\n", - "170 3387\n", - "187 3355\n", - "169 3352\n", - "185 3348\n", - "175 3346\n", - "181 3330\n", - "186 3328\n", - "189 3327\n", - "171 3327\n", - "172 3322\n", - "165 3308\n", - "188 3288\n", - "167 3269\n", - "190 3261\n", - "192 3257\n", - "168 3248\n", - "193 3225\n", - "166 3199\n", - "191 3182\n", - "194 3164\n", - " ... 
\n", - "601 1\n", - "857 1\n", - "1977 1\n", - "1626 1\n", - "697 1\n", - "1720 1\n", - "696 1\n", - "706 1\n", - "592 1\n", - "1605 1\n", - "586 1\n", - "582 1\n", - "1606 1\n", - "972 1\n", - "716 1\n", - "584 1\n", - "1608 1\n", - "715 1\n", - "841 1\n", - "968 1\n", - "964 1\n", - "587 1\n", - "1099 1\n", - "1355 1\n", - "711 1\n", - "845 1\n", - "710 1\n", - "965 1\n", - "847 1\n", - "1535 1\n", - "Name: words_count, Length: 866, dtype: int64" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "item_df['words_count'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:28:59.029535Z", - "start_time": "2020-11-13T15:28:58.816106Z" - } - }, - "outputs": [ + "source": [ + "plt.figure()\n", + "plt.figure(figsize=(15, 20))\n", + "i = 1\n", + "for col in ['click_article_id', 'click_timestamp', 'click_environment', 'click_deviceGroup', 'click_os', 'click_country', \n", + " 'click_region', 'click_referrer_type', 'rank', 'click_cnts']:\n", + " plot_envs = plt.subplot(5, 2, i)\n", + " i += 1\n", + " v = trn_click[col].value_counts().reset_index()[:10]\n", + " fig = sns.barplot(x=v['index'], y=v[col])\n", + " for item in fig.get_xticklabels():\n", + " item.set_rotation(90)\n", + " plt.title(col)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "461\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "注:此处click_cnts直方图表示的是每篇文章对应用户的点击次数累计图\n", + "\n", + "也可以以用户角度分析,画出每个用户点击文章次数的直方图" + ] }, { - "data": { - "text/plain": [ - "" + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4 1084627\n", + "2 25894\n", + "1 2102\n", + "Name: click_environment, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"trn_click['click_environment'].value_counts()" ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从点击环境click_environment来看,仅有2102次(占0.19%)点击环境为1;仅有25894次(占2.3%)点击环境为2;剩余(占97.6%)点击环境为4。" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "print(item_df['category_id'].nunique()) # 461个文章主题\n", - "item_df['category_id'].hist()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(364047, 4)" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "item_df.shape # 364047篇文章" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 新闻文章embedding向量表示" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
article_idemb_0emb_1emb_2emb_3emb_4emb_5emb_6emb_7emb_8...emb_240emb_241emb_242emb_243emb_244emb_245emb_246emb_247emb_248emb_249
00-0.161183-0.957233-0.1379440.0508550.8300550.901365-0.335148-0.559561-0.500603...0.3212480.3139990.6364120.1691790.540524-0.8131820.286870-0.2316860.5974160.409623
11-0.523216-0.9740580.7386080.1552340.6262940.485297-0.715657-0.897996-0.359747...-0.4878430.8231240.412688-0.3386540.3207860.588643-0.5941370.1828280.397090-0.834364
22-0.619619-0.972960-0.207360-0.1288610.044748-0.387535-0.730477-0.066126-0.754899...0.4547560.4731840.377866-0.863887-0.3833650.137721-0.810877-0.4475800.805932-0.285284
33-0.740843-0.9757490.3916980.641738-0.2686450.191745-0.825593-0.710591-0.040099...0.2715350.0360400.480029-0.7631730.0226270.565165-0.910286-0.5378380.243541-0.885329
44-0.279052-0.9723150.6853740.1130560.2383150.271913-0.5688160.341194-0.600554...0.2382860.8092680.427521-0.615932-0.5036970.614450-0.917760-0.4240610.185484-0.580292
\n", - "

5 rows × 251 columns

\n", - "
" + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 678187\n", + "3 395558\n", + "4 38731\n", + "5 141\n", + "2 6\n", + "Name: click_deviceGroup, dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " article_id emb_0 emb_1 emb_2 emb_3 emb_4 emb_5 \\\n", - "0 0 -0.161183 -0.957233 -0.137944 0.050855 0.830055 0.901365 \n", - "1 1 -0.523216 -0.974058 0.738608 0.155234 0.626294 0.485297 \n", - "2 2 -0.619619 -0.972960 -0.207360 -0.128861 0.044748 -0.387535 \n", - "3 3 -0.740843 -0.975749 0.391698 0.641738 -0.268645 0.191745 \n", - "4 4 -0.279052 -0.972315 0.685374 0.113056 0.238315 0.271913 \n", - "\n", - " emb_6 emb_7 emb_8 ... emb_240 emb_241 emb_242 \\\n", - "0 -0.335148 -0.559561 -0.500603 ... 0.321248 0.313999 0.636412 \n", - "1 -0.715657 -0.897996 -0.359747 ... -0.487843 0.823124 0.412688 \n", - "2 -0.730477 -0.066126 -0.754899 ... 0.454756 0.473184 0.377866 \n", - "3 -0.825593 -0.710591 -0.040099 ... 0.271535 0.036040 0.480029 \n", - "4 -0.568816 0.341194 -0.600554 ... 
0.238286 0.809268 0.427521 \n", - "\n", - " emb_243 emb_244 emb_245 emb_246 emb_247 emb_248 emb_249 \n", - "0 0.169179 0.540524 -0.813182 0.286870 -0.231686 0.597416 0.409623 \n", - "1 -0.338654 0.320786 0.588643 -0.594137 0.182828 0.397090 -0.834364 \n", - "2 -0.863887 -0.383365 0.137721 -0.810877 -0.447580 0.805932 -0.285284 \n", - "3 -0.763173 0.022627 0.565165 -0.910286 -0.537838 0.243541 -0.885329 \n", - "4 -0.615932 -0.503697 0.614450 -0.917760 -0.424061 0.185484 -0.580292 \n", - "\n", - "[5 rows x 251 columns]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "item_emb_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(295141, 251)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "item_emb_df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 数据分析" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户重复点击" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:30:20.899771Z", - "start_time": "2020-11-13T15:30:20.750817Z" - } - }, - "outputs": [], - "source": [ - "#####merge\n", - "user_click_merge = trn_click.append(tst_click)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:30:26.290038Z", - "start_time": "2020-11-13T15:30:25.339579Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idcount
00307601
101575071
21637461
312891971
42361621
521684011
63361621
73506441
84398941
94425671
\n", - "
" + "source": [ + "trn_click['click_deviceGroup'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从点击设备组click_deviceGroup来看,设备1占大部分(61%),设备3占36%。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 测试集用户点击日志" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
024999916097415069591428204117113219192811506912747000259
124999916041715069591728204117113218192811506942089000173
2249998160974150695905606641121132552811506912747000259
3249998202557150695908606641121132453271506938401000219
4249997183665150695908861341171155773011500895686000256
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id click_timestamp click_environment \\\n", + "0 249999 160974 1506959142820 4 \n", + "1 249999 160417 1506959172820 4 \n", + "2 249998 160974 1506959056066 4 \n", + "3 249998 202557 1506959086066 4 \n", + "4 249997 183665 1506959088613 4 \n", + "\n", + " click_deviceGroup click_os click_country click_region \\\n", + "0 1 17 1 13 \n", + "1 1 17 1 13 \n", + "2 1 12 1 13 \n", + "3 1 12 1 13 \n", + "4 1 17 1 15 \n", + "\n", + " click_referrer_type rank click_cnts category_id created_at_ts \\\n", + "0 2 19 19 281 1506912747000 \n", + "1 2 18 19 281 1506942089000 \n", + "2 2 5 5 281 1506912747000 \n", + "3 2 4 5 327 1506938401000 \n", + "4 5 7 7 301 1500895686000 \n", + "\n", + " words_count \n", + "0 259 \n", + "1 173 \n", + "2 259 \n", + "3 219 \n", + "4 256 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " user_id click_article_id count\n", - "0 0 30760 1\n", - "1 0 157507 1\n", - "2 1 63746 1\n", - "3 1 289197 1\n", - "4 2 36162 1\n", - "5 2 168401 1\n", - "6 3 36162 1\n", - "7 3 50644 1\n", - "8 4 39894 1\n", - "9 4 42567 1" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#用户重复点击\n", - "user_click_count = user_click_merge.groupby(['user_id', 'click_article_id'])['click_timestamp'].agg({'count'}).reset_index()\n", - "user_click_count[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:34:27.418638Z", - "start_time": "2020-11-13T15:34:27.372761Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idcount
311242862957425410
311243862957626810
39376110323720594810
39376310323723568910
5769021348506946313
\n", - "
" + "source": [ + "tst_click = tst_click.merge(item_df, how='left', on=['click_article_id'])\n", + "tst_click.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
count518010.000000518010.0000005.180100e+05518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.0000005.180100e+05518010.000000
mean227342.428169193803.7925501.507387e+123.9473001.73828513.6284671.34820918.2502501.81961415.52178530.043586305.3249611.506883e+12210.966331
std14613.90718888279.3881773.706127e+080.3239161.0208586.6255641.7035247.0607981.08265733.95770256.868021110.4115135.816668e+0983.040065
min200000.000000137.0000001.506959e+121.0000001.0000002.0000001.0000001.0000001.0000001.0000001.0000001.0000001.265812e+120.000000
25%214926.000000128551.0000001.507026e+124.0000001.00000012.0000001.00000013.0000001.0000004.00000010.000000252.0000001.506970e+12176.000000
50%229109.000000199197.0000001.507308e+124.0000001.00000017.0000001.00000021.0000002.0000008.00000019.000000323.0000001.507249e+12199.000000
75%240182.000000272143.0000001.507666e+124.0000003.00000017.0000001.00000025.0000002.00000018.00000035.000000399.0000001.507630e+12232.000000
max249999.000000364043.0000001.508832e+124.0000005.00000020.00000011.00000028.0000007.000000938.000000938.000000460.0000001.509949e+123082.000000
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id click_timestamp click_environment \\\n", + "count 518010.000000 518010.000000 5.180100e+05 518010.000000 \n", + "mean 227342.428169 193803.792550 1.507387e+12 3.947300 \n", + "std 14613.907188 88279.388177 3.706127e+08 0.323916 \n", + "min 200000.000000 137.000000 1.506959e+12 1.000000 \n", + "25% 214926.000000 128551.000000 1.507026e+12 4.000000 \n", + "50% 229109.000000 199197.000000 1.507308e+12 4.000000 \n", + "75% 240182.000000 272143.000000 1.507666e+12 4.000000 \n", + "max 249999.000000 364043.000000 1.508832e+12 4.000000 \n", + "\n", + " click_deviceGroup click_os click_country click_region \\\n", + "count 518010.000000 518010.000000 518010.000000 518010.000000 \n", + "mean 1.738285 13.628467 1.348209 18.250250 \n", + "std 1.020858 6.625564 1.703524 7.060798 \n", + "min 1.000000 2.000000 1.000000 1.000000 \n", + "25% 1.000000 12.000000 1.000000 13.000000 \n", + "50% 1.000000 17.000000 1.000000 21.000000 \n", + "75% 3.000000 17.000000 1.000000 25.000000 \n", + "max 5.000000 20.000000 11.000000 28.000000 \n", + "\n", + " click_referrer_type rank click_cnts category_id \\\n", + "count 518010.000000 518010.000000 518010.000000 518010.000000 \n", + "mean 1.819614 15.521785 30.043586 305.324961 \n", + "std 1.082657 33.957702 56.868021 110.411513 \n", + "min 1.000000 1.000000 1.000000 1.000000 \n", + "25% 1.000000 4.000000 10.000000 252.000000 \n", + "50% 2.000000 8.000000 19.000000 323.000000 \n", + "75% 2.000000 18.000000 35.000000 399.000000 \n", + "max 7.000000 938.000000 938.000000 460.000000 \n", + "\n", + " created_at_ts words_count \n", + "count 5.180100e+05 518010.000000 \n", + "mean 1.506883e+12 210.966331 \n", + "std 5.816668e+09 83.040065 \n", + "min 1.265812e+12 0.000000 \n", + "25% 1.506970e+12 176.000000 \n", + "50% 1.507249e+12 199.000000 \n", + "75% 1.507630e+12 232.000000 \n", + "max 1.509949e+12 3082.000000 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": 
"execute_result" + } ], - "text/plain": [ - " user_id click_article_id count\n", - "311242 86295 74254 10\n", - "311243 86295 76268 10\n", - "393761 103237 205948 10\n", - "393763 103237 235689 10\n", - "576902 134850 69463 13" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "user_click_count[user_click_count['count']>7]" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:32:53.298575Z", - "start_time": "2020-11-13T15:32:53.285611Z" - } - }, - "outputs": [ + "source": [ + "tst_click.describe()" + ] + }, { - "data": { - "text/plain": [ - "array([ 1, 2, 4, 3, 6, 5, 10, 7, 13])" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "我们可以看出训练集和测试集的用户是完全不一样的\n", + "\n", + "训练集的用户ID由0 ~ 199999,而测试集A的用户ID由200000 ~ 249999。\n", + "\n", + "因此,也就是我们在训练时,需要把测试集的数据也包括在内,称为全量数据。\n", + "\n", + "!!!!!!!!!!!!!!!后续将对训练集和测试集合并分析!!!!!!!!!!!" ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "user_click_count['count'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1 1605541\n", - "2 11621\n", - "3 422\n", - "4 77\n", - "5 26\n", - "6 12\n", - "10 4\n", - "7 3\n", - "13 1\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#用户点击新闻次数\n", - "user_click_count.loc[:,'count'].value_counts() " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "###### 可以看出:有1605541(约占99.2%)的用户未重复阅读过文章,仅有极少数用户重复点击过某篇文章。 这个也可以单独制作成特征" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户点击环境变化分析" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:39:41.961797Z", - "start_time": 
"2020-11-13T15:39:41.949829Z" - } - }, - "outputs": [], - "source": [ - "def plot_envs(df, cols, r, c):\n", - " plt.figure()\n", - " plt.figure(figsize=(10, 5))\n", - " i = 1\n", - " for col in cols:\n", - " plt.subplot(r, c, i)\n", - " i += 1\n", - " v = df[col].value_counts().reset_index()\n", - " fig = sns.barplot(x=v['index'], y=v[col])\n", - " for item in fig.get_xticklabels():\n", - " item.set_rotation(90)\n", - " plt.title(col)\n", - " plt.tight_layout()\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:39:55.476626Z", - "start_time": "2020-11-13T15:39:48.764592Z" - } - }, - "outputs": [ + }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "50000" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#测试集中的用户数量为5w\n", + "tst_click.user_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:56:07.717463Z", + "start_time": "2020-11-13T15:56:07.693494Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tst_click.groupby('user_id')['click_article_id'].count().min() # 注意测试集里面有只点击过一次文章的用户" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 新闻文章信息数据表" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:20:34.183761Z", + "start_time": "2020-11-13T15:20:34.164770Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
click_article_idcategory_idcreated_at_tswords_count
0001513144419000168
1111405341936000189
2211408667706000250
3311408468313000230
4411407071171000162
3640423640424601434034118000144
3640433640434601434148472000463
3640443640444601457974279000177
3640453640454601515964737000126
3640463640464601505811330000479
\n", + "
" + ], + "text/plain": [ + " click_article_id category_id created_at_ts words_count\n", + "0 0 0 1513144419000 168\n", + "1 1 1 1405341936000 189\n", + "2 2 1 1408667706000 250\n", + "3 3 1 1408468313000 230\n", + "4 4 1 1407071171000 162\n", + "364042 364042 460 1434034118000 144\n", + "364043 364043 460 1434148472000 463\n", + "364044 364044 460 1457974279000 177\n", + "364045 364045 460 1515964737000 126\n", + "364046 364046 460 1505811330000 479" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#新闻文章数据集浏览\n", + "item_df.head().append(item_df.tail())" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAtIAAAFgCAYAAACWgJ5JAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAABB/UlEQVR4nO3dd7gsVZn+/e9NkCwSjijhcBQVBhREz4AoMyJgAEEURUBMiDJm+CkG1FcQ1NEZUREcESQqShJGFFQQREAUOCBBYJAsSXJOyuF+/6i1pWl26O7d3dXd+/5cV127K3TVs/ucZ9fTq1atkm0iIiIiIqI9C9QdQERERETEMEohHRERERHRgRTSEREREREdSCEdEREREdGBFNIRERERER1IIR0RERER0YEU0hERHZD0XklnN8w/KOn5U7xnjiRLWmiax75e0qbT2UfZz5QxR0TExFJID5A6T8zdkhNzzFS2l7R9bd1xtKNbMUt6raTfSnpA0l2SLpL0GUmLdiPOiGE3Cuf3GF8K6QE2k0/MdZK0kaSb6o4jYhhI2gY4DvgxsKrt5YBtgZWBVSZ4TwqDmNFG4VwZlRTS0TeSFqw7hohOSFpF0vGS7igtrvuPs40lvaC8XkzSPpJukHSfpLMlLTbOe95aumm8eIrjv6vs6y5Jn29at4Ckz0q6pqw/RtKyZd0vJX20afuLJW3dTsySXiHpHEn3lvdvVJYL+Cawl+2DbN8NYPtK2x+zfVXZbk9Jx0n6kaT7gfdKWlHSiZLulnS1pA80xHiYpC83zD/ly235zHaXdLmkeyQdmtbviKhDCumaDMCJedwTY1l3hqS9Jf2+XKo9RdLyZV07J+bDJH1P0smSHgJeI+lfyv7vlXSZpDc17OcwSd+VdFI57rmSVmv6PD4s6aqyfm9Jq5Xf4/5SQDyjYfstVF1ivrdss3bDuusl7SbpkvJ5Hi1pUUlLAL8EVlR16e1BSStO9lnGaFP1BfAXwA3AHGAl4Kgp3vYN4OXAK4FlgU8DTzTtd0fg68Cmtv88yfHXBL4HvAtYEViOqrV3zMeANwOvLuvvAb5b1v0E2L5pX6sCJ7Uas6SVyvZfLst3A34qaRaweonlp5N8FmO2omq5fhZwJNVneFOJ+W3AVyVt3MJ+xuwAvB5YDXgR8IU23hvRMwNwfn9TOb/eW863/9Kw7jOSbi7n0CslbdKN33lGs52pzxOwIHAx8C1gCWBRYEPgvcDZDdsZeEF5/V3gDKqT+IJUJ7tFqE7sBhYCdgSuHnvPJMdfCbgL2Jzqy9
Rry/yssv4M4Bqqk9NiZf5rZd27gd837GtN4F5gkXFiPgy4D3hVOc5SJb7PAc8ANgYeAFZv2P4uYL3y+xwJHNX0efwMeCawFvAYcBrwfGBp4HLgPWXbdYHbgfXL5/Ue4PqGOK8HzqM6iS8LXAF8sKzbCLip7v8nmQZjAjYA7gAWalo+br6W/+uPAOuMs6+xfN2t/H9duYXjf7EpD5YA/k5VgFP+727SsP65wD9KDi0FPETV5QLgK8Ahbcb8GeCHTct+XXJqw7KPRRvWHVX+JjwMvKss2xM4s2GbVYD5wFINy/4TOKy8Pgz4csO6p+Rkyd8PNsxvDlxT9/+VTJmo//z+opLzrwUWpvpCfDXVOXd14EZgxbLtHGC1uj+zYZ/SIl2P9agKuE/Zfsj2o7bPnmhjSQsA7wN2sX2z7fm2z7H9WMNmuwKfAjayffUUx38ncLLtk20/YftUYB7VyWjMobb/YvsR4BjgpWX5CcBLJa1a5ncAjm+KpdHPbP/e9hNlH0tSFeV/t306VUvf9g3bn2D7PNuPUxXSL23a33/Zvt/2ZcCfgVNsX2v7PqqW5HXLdjsD37d9bvm8DqcqvF/RsK/v2L7F1eXon49zrAioir4byv/JVixPdfK8ZpJtPgV813YrffFXpDr5AWD7IaovnGNWBU4orU/3UhXW84EVbD9A1Zq8Xdl2e6q8aifmVYFtxvZfjrEhVcE+FsdzG+LbzvazgAupioIxNza8XhG4u8Q35gaqQqJVjfu7oewzom51n9+3BU6yfartf1BdaVqMqjifT1WgrylpYdvX257s71S0IIV0Peo+MU92Yhzzt4bXD1MVwLRxYh7TfPK8sRTVY5pPnuMet8FtDa8fGWd+bPtVgU82/Y6r8NST7VTHioDq//BstX6D3J3Ao1RdDibyOuALkt7awv5upeGmPUmLU3XvaIxvM9vPapgWtX1zWf8TYHtJG1D9HfltmzHfSNUi3bj/JWx/DbgSuBnYuoXfww2vbwGWlbRUw7LZZV9Qtagt3rDuOePsr/FGxtllnxF1q/v8viLVeRWAcr69EVipFOG7Ul0hul3SUem6OH0ppOtR94l5shNjK1o5MY9pPnmuUr6Bj2k8eXbTjcBXmn7HxW3/pIX3eupNYgY5j6qY/ZqkJUpf+ldNtHE5cR0CfFPVDXULStpA0iINm10GvAH4buN9AhM4DthC0oblHoC9eOrf7gOAr4xdJZI0S9JWDetPpvpiuRdwdNMX2VZi/hGwpaTXl+WLqrr5b+Xyvk8Ce0j6gKRlVHkhsMIkn9GNwDnAf5b9rQ3sVI4FcBGwuaRlJT2H6uTf7COSVlZ1Y+XngaMn+xAj+qTu8/stVPkO/POG4FUo51nbP7a9YdnGVPdpxDSkkK5H3SfmCU+MLcY/5Yl5AudStfx+WtLCqm5w3JKpb9zqxEHAByWtX07sS0h6Y1ML2ERuA5aTtHQP4oohY3s+1f/TFwB/pbpBbtsp3rYbcClwPnA31cnqKX9vbV8MbAEcJGmzSY5/GfARquHlbqW6mbCxZWpf4ETgFEkPAH+kujdg7P2PAccDm5Z9tBVzKXq3orq34Q6qQuFTY7+P7aOBt1N1GbuRqjA4BjgQOHaS421P1UfzFqouY3vY/k1Z90OqfqbXA6cwfpH847LuWqrWvC+Ps01Ev9V9fj8GeKOkTSQtTPVF9zHgHEmrS9q47PtRqqu4rZ6/YyJ1dc6e6RNVS+z/UvUxvBP4DpPfjLAY8G2qb5X3AWeWZXPKdguV7eZSFYKbTXH89YHfUZ0w76DqrjG7rDsDeH/Dtk+Jqyw7uBz3X5uWN99s+OWm9WuV495HdbPVWxrWPWV7nn6D0T/3XebPBt7bMP9l4AcN82+gKgrupfrDdizl5iaqE/SmDdvuCfyoYf6Q8m9zL+XGjEyZMg3G1Jy/mTIN0jQA5/e3lPPrfeV8u1ZZvjZVof9AOff/Iue36U8qH25ERMRQkHQ91Zf930y1bUREL6VrR0
REzSTtoCfHLW+cLqs7toiImFhapEeUpB2A74+z6gbba/U7noiIiJi+nN8HSwrpiIiIiIgOtDo8y1BYfvnlPWfOnLrDiOi7Cy644E7bs+qOo13J2ZiphjFnk68xU02WryNVSM+ZM4d58+bVHUZE30m6YeqtBk9yNmaqYczZ5GvMVJPla242jIiIiIjoQArpiIiIiIgOpJCOiIiIiOhACumIiIiIiA6M1M2G3fTyTx1Rdwgx4i7473fXHcJISc5GryVnuyf5Gr3Wr3xNi3RERERERAdSSEdEREREdCCFdEREREREB1JIR0RERER0IIV0REREREQHUkhHRERERHSgp4W0pEMk3S7pzw3LlpV0qqSrys9lJnjve8o2V0l6Ty/jjIhKcjZieCRfI+rX6xbpw4A3NC37LHCa7RcCp5X5p5C0LLAHsD6wHrDHRH8MIqKrDiM5GzEsDiP5GlGrlgtpST9sZVkj22cCdzct3go4vLw+HHjzOG99PXCq7btt3wOcytP/WETEFCQ9U9JSrW6fnI0YHsnXiPq10yK9VuOMpAWBl3dwzBVs31pe/w1YYZxtVgJubJi/qSx7Gkk7S5onad4dd9zRQTgRo0fSv0q6FLgE+LOkiyV1kq+QnI0YJsnXiD6aspCWtLukB4C1Jd1fpgeA24GfTefgtg14mvs40PZc23NnzZo1nV1FjJKDgQ/bnmN7VeAjwKHT3WlyNmJ4JF8jem/KQtr2f9peCvhv288s01K2l7O9ewfHvE3ScwHKz9vH2eZmYJWG+ZXLsohozXzbZ43N2D4beLzDfSVnI4ZH8jWij1ru2mF7d0krSXqlpH8fmzo45onA2B3C72H8Vu1fA6+TtEy5AeJ1ZVlEtOZ3kr4vaSNJr5b0P8AZkl4m6WVt7is5GzE8kq8RfbRQqxtK+hqwHXA5ML8sNnDmJO/5CbARsLykm6juEv4acIyknYAbgLeXbecCH7T9ftt3S9obOL/sai/bzTdURMTE1ik/92havi5V3m483puSsxHDI/kaUb+WC2ngLcDqth9r9Q22t59g1SbjbDsPeH/D/CHAIW3EFxGF7dd0+L7kbMSQSL5G1K+dQvpaYGGg5UI6Iuoh6YvjLbe9V79jiYiIGFXtFNIPAxdJOo2GYtr2x7seVURM10MNrxcFtgCuqCmWiIiIkdROIX1imSJiwNnep3Fe0jfIzUQRERFd1XIhbftwSYsBs21f2cOYIqL7Fqca4ioiIiK6pJ1HhG8JXAT8qsy/VFJaqCMGkKRLJV1SpsuAK4Fv1xxWRETESGmna8eewHrAGQC2L5L0/B7EFBHTt0XD68eB22x3+kCWiIiIGEfLLdLAP2zf17TsiW4GExHdYfsG4FnAllRDV65Za0AREREjqJ1C+jJJ7wAWlPRCSfsB5/QoroiYBkm7AEcCzy7TkZI+Vm9UERERo6WdQvpjwFpUQ9/9BLgf2LUHMUXE9O0ErG/7i7a/CLwC+EDNMUVERIyUdkbteBj4fJkiYrAJmN8wP78si4iIiC5puZCWNBf4HDCn8X221+5+WBExTYcC50o6ocy/GTi4vnAiIiJGTzujdhwJfAq4lNxkGDGwJC0A/JFqhJ0Ny+Idbf+ptqAiIiJGUDuF9B22M250xICz/YSk79peF7iw7ngiIiJGVTuF9B6SfgCcRnXDIQC2j+96VBExXadJeitwvG3XHUxERMQoaqeQ3hFYA1iYJ7t2GEghHTF4/gP4BPC4pEepbjS07WfWG1ZERMToaKeQ/lfbq3fjoJJWB45uWPR84Iu2v92wzUbAz4DryqLjbe/VjeNHjDrbS3VrX8nXiOGSnI3on3YK6XMkrWn78uke1PaVwEsBJC0I3AycMM6mZ9neYpzlETGOkk+L2X6wzL8CeEZZ/SfbD7S7z+RrxHBJzkb0TzuF9CuAiyRdR9VHeuxS8XSHv9sEuKY80jgipufrwO3Af5X5nwB/BhaluvHwM9Pcf/I1YrgkZyN6qJ1C+g09im
E7qpP9eDaQdDFwC7Cb7cuaN5C0M7AzwOzZs3sUYsTQ2AT414b5e21vKUnAWV3Y/7TyFZKzEX2Wc2xED7X8iPDybfYm4B9UNxmOTR2T9AzgTcCx46y+EFjV9jrAfsD/ThDXgbbn2p47a9as6YQTMQoWsP14w/xnoLp0BCw5nR13I19LLMnZiAlIWkZSVx50lnNsRO+1XEhL+hhwG3AqcFKZfjHN428GXGj7tuYVtu8f6+dp+2RgYUnLT/N4EaPuGZL+eaOh7VMAJC1N1b1jOpKvET0g6QxJz5S0LFWBe5Ckb3Zh18nZiB5ruZAGdgFWt72W7ZeUabrfmrdngktOkp5TLkcjab0S613TPF7EqDsIOFrSP6/BSlqVKs9+MM19J18jemNp2/cDWwNH2F4f2LQL+03ORvRYO32kbwTu69aBJS0BvJZqvNuxZR8EsH0A8DbgQ5IeBx4BtsuDJSImZ/ubkh4Gzi45JuAB4Gu2v9fpfpOvET21kKTnAm8HPt+NHSZnI/qjnUL6WuAMSSfx1CcbdnT5yfZDwHJNyw5oeL0/sH8n+46YyUoeHTDWxaOTIe/G2WfyNaJ39gJ+Dfze9vmSng9cNZ0dJmcj+qOdQvqvZXoGT45LGxEDSNIKwFeBFYHNJK0JbGD74Hoji4hmto+l4YZA29cCb60voohoVcuFtO0vAUhassw/2KugImLaDgMO5cnLxH+hetJZCumIASNpZaqRM15VFp0F7GL7pvqiiohWtDNqx4sl/Qm4DLhM0gWS1updaBExDcvbPgZ4AqAMiTe/3pAiYgKHAidSXUFaEfh5WRYRA66dUTsOBD5he1XbqwKfpBohICIGz0OSlqOM9V4eFd61m4Ujoqtm2T7U9uNlOgzIoM0RQ6CdPtJL2P7t2IztM8pdwRExeD5B1cK1mqTfU52U31ZvSBExgbskvZMnh6rbngxFFzEU2hq1Q9L/B/ywzL+TaiSPiBgwti+U9Gpgdaoh8K60/Y+aw4qI8b2Pqo/0t6iuIp0D7FhrRBHRkna6dryPqlXreOCnwPJlWUQMGEkfAZa0fZntPwNLSvpw3XFFxNPZvsH2m2zPsv1s22+2/dex9ZJ2rzO+iJhYS4W0pAWB421/3PbLbL/c9q627+lxfBHRmQ/YvndspuTqB+oLJyKmYZu6A4iI8bVUSNueDzwhaekexxMR3bHg2ON/4Z9fhjP+e8Rw0tSbREQd2ukj/SBwqaRTgYfGFtr+eNejiojp+hVwtKTvl/n/KMsiYvjk0d0RA6qdQvr4MkXE4PsMVfH8oTJ/KvCD+sKJiGlIi3TEgGrnyYaH9zKQiOge208A3ytTRAwwScvavrtp2fNsX1dmjx3nbRExAKYspCUdY/vtki5lnMtLttfuSWQR0bbka8RQ+rmkzWzfDyBpTeAY4MUAtr9aZ3ARMbFWWqR3KT+36GUgEdEVydeI4fNVqmL6jVRjvx8B7FBvSBHRiikLadu3lpdvBY6yfUtvQ4qITiVfI4aP7ZMkLQycAiwFvMX2X2oOKyJa0M7NhksBp0q6GzgaONb2bb0JKyKmKfkaMeAk7cdTu2AtDVwDfFRSRsWKGALt3Gz4JeBLktYGtgV+J+km25t2cmBJ1wMPAPOBx23PbVovYF9gc+Bh4L22L+zkWBEzTbfzFZKzET0wr2n+gm7tOPka0R/ttEiPuR34G3AX8OxpHv81tu+cYN1mwAvLtD7V6APrT/N4ETNNN/MVkrMRXTM2GpakJYBHy8PPxh6gtEgXDpF8jeixlp5sCCDpw5LOAE4DlqN6BHEvRwDYCjjClT8Cz5L03B4eL2Jk1JCvkJyN6NRpwGIN84sBv+nxMZOvEV3QTov0KsCuti/q0rENnCLJwPdtH9i0fiXgxob5m8qyWxs3krQzsDPA7NmzuxRaxNDrdr5CcjaiVxa1/eDYjO0HJS0+zX0mXyP6oOUWadu7Uz0ifEVJs8emaRx7Q9svo7q89BFJ/97JTmwfaHuu7bmzZs
2aRjgRo6Pk65KSdgSQNEvS86a52+RsRG88JOllYzOSXg48Ms19Jl8j+qDlFmlJHwX2BG4DniiLDXR0udj2zeXn7ZJOANYDzmzY5GaqVrUxK5dlETEFSXsAc6nGpD0UWBj4EfCqTveZnI3omV2BYyXdQvU48OdQ3STcseRrRH+03CJNleir217L9kvK1FERLWkJSUuNvQZeB/y5abMTgXer8grgvoYxciNicm8B3gQ8BFDGk16q050lZyN6x/b5wBrAh4APAv9iu+MRPJKvEf3TTh/pG4H7unTcFYATqtF3WAj4se1fSfoggO0DgJOphuW5mmponh27dOyImeDvtl36R46dTKcjORvRZZI2tn26pK2bVr2ojCN9fIe7Tr5G9Ek7hfS1wBmSTgIeG1to+5vtHtT2tcA64yw/oOG1gY+0u++IAOAYSd+nuhP/A8D7gIM63VlyNqInXg2cDmw5zjoDHRXSydeI/mmnkP5rmZ5RpogYULa/Iem1wP1U/aS/aPvUmsOKiAa29yg/0xocMaTafbIhkha3/XDvQoqIbiiFc4rniAEl6ROTre/kim9E9Fc7o3ZsABwMLAnMlrQO8B+2P9yr4CKiPZIeoLokPC7bz+xjOBExucluAJ4wjyNicLTTtePbwOup7vTF9sWdjksZEb1he+xO/b2pHqzwQ6rhtHYA8tSyiAHScKX3cGAX2/eW+WWAfWoMLSJa1M7wd9i+sWnR/C7GEhHd8ybb/2P7Adv32/4e1SOBI2LwrD1WRAPYvgdYt75wIqJV7RTSN0p6JWBJC0vaDbiiR3FFxPQ8JGkHSQtKWkDSDpQxpSNi4CxQWqEBkLQs7V0xjoiatJOoHwT2BVaievrRKWTonIhB9Q6qfN2Xqq/l78uyiBg8+wB/kHRsmd8G+EqN8UREi9oZteNOqn6W45K0u+3/7EpUETEttq9nkq4cydeIwWH7CEnzgI3Loq1tX15nTBHRmrb6SE9hmy7uKyJ6K/kaMUBsX257/zKliI4YEt0spNXFfUVEbyVfIyIipqmbhXTGvIwYHsnXiIiIaUqLdMTMlHyNiIiYppYL6TIcT/Oy5zXMHtu8PiLqkXyNiIjovXZapH8u6Z+PF5a0JvDzsXnbX+1mYBExLcnXiIiIHmunkP4q1cl5SUkvp2rRemdvwoqIaUq+RkRE9Fg740ifJGlhqgexLAW8xfZfehZZRHQs+RoREdF7UxbSkvbjqXf4Lw1cA3xUErY/3u5BJa0CHAGsUPZ9oO19m7bZCPgZcF1ZdLztvdo9VsRMknyNiORsRP+00iI9r2n+gi4c93Hgk7YvlLQUcIGkU8cZhP4s21t04XgRM0XyNSKSsxF9MmUhbftwAElLAI/anl/mFwQW6eSgtm8Fbi2vH5B0BbASkKc5RUxD8jUikrMR/dPOzYanAYs1zC8G/Ga6AUiaA6wLnDvO6g0kXSzpl5LWmuD9O0uaJ2neHXfcMd1wIkbFQOZr2UdyNqJPco6N6K12CulFbT84NlNeLz6dg0taEvgpsKvt+5tWXwisansdYD/gf8fbh+0Dbc+1PXfWrFnTCSdilAxkvpZYkrMRfZBzbETvtVNIPyTpZWMzZUitRzo9cBlR4KfAkbaPb15v+/6xQsD2ycDCkpbv9HgRM0zyNWIGS85G9EfLw98BuwLHSrqF6vHCzwG27eSgkgQcDFxh+5sTbPMc4DbblrQeVdF/VyfHi5iBdiX5GjEjJWcj+qedcaTPl7QGsHpZdKXtf3R43FcB7wIulXRRWfY5YHY51gHA24APSXqcqiVtO9seZ18R0ST5GjGjJWcj+qSVcaQ3tn26pK2bVr2ojEv7tEtGU7F9NlUr2WTb7A/s3+6+I2ay5GtEJGcj+qeVFulXA6cDW46zzkDbJ+aI6Jnka0RERJ+0Mo70HuXnjr0PJyKmI/kaERHRP6107fjEZOsnupEhIvov+RoREdE/rXTtWGqSdbkxIWKwJF8jIiL6pJWuHV8CkHQ4sI
vte8v8MsA+PY0uItqSfI2IiOifdh7IsvbYSRnA9j1Ujx2NiMGTfI2IiOixdgrpBUqrFgCSlqW9B7pERP8kXyMiInqsnRPrPsAfJB1b5rcBvtL9kCKiC5KvERERPdbOkw2PkDQP2Lgs2tr25b0JKyKmI/kaERHRe21d6i0n4pyMI4ZA8jUiIqK32ukjHRERERERRQrpiIiIiIgOpJCOiIiIiOhACumIiIiIiA6kkI6IiIiI6EBthbSkN0i6UtLVkj47zvpFJB1d1p8raU4NYUZEkZyNGB7J14j+qKWQlrQg8F1gM2BNYHtJazZtthNwj+0XAN8Cvt7fKCNiTHI2YngkXyP6p64W6fWAq21fa/vvwFHAVk3bbAUcXl4fB2wiSX2MMSKelJyNGB7J14g+qauQXgm4sWH+prJs3G1sPw7cByzXl+giollyNmJ4JF8j+qStJxsOIkk7AzuX2QclXVlnPDPc8sCddQcxLPSN93Rzd6t2c2e9lJwdGMnXNs3EnE2+DpTkbBv6la91FdI3A6s0zK9clo23zU2SFgKWBu5q3pHtA4EDexRntEHSPNtz644jeiI5O2KSryMt+TqCkrODqa6uHecDL5T0PEnPALYDTmza5kRg7OvE24DTbbuPMUbEk5KzEcMj+RrRJ7W0SNt+XNJHgV8DCwKH2L5M0l7APNsnAgcDP5R0NXA31R+CiKhBcjZieCRfI/pH+QIa3SJp53IZMCIGXPI1YrgkZwdTCumIiIiIiA7kEeERERERER1IIR0RERER0YEU0hERERERHUghHV0h6Yi6Y4iIiIjop6F/smH0n6Tm8UgFvEbSswBsv6nvQUVERyTtaPvQuuOIiBhGGbUj2ibpQuBy4AeAqQrpn1DGIbX9u/qii4h2SPqr7dl1xxERTyrn2eOBn9i+pu54YmJpkY5OzAV2AT4PfMr2RZIeSQEdMZgkXTLRKmCFfsYSES1ZBngW8FtJf6NqrDra9i21RhVPkxbp6JiklYFvAbcBb0qrVsRgknQb8HrgnuZVwDm2V+x/VBExEUkX2n5Zef1vwPbA1sAVVK3UeTDLgEiLdHTM9k3ANpLeCNxfdzwRMaFfAEvavqh5haQz+h5NRLTM9lnAWZI+BrwW2BZIIT0g0iIdERERMUAkHWV7u7rjiKll+LuIiIiIATJZES1px37GEpNLi3RERETEkMhIO4MlLdIxLknntLn9RpJ+0at4ImJiydeI0SLpkgmmS8lIOwMlNxvGuGy/su4YIqI1ydeIkbMCk4y00/9wYiJpkY5xSXqw/NxI0hmSjpP0f5KOlKSy7g1l2YVUw/KMvXcJSYdIOk/SnyRtVZbvK+mL5fXrJZ0pKf8HI6Yp+RoxcsZG2rmhaboeOKPe0KJRWqSjFesCawG3AL8HXiVpHnAQsDFwNXB0w/afB063/b7y2PDzJP0G2B04X9JZwHeAzW0/0b9fI2JGSL5GDDnbO02y7h39jCUml9aFaMV5tm8qJ9GLgDnAGsB1tq9ydcfqjxq2fx3wWUkXUX1zXhSYbfth4APAqcD+eexpRE8kXyMi+iQt0tGKxxpez2fq/zcC3mr7ynHWvQS4C8iT1CJ6I/kaEdEnaZGOTv0fMEfSamV++4Z1vwY+1tA3c93yc1Xgk1SXnjeTtH4f442YyZKvEQMkI+2MjhTS0RHbjwI7AyeVm5dub1i9N7AwcImky4C9y0n6YGA327cAOwE/kLRon0OPmHGSrxGDJSPtjI48kCUiIiKijyQ9aHtJSRsBewJ3Ai8GLgDeaduS3gB8G3gYOBt4vu0tJC0B7Fe2XxjY0/bPJO0L3GV7L0mvp7qReKPcJNxb6SMdERERUZ+MtDPE0rUjIiIioj4ZaWeIpUU6IiIioj4ZaWeIpUU6IiIiYrBkpJ0hkUI6IiIiYoBkpJ3hkVE7IiIiIiI6kBbpiIiIiIgOpJCOiIiIiOhACumIiIiIiA6kkI6IiIiI6EAK6YiIiIiIDqSQjoiIiIjoQA
rpiIiIiIgOpJCOiIiIiOhACumIiIiIiA6kkI6IiIiI6EAK6YiIiIiIDqSQjoiIiIjoQArpESHpvZLObph/UNLzp3jPHEmWtFDvI4yIiQxr/kraQdIpdR0/olWDnGOSXiXpqhLTm3t5rOi+FFAjyvaSdcfQLZLmANcBC9t+vOZwInpuWPLX9pHAkXXHEdGuAcuxvYD9be9bdyDNJO0JvMD2O+uOZVClRTpGQlrVI9qXvInorRZzbFXgsm7tX9KC091HtC6F9BCStIqk4yXdIekuSfuPs40lvaC8XkzSPpJukHSfpLMlLTbOe94q6XpJL57i+BtKOkfSvZJulPTesnxpSUeUuG6Q9AVJC5R1e0r6UcM+nnLJTNIZkvaW9HtJD0g6RdLyZfMzy897y6WvDcplut9L+paku4C9JN0t6SUNx3i2pIclzWrn843opTrztyHvdpL0V+D0svx9kq6QdI+kX0tateE9r5N0ZTn2/0j6naT3l3XNl8tfKen8su35kl7ZsG6yHI/ommHKMUnXAM8Hfl7Ob4uUc+nBkm6VdLOkL6sUx+Oc+/aUdJik70k6WdJDwGskrSjpp+UzuE7Sxxti3FPScZJ+JOl+4L0T/C5vAD4HbFtiu1jSNpIuaNruE5J+Vl4fJukASaeWPP9d09+TNcq6u8vflbdP9FkOixTSQ6Yk0y+AG4A5wErAUVO87RvAy4FXAssCnwaeaNrvjsDXgU1t/3mS468K/BLYD5gFvBS4qKzeD1ia6o/Cq4F3Azu2+KsBvKNs/2zgGcBuZfm/l5/Psr2k7T+U+fWBa4EVgL2pPofGy0/bA6fZvqONGCJ6pu78bfBq4F+A10vaiupkuTVVTp8F/KTsd3ngOGB3YDngyhLHeL/bssBJwHfKtt8ETpK0XMNmE+V4RFcMW47ZXg34K7BlOb89BhwGPA68AFgXeB3w/oZ9N577vlKWvaO8Xgo4B/g5cHH5/TcBdpX0+oZ9bEWV289igu5Ztn8FfBU4usS2DnAi8DxJ/9Kw6buAIxrmd6A6Jy9PVR8cCSBpCeBU4MdUfwO2A/5H0prjfoLDwnamIZqADYA7gIWalr8XOLth3lRJuADwCLDOOPuaU7bbDbgcWLmF4+8OnDDO8gWBvwNrNiz7D+CM8npP4EfjHHuhMn8G8IWG9R8GfjXetg2/71+bYlif6g+Syvw84O11/5tlyjQ2DUD+jr3n+Q3Lfgns1DC/APAw1eXmdwN/aFgn4Ebg/c1xU51Mz2s63h+A95bXE+Z4pkzdmoYtx8r89VQFOlTF8WPAYg3bbw/8tuH3aD73HQYc0TC//jjb7A4cWl7vCZzZ4ue5Jw3n7rLse8BXyuu1gHuARRpiOaph2yWB+cAqwLbAWU37+j6wR93/b6YzpUV6+KwC3ODWb7pbHlgUuGaSbT4FfNf2TS0ef7x9LQ8sTNUKMOYGqm/Drfpbw+uHqRJwMjc2ztg+t7xvI0lrUP2RPLGN40f0Wt35O6Yxd1YF9lXVVete4G6qgnklYMXGbV2d+SY6zoo8Nf/h6X8D2s3xiHYNW441W5XqXHprw/bfp2rBHW/fEx1vxbH3l318jqpIn2wfrToceIckUX2BPsZVS/rT9m37Qarfd8US1/pNce0APGcasdQuHcyHz43AbEkLtfiH4k7gUWA1qss843kd8CtJf7P90xaOv94Ex/kHVaJcXpbNBm4urx8CFm/Yvp3EcRvLD6fq3vE34Djbj7ZxnIheqzt/xzTmzo1UrUtPu7wr6YXAyg3zapxvcgtV/jeaDfyqxZgiumGocmwcN1K1SC8/Sfzjnfuaj3ed7Re2GN9knrad7T9K+jvwb1RdSt7RtMkqYy8kLUnVXeaWEtfvbL+2xWMPhbRID5/zgFuBr0laQtKikl410ca2nwAOAb5Zbj5YUNXNeos0bHYZ8Abgu5LeNMXxjwQ2lfR2SQtJWk7SS23PB44BviJpqdKX+hPA2A2GFwH/Lmm2pKWpLjO16g6q/m
qTjvlZ/Ah4C1UxfcQU20b0W935O54DgN0lrQX/vGl4m7LuJOAlkt6s6sbgjzDxl+CTgRdJekf527AtsCZVf9WIfhm2HGuO51bgFGAfSc+UtICk1SS9uo3jnQc8IOkzqm6kXFDSiyX9awex3wbMURk4oMERwP7AP2yf3bRuc1WDEjyDqq/0H23fSPW34EWS3iVp4TL9a1N/66GTQnrIlIJ1S6puC3+lusy67RRv2w24FDif6hLL12n6t7d9MbAFcJCkzSY5/l+BzYFPln1dBKxTVn+MquX5WuBsqhsKDinvOxU4GrgEuIA2Tq62H6a6ieL35XLQKybZ9kbgQqpv0We1eoyIfqg7fyeI6YSyz6NU3cH/Z2Czsu5OYBvgv4C7qArjeVQtZs37uavE8Mmy7aeBLco+Ivpi2HJsAu+muhn3cqr+x8cBz23jePNLrC+legbDncAPqAYDaNex5eddki5sWP5D4MU82VjW6MfAHlSf5cspgwDYfoCqdX87qhbqv1F9LouMs4+hMXZTVsTIkHQIcIvtL9QdS8QoKa1SNwE72P5t3fFERD1UDQ94O/Ay21c1LD8MuGkmnX/TRzpGiqqnIG5NNWRQRExTGTLrXKqRDT5FdZPUH2sNKiLq9iHg/MYieqZK1454Gkk7qBp8vXnq6MlL/SJpb6pLZv9t+7q644moQw/ydwOqEQ3upLpk/mbbj3Qt4IghM6znyIlI+uUEv8/nJtj+emAXqm5cM166dkREREREdCAt0hERERERHRipPtLLL7+858yZU3cYEX13wQUX3Gl7Vt1xtCs5GzPVMOZs8jVmqsnydaQK6Tlz5jBv3ry6w4joO0nNT5QbCsnZmKmGMWeTrzFTTZav6doRERExA0n6f5Iuk/RnST+RtGjdMUUMmxTSERERM4yklYCPA3NtvxhYkOpBGRHRhhTSERERM9NCwGLl8e+LUz1tLiLaMFJ9pLvp5Z86ou4QYsRd8N/vrjuEkZKcjV4bpZy1fbOkb1A9RvsR4BTbpzRvJ2lnYGeA2bNnt7Tv5GJ7Run/1UyUFumIiIgZRtIywFbA84AVgSUkvbN5O9sH2p5re+6sWUM1yEhEX6SQjoiImHk2Ba6zfYftfwDHA6+sOaaIoZNCOiIiYub5K/AKSYtLErAJcEXNMUUMnRTSERERM4ztc4HjgAuBS6nqgQNrDSpiCOVmw4iIiBnI9h7AHnXHETHM0iIdEREREdGBnhbSkg6RdLukPzcsW1bSqZKuKj+XmeC97ynbXCXpPb2MMyIqydmIiIjW9bpF+jDgDU3LPgucZvuFwGll/ikkLUt1uWl9YD1gj4lO3hHRVYeRnI2IiGhJTwtp22cCdzct3go4vLw+HHjzOG99PXCq7btt3wOcytNP7hHRZcnZiOEj6UWSThu7kiRpbUlfqDuuiJmgjj7SK9i+tbz+G7DCONusBNzYMH9TWfY0knaWNE/SvDvuuKO7kUYEJGcjBt1BwO7APwBsXwJsV2tEETNErTcb2jbgae4jT12K6JPkbMRAWtz2eU3LHq8lkogZpo5C+jZJzwUoP28fZ5ubgVUa5lcuyyKi/5KzEYPtTkmrUb7kSnobcOvkb4mIbqijkD4RGLuj/z3Az8bZ5tfA6yQtU25Yel1ZFhH9l5yNGGwfAb4PrCHpZmBX4IO1RhQxQ/R6+LufAH8AVpd0k6SdgK8Br5V0FbBpmUfSXEk/ALB9N7A3cH6Z9irLIqKHkrMRw8f2tbY3BWYBa9je0PYNdccVMRP09MmGtrefYNUm42w7D3h/w/whwCE9Ci0ixpGcjRg+kpajGn5yQ8CSzqb6MntXvZFFjL482TAiImK4HQXcAbwVeFt5fXStEUXMED1tkY6IiIiee67tvRvmvyxp29qiiZhB0iIdEREx3E6RtJ2kBcr0dnKzb0RfpJCOiIgYbh8Afgw8VqajgP+Q9ICk+2uNLGLEpWtHRETEELO9VN0xRMxUaZGOiIgYYpJ+KmlzSTmnR/
RZy0kn6SW9DCQiIiI68j1gB+AqSV+TtHrdAUXMFO18e/0fSedJ+rCkpXsWUURERLTM9m9s7wC8DLge+I2kcyTtKGnheqOLGG0tF9K2/43qG+8qwAWSfizptT2LLCIiIlpSHsryXqqHJP0J2JeqsD51kvc8S9Jxkv5P0hWSNuhLsBEjpK2bDW1fJekLwDzgO8C6kgR8zvbxvQgwIjojaSVgVRry3PaZ9UUUEb0g6QRgdeCHwJa2by2rjpY0b5K37gv8yvbbJD0DWLzHoUaMnJYLaUlrAzsCb6T6hrul7QslrQj8AUghHTEgJH0d2Ba4HJhfFhtIIR0xeg6yfXLjAkmL2H7M9tzx3lC6aP47VSs2tv8O/L3XgUaMmnZapPcDfkDV+vzI2ELbt5RW6ogYHG8GVrf9WN2BRETPfRk4uWnZH6i6dkzkeVSPEj9U0jrABcAuth/qTYgRo6mlPtKSFgRutv3DxiJ6jO0fdj2yiJiOa4HcZBQxwiQ9R9LLgcUkrSvpZWXaiKm7aSxEVWh/z/a6wEPAZ8c5xs6S5kmad8cdd3T5N4gYfi21SNueL2kVSc8ol38iYrA9DFwk6TSqJ50BYPvj9YUUEV32eqquGSsD+wAqy+8HPjfFe28CbrJ9bpk/jnEKadsHAgcCzJ0719MPOWK0tNO14zrg95JOpPrmCoDtb3Y9qoiYrhPLFBEjyvbhwOGS3mr7pxNtJ+k9ZdvG9/5N0o2SVrd9JbAJ1T0VEdGGdgrpa8q0ADD2ONJ8O40YQLYPL3fhv6gsutL2P+qMKSJ6Y7IiutgFOHyc5R8Djix/K66lGlAgItrQTiF9ue1jGxdI2qaTg5anLh3dsOj5wBdtf7thm42An1G1hAMcb3uvTo4XMdOU/Dmc6uEMAlYprVJtj9qRfI0Yehpvoe2LgHFH9YiI1rRTSO8OHNvCsimVy0gvhSdvZAROGGfTs2xv0e7+I4J9gNeVXEPSi4CfAC9vd0fJ14ihl6vHET0yZSEtaTNgc2AlSd9pWPVM4PEuxLAJcI3tG7qwr4ioLDxWRAPY/kuXHhWcfI0YPuO2SEfE9LUy/N0tVE8yfJRqnMmx6USqO4anazuqlrLxbCDpYkm/lLTWeBtkaJ6Icc2T9ANJG5XpIKo8nq5p5SskZyO6SdICkt4+xWa/70swETPQlC3Sti8GLpb0427frFRucHgTVReRZhcCq9p+UNLmwP8CLxwnvgzNE/F0HwI+AowNd3cW8D/T2WE38hWSsxHdZPsJSZ8Gjplkm4/2MaSIGaWlB7IU60k6VdJfJF0r6TpJ107z+JsBF9q+rXmF7fttP1henwwsLGn5aR4vYkYojwb+pu2ty/StLjzlMPkaMZh+I2m38ryHZcemuoOKmAnaudnwYOD/UXXrmN+l42/PBJeJJT0HuM22Ja1HVfTf1aXjRowkScfYfrukSxnnBiPba09j98nXiMG0bfn5kYZlphphJyJ6qJ1C+j7bv+zWgSUtAbwW+I+GZR8EsH0A8DbgQ5IeBx4BtrOdy8ARk9ul/Ozq6BnJ14jBZft5dccQMVO1U0j/VtJ/A8fz1EcOX9jJgW0/BCzXtOyAhtf7A/t3su+Imcr2reVnV0fVSL5GDC5JiwOfAGbb3lnSC4HVbf+i5tAiRl47hfT65Wfj4O0GNu5eOBHRDZIe4OldO+6jGrnjk7ane39DRAyOQ6m6Xb6yzN9M9YyHFNIRPdZyIW37Nb0MJCK66tvATcCPqcaQ3Q5YjWp0jUOAjeoKLCK6bjXb20raHsD2w5IydnREH7RcSEv64njL8xjgiIH0JtvrNMwfKOki25+R9LnaooqIXvi7pMUoV6EkrUZDF8yI6J12hr97qGGaTzUU1pwexBQR0/ewpLeXhzWMPbDh0bIuNwFGjJY9gF8Bq0g6EjgN+HS9IUXMDO107dincV7SN4Bfdz2iiOiGHYB9qR7CYuCPwDtLq1UezhAxIiQtACwDbA28gqor1y6276w1sIgZop
2bDZstDqzcrUAionvKzYRbTrD67H7GEhG9M/ZkQ9vHACfVHU/ETNNy1w5Jl0q6pEyXAVdS3dAUEQNG0osknSbpz2V+bUlfqDuuiOiJPNkwoibttEg3PuDhcaqnmD3e5XgiojsOAj4FfB/A9iWSfgx8udaoIqIX8mTDiJq000f6BknrAP9WFp0JXNKTqCJiuha3fV7TCFj54hsxYkof6c/aPrruWCJmona6duwCHAk8u0xHSvpYrwKLiGm5swyBNTYc1tuAW+sNKSK6zfYTVFefIqIG7XTt2AlYvzwqGElfB/4A7NeLwCJiWj4CHAisIelm4DqqkTwiYvT8RtJuwNFUQ9QCYPvu+kKKmBnaKaRFNX70mPllWUQMEEkLAh+2vamkJYAFbD9Qd1wR0TMd95Eufy/mATfb3mKq7SPiqdoppA8FzpV0Qpl/M3Bw1yOKiGmxPV/ShuX1Q1NtHxHDzfbzpvH2XYArgGd2KZyIGaWdmw2/KekMYMOyaEfbf+pJVBExXX+SdCJwLE+91Ht8fSFFRC9IWhz4BDDb9s6SXgisbvsXU7xvZeCNwFfK+yOiTS0X0pJeAVxm+8Iy/0xJ69s+t2fRRUSnFgXuAjZuWGYghXTE6DkUuAB4ZZm/mepL9KSFNNWzID4NLDXRBpJ2BnYGmD179nTjjBg57XTt+B7wsob5B8dZFhEDwPaOk62XtLvt/+xXPBHRU6vZ3lbS9gC2H1bT2JfNJG0B3G77AkkbTbSd7QOpblxm7ty57l7IEaOh5eHvANn+ZxKVIXc6fsS4pOvL0xIvkjRvnPWS9B1JV5enKaZgj+iebdp9Q3I2YmD9XdJiPDnc5WrAY1O851XAmyRdDxwFbCzpRz2NMmIEtVMIXyvp41St0AAfBq6d5vFfY/vOCdZtBrywTOuX464/zeNFRKXTEXeSsxGDZw/gV8Aqko6kKpLfO9kbbO8O7A5QWqR3s/3OnkYZMYLaaZH+IFX/q5uBm6hOkDv3IqhiK+AIV/4IPEvSc3t4vIiZpBeXaJOzEX0k6VXl5ZnA1lTF80+AubbPqCmsiBml5ULa9u22t7P9bNsr2H6H7dvH1kvavc1jGzhF0gXlZoZmKwE3NszfVJY9haSdJc2TNO+OO+5oM4SIGauTFunkbMRg+U75+Qfbd9k+yfYvJrlqNC7bZ2QM6YjOdNzHeRzbAO3cvLSh7ZslPRs4VdL/2T6z3YPmRoiIp5O0bPNTzSQ9z/Z1ZfbYDnabnI0YLP+QdCCwsqTvNK+0/fEaYoqYUdrp2jGVtlq4bN9cft4OnACs17TJzcAqDfMrl2URMbWfS/rnAxYkrQn8fGze9lfb3WFyNmLgbAGcDjxCNfxd8xQRPdbNFumWW5YaH1tcXr8O2KtpsxOBj0o6iqo/9n22b+1atBGj7atUxfQbgdWBI4AdOt1ZcjZi8JQuHEdJusL2xXXHEzETdbOQbqdFegXghDLM5ULAj23/StIHAWwfAJwMbA5cDTwMTDoubkQ8yfZJkhYGTqF62MJbbP9lGrtMzkYMrkcknQasYPvFktYG3mT7y3UHFjHq2nmyYdf6XNq+FlhnnOUHNLw28JFW9xkRIGk/nnp1aGngGqqW4o77TCZnIwbaQcCngO8D2L5E0o+BFNIRPdZOi/TPJW1m+374Z5/LY4AXQ2d9LiOi65oflJJ+khGjb3Hb5zU9zPDxuoKJmEnaKaS72ucyIrrP9uHwzz7Nj9qeX+YXBBapM7aI6Jk7y9MMx55s+DYg9ydE9EHLhXQP+lxGRO+cBmwKPFjmF6PK3VfWFlFE9MpHqIaUXEPSzcB1pKFrRvrrXi+pO4ShM/uLl07r/VMW0r3qcxkRPbWo7bEiGtsPSlq8zoAiovvK1aYP2960cXSduuOKmClaaZFOn8uI4fOQpJfZvhBA0supxpqNiBFie76kDcvrh+qOJ2KmmbKQTp/LiKG0K3CspFuohqZ8DrBtrRFFRK/8SdKJVKNn/bOYtn18fSFFzA
zt3GyYPpcRQ8L2+ZLWoLoxGOBK2/+oM6aI6JlFgbuAjRuWGUghHdFj7RTS6XMZMeAkbWz7dElbN616UbmnISfWiBFje9KHH0na3fZ/9iueiJmknUI6fS4jBt+rgdOBLcdZlxaqiJlpGyCFdEQPtFNI70r6XEYMNNt7lJ95PHdEjNHUm0REJ9oZRzp9LiMGnKRPTLbe9jf7FUtEDAxPvUlEdKKVcaTT5zJieCw1ybqcTCNmprRIR/RIKy3S6XMZMSRsfwlA0uHALrbvLfPLAPvUGFpE9IikZW3f3bTsebavK7PH1hBWxIzQyjjS6XMZMXzWHiuiAWzfI2ndGuOJiN75uaTNbN8PIGlN4BjgxQC2v9r8BkmrAEcAK1A1ih1oe9/+hRwxGlrp2pE+lxHDZwFJy9i+B6oWK9q7uTgihsdXqYrpN1Ldx3QEsMMU73kc+KTtCyUtBVwg6VTbl/c41oiR0sqJtet9Llv5JixpI+BnwNilqeNt79XJ8SJmoH2AP0gau6S7DfCVTnaUfI0YbLZPkrQw1UPSlgLeYvsvU7znVuDW8voBSVcAKwEppCPa0ErXjl70uWz1m/BZtrfo8BgRM5btIyTN48knnW09jZam5GvEAJK0H09t0FoauAb4aBkM4OMt7mcOsC5w7jjrdgZ2Bpg9e/Z0Q44YOe1c6u1an8t8E47ovVLoTjunkq8RA2te0/wF7e5A0pLAT4Fdx/pYN7J9IHAgwNy5czPyT0STdgrpnvS5nOybMLCBpIuBW4DdbF82zvvzbTmiT6abr2UfydmILrB9OICkJYBHbc8v8wsCi0z1/tId5KfAkRnKNqIzC7Sx7Vify70l7Q2cA/zXdA4+xTfhC4FVba8D7Af873j7sH2g7bm2586aNWs64UTEJLqRr5CcjeiB04DFGuYXA34z2RskCTgYuCKDBkR0ruVC2vYRwNbAbWXa2vYPOz3wVN+Ebd9v+8Hy+mRgYUnLd3q8iOhc8jVioC06ln8A5fXiU7znVcC7gI0lXVSmzXsZZMQoaqtrRrf6XLbyTVjSc4DbbFvSelRF/13TPXZEtCf5GjHwHpL0MtsXAkh6OfDIZG+wfTZ54mHEtNU1ruzYN+FLJV1Uln0OmA1g+wDgbcCHJD1O9QdhO9u50SGi/5KvEYNtV+BYSbdQFcfPAbatNaKIGaKWQrqVb8K29wf2709EETGR5GvEYLN9vqQ1qB7GAnCl7X/UGVPETJEnnUVERAwhSRvbPl3S1k2rXlTGkc5IHBE9lkI6IiJiOL0aOB3Ycpx1BlJIR/RYCumIiIghZHuP8nPHumOJmKlSSEdERAwhSZ+YbH3Gh47ovRTSERERw2mpSdZl1JyIPkghHRERMYRsfwlA0uHALrbvLfPLUD2NOCJ6rJ1HhEdERMTgWXusiAawfQ+wbn3hRMwcKaQjIiKG2wKlFRoAScuSK84RfZFEi4iIGG77AH+QdGyZ3wb4So3xRMwYKaQjIiKGmO0jJM0DNi6LtrZ9eZ0xRcwUKaQjIiKGXCmcUzxH9Fn6SEdEREREdCCFdEREREREB1JIR0RERER0IIV0REREREQHUkhHRERERHSgtkJa0hskXSnpakmfHWf9IpKOLuvPlTSnhjAjokjORoyWqXI6IqZWSyEtaUHgu8BmwJrA9pLWbNpsJ+Ae2y8AvgV8vb9RRsSY5GzEaGkxpyNiCnW1SK8HXG37Wtt/B44CtmraZivg8PL6OGATSepjjBHxpORsxGhpJacjYgp1PZBlJeDGhvmbgPUn2sb245LuA5YD7mzcSNLOwM5l9kFJV/Yk4mjF8jT9+8TE9I33dHN3q3ZzZ+NIzo6e5Gubhixnp9JKTo9avg7k//ku/78aVAP52QOwR0vtPRPm69A/2dD2gcCBdccRIGme7bl1xxGDLTk7GJKv0YpRytf8n6/PKH/2dXXtuBlYpWF+5bJs3G0kLQQsDdzVl+giollyNmK0tJLTETGFugrp84EXSnqepGcA2w
EnNm1zIjB2veNtwOm23ccYI+JJydmI0dJKTkfEFGrp2lH6T34U+DWwIHCI7csk7QXMs30icDDwQ0lXA3dTJXkMtpG4/BdPl5wdScnXGWyinK45rF7L//n6jOxnrzQYRURERES0L082jIiIiIjoQArpiIiIiIgOpJCOiIiIiOhACumIiIiI6BpJa0jaRNKSTcvfUFdMvZJCOrpO0o51xxARETGenKN6S9LHgZ8BHwP+LKnx0fNfrSeq3smoHdF1kv5qe3bdcURERDTLOaq3JF0KbGD7QUlzgOOAH9reV9KfbK9bb4TdNfSPCI96SLpkolXACv2MJSImJ+k5wB7AE8AXqVqK3gpcAexi+9Yaw4voupyjarWA7QcBbF8vaSPgOEmrUn3+IyWFdHRqBeD1wD1NywWc0/9wImIShwEnAUsAvwWOBDYH3gwcAGw10RsjhlTOUfW5TdJLbV8EUFqmtwAOAV5Sa2Q9kEI6OvULYMmxRGkk6Yy+RxMRk1nB9n4Akj5s++tl+X6SdqoxroheyTmqPu8GHm9cYPtx4N2Svl9PSL2TPtIRESNO0sW21ymvv2z7Cw3rLrU9cq1EERH9kFE7IiJG38/GhqFqKqJfAFxZW1QREUMuLdIRETOYpB1tH1p3HBERwygt0jEuSW3djCFpI0m/6FU8EdEzX6o7gIiZJufY0ZGbDWNctl9ZdwwR0R0ZCixisOQcOzrSIh3jkvRg+bmRpDMkHSfp/yQdKUll3RvKsguBrRveu4SkQySdJ+lPY081krSvpC+W16+XdKak/B+M6L0VqO6k33Kc6a4a44qYkXKOHR1pkY5WrAusBdwC/B54laR5wEHAxsDVwNEN238eON32+yQ9CzhP0m+A3YHzJZ0FfAfY3PYT/fs1ImasDAUWMbhyjh1i+aYSrTjP9k0lIS8C5gBrANfZvsrVHas/atj+dcBnJV0EnAEsCsy2/TDwAeBUYH/b1/TtN4iYwWzvZPvsCda9o9/xRMRT5Bw7xNIiHa14rOH1fKb+fyPgrbbHG1brJVSXklfsUmwRERHDLOfYIZYW6ejU/wFzJK1W5rdvWPdr4GMN/bzWLT9XBT5JdRlrM0nr9zHeiIiIYZFz7JBIIR0dsf0osDNwUrkR4vaG1XsDCwOXSLoM2Lsk/MHAbrZvAXYCfiBp0T6HHjFyMpRWxGjJOXZ45IEsEREzjKSNqE64W9QcSkTEUEuLdETEkMtQWhER9cjNhhERoyVDaUVE9ElaFyIiRkuG0oqI6JO0SEdEjJYMpRUR0SdpkY6IGH0ZSisiogdSSEdEjLgMpRUR0RsZ/i4iIiIiogNpkY6IiIiI6EAK6YiIiIiIDqSQjoiIiIjoQArpiIiIiIgOpJCOiIiIiOhACumIiIiIiA6kkI6IiIiI6MD/D+6OH9D/ve96AAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:28:13.084501Z", + "start_time": "2020-11-13T15:28:13.062561Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "176 3485\n", + "182 3480\n", + "179 3463\n", + "178 3458\n", + "174 3456\n", + "183 3432\n", + "184 3427\n", + "173 3414\n", + "180 3403\n", + "177 3391\n", + "170 3387\n", + "187 3355\n", + "169 3352\n", + "185 3348\n", + "175 3346\n", + "181 3330\n", + "186 3328\n", + "189 3327\n", + "171 3327\n", + "172 3322\n", + "165 3308\n", + "188 3288\n", + "167 3269\n", + "190 3261\n", + "192 3257\n", + "168 3248\n", + "193 3225\n", + "166 3199\n", + "191 3182\n", + "194 3164\n", + " ... \n", + "601 1\n", + "857 1\n", + "1977 1\n", + "1626 1\n", + "697 1\n", + "1720 1\n", + "696 1\n", + "706 1\n", + "592 1\n", + "1605 1\n", + "586 1\n", + "582 1\n", + "1606 1\n", + "972 1\n", + "716 1\n", + "584 1\n", + "1608 1\n", + "715 1\n", + "841 1\n", + "968 1\n", + "964 1\n", + "587 1\n", + "1099 1\n", + "1355 1\n", + "711 1\n", + "845 1\n", + "710 1\n", + "965 1\n", + "847 1\n", + "1535 1\n", + "Name: words_count, Length: 866, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_df['words_count'].value_counts()" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:28:59.029535Z", + "start_time": "2020-11-13T15:28:58.816106Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "461\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "print(item_df['category_id'].nunique()) # 461个文章主题\n", + "item_df['category_id'].hist()" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(364047, 4)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_df.shape # 364047篇文章" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 新闻文章embedding向量表示" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
article_idemb_0emb_1emb_2emb_3emb_4emb_5emb_6emb_7emb_8...emb_240emb_241emb_242emb_243emb_244emb_245emb_246emb_247emb_248emb_249
00-0.161183-0.957233-0.1379440.0508550.8300550.901365-0.335148-0.559561-0.500603...0.3212480.3139990.6364120.1691790.540524-0.8131820.286870-0.2316860.5974160.409623
11-0.523216-0.9740580.7386080.1552340.6262940.485297-0.715657-0.897996-0.359747...-0.4878430.8231240.412688-0.3386540.3207860.588643-0.5941370.1828280.397090-0.834364
22-0.619619-0.972960-0.207360-0.1288610.044748-0.387535-0.730477-0.066126-0.754899...0.4547560.4731840.377866-0.863887-0.3833650.137721-0.810877-0.4475800.805932-0.285284
33-0.740843-0.9757490.3916980.641738-0.2686450.191745-0.825593-0.710591-0.040099...0.2715350.0360400.480029-0.7631730.0226270.565165-0.910286-0.5378380.243541-0.885329
44-0.279052-0.9723150.6853740.1130560.2383150.271913-0.5688160.341194-0.600554...0.2382860.8092680.427521-0.615932-0.5036970.614450-0.917760-0.4240610.185484-0.580292
\n", + "

5 rows × 251 columns

\n", + "
" + ], + "text/plain": [ + " article_id emb_0 emb_1 emb_2 emb_3 emb_4 emb_5 \\\n", + "0 0 -0.161183 -0.957233 -0.137944 0.050855 0.830055 0.901365 \n", + "1 1 -0.523216 -0.974058 0.738608 0.155234 0.626294 0.485297 \n", + "2 2 -0.619619 -0.972960 -0.207360 -0.128861 0.044748 -0.387535 \n", + "3 3 -0.740843 -0.975749 0.391698 0.641738 -0.268645 0.191745 \n", + "4 4 -0.279052 -0.972315 0.685374 0.113056 0.238315 0.271913 \n", + "\n", + " emb_6 emb_7 emb_8 ... emb_240 emb_241 emb_242 \\\n", + "0 -0.335148 -0.559561 -0.500603 ... 0.321248 0.313999 0.636412 \n", + "1 -0.715657 -0.897996 -0.359747 ... -0.487843 0.823124 0.412688 \n", + "2 -0.730477 -0.066126 -0.754899 ... 0.454756 0.473184 0.377866 \n", + "3 -0.825593 -0.710591 -0.040099 ... 0.271535 0.036040 0.480029 \n", + "4 -0.568816 0.341194 -0.600554 ... 0.238286 0.809268 0.427521 \n", + "\n", + " emb_243 emb_244 emb_245 emb_246 emb_247 emb_248 emb_249 \n", + "0 0.169179 0.540524 -0.813182 0.286870 -0.231686 0.597416 0.409623 \n", + "1 -0.338654 0.320786 0.588643 -0.594137 0.182828 0.397090 -0.834364 \n", + "2 -0.863887 -0.383365 0.137721 -0.810877 -0.447580 0.805932 -0.285284 \n", + "3 -0.763173 0.022627 0.565165 -0.910286 -0.537838 0.243541 -0.885329 \n", + "4 -0.615932 -0.503697 0.614450 -0.917760 -0.424061 0.185484 -0.580292 \n", + "\n", + "[5 rows x 251 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_emb_df.head()" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(295141, 251)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_emb_df.shape" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsgAAAFgCAYAAACmDI9oAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAA8qUlEQVR4nO3dd5gldZn//fcHRIligFUJwygqLiqKoqj4W1mMIBgwIKKu6MruzwSPGfUR8+qusGJYVwwERSWIjwFXxTAq6IIDYgDEACJJBRRJSryfP6pazzQ93ed0n9N1zvT7dV119al8d0/f87276lvfSlUhSZIkqbFW1wFIkiRJ48QCWZIkSephgSxJkiT1sECWJEmSelggS5IkST0skCVJkqQeFsiSBCR5fpKTe+avSXKPOfZZnqSS3GaB5/51kscs5BjtceaMWZI0NwvkEemysR0WG1stZVW1YVWd13UcgxhWzEkem+RbSa5OckWSM5O8Nsm6w4hTmmRrQvuuuVkgL5Kl3Nh2KcnOSS7qOg5pUiR5BnA88Clgq6q6M7AXsAWw5Wr2sdHXkrUmtJW6NQtkzUuStbuOQZqvJFsmOSHJZe0V0g/MsE0luWf7eb0kBye5IMmfkpycZL0Z9nla213ifnOc/7ntsa5I8oZp69ZK8rokv2rXH5vkTu26/0ny0mnb/yjJnoPEnORhSb6X5Mp2/53b5QEOAd5aVR+pqj8AVNW5VfWyqvpFu92bkxyf5JNJrgKen2SzJF9I8ockv0zyop4Yj0jy9p75Vf5wbX9mByY5O8kfkxzu1WpJXbJAHoIxaGxnbOzadSuSvC3JKe3t0q8l2aRdN0hje0SSDyX5cpJrgX9M8vft8a9MclaSJ/Uc54gkH0xyYnveU5NsPe3n8eIkv2jXvy3J1u33cVVbFNy2Z/vd09zmvbLdZruedb9O8qokP25/nsckWTfJBsD/AJuluQV2TZLNZvtZas2X5o+7LwEXAMuBzYHPzLHbe4AHA48A7gS8Brhl2nH3Bd4NPKaqfjrL+bcFPgQ8F9gMuDPN1dkpLwOeAjyqXf9H4IPtuk8De0871lbAif3GnGTzdvu3t8tfBXw2yabANm0sn53lZzHlyTRXmu8AHE3zM7yojfnpwDuT7NLHcabsAzwe2Bq4N/DGAfaVRmIM2vcnte3rlW17+/c9616b5OK2DT03yaOH8T2rVVVOC5iAtYEfAf8JbACsCzwSeD5wcs92Bdyz/fxBYAVNw7w2TQN2O5rGuoDbAPsCv5zaZ5bzbw5cAexG8wfPY9v5Tdv1K4Bf0TQ467Xz72rXPQ84pedY2wJXArebIeYjgD8BO7Xn2aiN7/XAbYFdgKuBbXq2vwJ4aPv9HA18ZtrP4/PA7YH7AtcD3wDuAWwMnA38U7vt9sDvgR3bn9c/Ab/uifPXwGk0DfOdgHOAf23X7Qxc1PXvidP4TMDDgcuA20xbPmPOtr/vfwYeMMOxpnL2Ve3v7BZ9nP9N03JhA+AGmsKa9vf30T3r7wbc2ObRRsC1NF0fAN4BfHzAmF8LfGLasq+2efXI9hjr9qz7TPv/wnXAc9tlbwa+07PNlsDNwEY9y/4NOKL9fATw9p51q+Rlm8P/2jO/G/Crrn9XnJb2RPft+73bfH8s
sA7NH7m/pGlztwEuBDZrt10ObN31z2xNmryCvHAPpSnMXl1V11bVX6rq5NVtnGQt4AXA/lV1cVXdXFXfq6rrezY7AHg1sHNV/XKO8z8H+HJVfbmqbqmqk4CVNA3MlMOr6udV9WfgWOCB7fLPAQ9MslU7vw9wwrRYen2+qk6pqlvaY2xIU2zfUFXfpLkqt3fP9p+rqtOq6iaaAvmB047371V1VVWdBfwU+FpVnVdVf6K58rt9u91+wIer6tT253UkTUH9sJ5jva+qLqnmlvAXZziXNGVL4IL297Ifm9A0jL+aZZtXAx+sqn76u29G07ABUFXX0vwxOWUr4HPtFaMraQrmm4G7VNXVNFd/n9VuuzdNbg0S81bAM6aO357jkTSF+FQcd+uJ71lVdQfgDJoGf8qFPZ83A/7QxjflApoioV+9x7ugPabUpa7b972AE6vqpKq6keau0Ho0RffNNIX3tknWqapfV9Vs/0dpQBbIC9d1YztbYzfltz2fr6MpbBmgsZ0yvUG8sC2Wp0xvEGc8b4/f9Xz+8wzzU9tvBbxy2ve4Jas2oHOdS5pyIbAs/T9YdjnwF5pb/6vzOOCNSZ7Wx/EupedhtyTr03Sz6I1v16q6Q8+0blVd3K7/NLB3kofT/F/yrQFjvpDmCnLv8TeoqncB5wIXA3v28X1Uz+dLgDsl2ahn2bL2WNBcBVu/Z91dZzhe7wOAy9pjSl3qun3fjKZdBaBtby8ENm+L6wNo7ub8Psln7EI4XBbIC9d1YztbY9ePfhrbKdMbxC3bv5in9DaIw3Qh8I5p3+P6VfXpPvatuTfREnMaTZH6riQbtP3Vd1rdxm2j9HHgkDQPoq2d5OFJbtez2VnAE4AP9vbFX43jgd2TPLLtZ/9WVv2/+L+Bd0zd2UmyaZIn96z/Ms0fjW8Fjpn2R2o/MX8S2CPJ49vl66Z5aG6Ldr9XAgcleVGSO6ZxL+Aus/yMLgS+B/xbe7ztgBe25wI4E9gtyZ2S3JWmYZ/uJUm2SPNA4huAY2b7IUqLoOv2/RKaXAf++hDtlrTtbFV9qqoe2W5TNM9AaEgskBeu68Z2tY1dn/HP2diuxqk0V2pfk2SdNA8G7sHcDzvNx0eAf02yY9tYb5DkidOuVq3O74A7J9l4BHFpAlXVzTS/q/cEfkPzYNlec+z2KuAnwA+AP9A0RKv8/1lVPwJ2Bz6SZNdZzn8W8BKaYdQupXkIr/dq0qHAF4CvJbka+F+a/vdT+18PnAA8pj3GQDG3xeyTaZ4fuIymCHj11PdTVccAz6TpvnUhTaN/LHAYcNws59ubph/kJTTdtw6qqq+36z5B05fz18DXmLn4/VS77jyaK3Bvn2EbaTF13b4fCzwxyaOTrEPzx+v1wPeSbJNkl/bYf6G569pv+61+dNX5eU2aaK6c/n80/fcuB97H7J341wPeS/NX4J+A77TLlrfb3abdbgeaAm/XOc6/I/BtmkbwMppuE8vadSuAf+7ZdpW42mUfa8/7kGnLpz+k9/Zp6+/bnvdPNA8oPbVn3Srbc+uHcv567Hb+ZOD5PfNvBz7aM/8Emob+Spr/sI6jfSCIptF9TM+2bwY+2TP/8fbf5kraBxqcnJzGZ5qew05O4zKNQfv+1LZ9/VPb3t63Xb4dTQF/ddv2f8n2bbhT2h+0JEmdSPJrmj/kvz7XtpK0GOxiIUlDlmSf/G3s7d7prK5jkyTNzSvIEyDJPsCHZ1h1QVXdd7HjkSRJC2f7Pr4skCVJkqQe/Q5dsig22WSTWr58eddhSGPl9NNPv7yqNu06jpmYs9KqzFdpsqwuZ8eqQF6+fDkrV67sOgxprCS5YO6tumHOSqsyX6XJsrqc9SE9SZIkqcfIC+Qkd0hyfJKfJTmnfWObpDFkvkqTxZyVRmMxulgcCnylqp7evlZ1/UU4p6T5MV+lyWLOSiMw0gK5fb3vP9C8dYaqugG4YZTnlDQ/5qs0WcxZaXRGfQX57jSvPj48yQOA04H9q+ra
qQ2S7AfsB7Bs2bKhnvzBrz5qqMeT+nH6fzyv6xDma858BXNWa5YJzlewjdUStFg5O+o+yLcBHgR8qKq2B64FXte7QVUdVlU7VNUOm246liPjSEvFnPkK5qw0RmxjpREZdYF8EXBRVZ3azh9Pk8ySxo/5Kk0Wc1YakZEWyFX1W+DCJNu0ix4NnD3Kc0qaH/NVmizmrDQ6izGKxcuAo9una88D9l2Ec0qaH/NVmizmrDQCIy+Qq+pMYIdRn0fSwpmv0mQxZ6XR8E16kiRJUg8LZEmSJKmHBbIkSZLUwwJZkiRJ6tF3gZzkE/0skzQ+ktw+yUZdxyFJ0iQZ5AryfXtnkqwNPHi44UgahiQPSfIT4MfAT5P8KIn5KklSH+YskJMcmORqYLskV7XT1cDvgc+PPEJJ8/Ex4MVVtbyqtgJeAhzecUySJE2EOQvkqvq3qtoI+I+qun07bVRVd66qAxchRkmDu7mqvjs1U1UnAzd1GI8kSROj7xeFVNWBSTYHturdr6q+M4rAJC3It5N8GPg0UMBewIokDwKoqjO6DE6SpHHWd4Gc5F3As2je835zu7gAC2Rp/Dyg/XrQtOXb0+TtLosbjiRJk2OQV00/Fdimqq4fVTCShqOq/rHrGCRJmlSDFMjnAesAFsjSmEvyppmWV9VbFzsWSZImzSAF8nXAmUm+QU+RXFUvH3pUkhbq2p7P6wK7A+d0FIskSRNlkAL5C+0kacxV1cG980neA3y1o3AkSZoog4xicWSS9YBlVXXuCGOSNHzrA1t0HYQkSZNgkFdN7wGcCXylnX9gEq8oS2MoyU+S/LidzgLOBd7bcViSJE2EQbpYvBl4KLACoKrOTHKPEcQkaeF27/l8E/C7qvJFIZIk9aHvK8jAjVX1p2nLbhlmMJKGo6ouAO4A7EEzROO2nQYkSdIEGaRAPivJs4G1k9wryfuB740oLkkLkGR/4Gjg79rp6CQv6zYqSZImwyAF8suA+9IM8fZp4CrggBHEJGnhXgjsWFVvqqo3AQ8DXtRxTJIkTYRBRrG4DnhDO0kab+Fvr4Sn/ZyOYpEkaaL0XSAn2QF4PbC8d7+q2m74YUlaoMOBU5N8rp1/CvCx7sKRJGlyDDKKxdHAq4Gf4MN50thKshbwvzQjzjyyXbxvVf2ws6AkSZoggxTIl1WV4x5LY66qbknywaraHjij63gkSZo0gxTIByX5KPANmgf1AKiqE4YelaSF+kaSpwEnVFV1HYwkSZNkkAJ5X+A+wDr8rYtFARbI0vj5F+AVwE1J/kLzgF5V1e27DUuSpPE3SIH8kKraZmSRSBqaqtqo6xgkSZpUgxTI30uybVWdPbJoJC1IkrWB9arqmnb+YcBt29U/rKqrOwtOkqQJMUiB/DDgzCTn0/RBnrplO+cwb22jvRK4uKp2n1ekkvrxbuD3wL+3858GfgqsS/PA3mvnOoD5Kk0O81UajUEK5Ccs4Dz7A+cA9n+URuvRwEN65q+sqj2SBPhun8cwX6XJYb5KI9D3q6ar6gLgIuBGmofzpqZZJdkCeCLw0XnGKKl/a1XVTT3zr4XmVg+w4Vw7m69St5LcMUlfL+AyX6XR6btATvIy4HfAScCJ7fSlPnZ9L/AaVvNykST7JVmZZOVll13WbziSZnbbJH99QK+qvgaQZGOabhZzeS+z5Gt7LHNWGqIkK5LcPsmdaLpCfSTJIX3s+l7MV2kk+i6QaW7jbFNV962q+7fTrH/lJtkd+H1Vnb66barqsKraoap22HTTTQcIR9IMPgIck2TZ1IIkW9H0RZ71KlM/+QrmrDQCG1fVVcCewFFVtSPwmNl2MF+l0RqkD/KFwJ8GPP5OwJOS7EZz9er2ST5ZVc8Z8DiS+lBVhyS5Djg5yQY0D9NeDbyrqj40x+7mq9SN2yS5G/BM4A197mO+SiM0SIF8HrAiyYms+ia91d4GqqoDgQMBkuwMvMrklUarqv4b+O+prhb9Du1mvkqdeSvwVeCUqvpBknsA
v5htB/NVGq1BCuTftNNt+du4qpLGUJK7AO8ENgN2TbIt8PCq+li3kUmarqqOA47rmT8PeFp3EUnqu0CuqrcAJNmwnb9mkBNV1QpgxSD7SJq3I4DD+dvt2p8DxwB9Fcjmq7R42tEo3k/TbQKaIRn3r6qL+tnffJWGb5BRLO6X5IfAWcBZSU5Pct/RhSZpATapqmNpn25vh367uduQJK3G4cAXaO74bAZ8sV0mqSODjGJxGPCKqtqqqrYCXknzxLyk8XNtkjvTjlXevnJ60IdsJS2OTavq8Kq6qZ2OABxyQurQIH2QN6iqb03NVNWK9il5SePnFTRXpLZOcgpNY/v0bkOStBpXJHkOzXCMAHsDV3QYj7TkDTSKRZL/F/hEO/8cmpEtJI2ZqjojyaOAbWiGeju3qm7sOCxJM3sBTR/k/6S56/M9YN9OI5KWuEG6WLyA5irUCcBngU3aZZLGTJKXABtW1VlV9VNgwyQv7jouSbdWVRdU1ZOqatOq+ruqekpV/WZqfZIDu4xPWor6KpCTrA2cUFUvr6oHVdWDq+qAqvrjiOOTND8vqqorp2baXH1Rd+FIWoBndB2AtNT0VSBX1c3ALUk2HnE8koZj7SSZmmn/yHX8cmkyZe5NJA3TIH2QrwF+kuQk4NqphVX18qFHJWmhvgIck+TD7fy/tMskTZ7qOgBpqRmkQD6hnSSNv9fSFMX/t50/Cfhod+FIWgCvIEuLbJA36R05ykAkDU9V3QJ8qJ0kjbEkd6qqP0xbdveqOr+dPW6G3SSN0JwFcpJjq+qZSX7CDLd5qmq7kUQmaWDmqzSRvphk16q6CiDJtsCxwP0AquqdXQYnLUX9XEHev/26+ygDkTQU5qs0ed5JUyQ/kWbs8qOAfboNSVra5iyQq+rS9uPTgM9U1SWjDUnSfJmv0uSpqhOTrAN8DdgIeGpV/bzjsKQlbZCH9DYCTkryB+AY4Liq+t1owpK0QOarNOaSvJ9Vu0JtDPwKeGkSR4mSOjTIQ3pvAd6SZDtgL+DbSS6qqseMLDpJ82K+ShNh5bT50zuJQtKtDHIFecrvgd8CVwB/N9xwJA2Z+SqNqanRoZJsAPylfSnX1It9btdlbNJS19eb9ACSvDjJCuAbwJ1pXmXrE/HSGDJfpYnyDWC9nvn1gK93FIskBruCvCVwQFWdOaJYJA2P+SpNjnWr6pqpmaq6Jsn6XQYkLXV9X0GuqgNpXjW9WZJlU9MIY5M0T22+bphkX4Akmya5e8dhSZrZtUkeNDWT5MHAnzuMR1ry+r6CnOSlwJuB3wG3tIsL8LatNGaSHATsQDOm6uHAOsAngZ26jEvSjA4AjktyCc1rpe9K83CtpI4M0sXiAGCbqrpiRLFIGp6nAtsDZwBU1SVJNuo2JEkzqaofJLkPzR+0AOdW1Y1dxiQtdYMUyBcCfxpVIJKG6oaqqiQFf31KXtIYSbJLVX0zyZ7TVt27HQf5hE4CkzRQgXwesCLJicD1Uwur6pChRyVpoY5N8mHgDkleBLwA+EjHMUla1aOAbwJ7zLCuAAtkqSODFMi/aafbtpOkMVVV70nyWOAqmtu2b6qqkzoOS1KPqjqo/bpv17FIWtWgb9IjyfpVdd3oQpI0DG1BbFEsjakkr5htvXdope4MMorFw4GPARsCy5I8APiXqnrxqIKTNJgkV9Pcmp1RVd1+EcORNLvZHpxdbR5LGr1Buli8F3g88AWAqvpRkn8YRVCS5qeqNgJI8jbgUuATNMNG7QPcrcPQJE3Tc2f2SGD/qrqynb8jcHCHoUlLXt8vCgGoqgunLbp5iLFIGp4nVdV/VdXVVXVVVX0IeHLXQUma0XZTxTFAVf2RZphGSR0ZpEC+MMkjgEqyTpJXAefMtkOSLZN8K8nZSc5Ksv+CopXUr2uT7JNk7SRrJdkHuHa2HcxXqTNrtVeNAUhyJ/q4w2vOSqMzSBeLfwUOBTYHLga+Brxkjn1uAl5ZVWe0Lyk4PclJVXX2vKKV1K9n
0+TroTR9GU9pl83GfJW6cTDw/STHtfPPAN7Rx37mrDQig4xicTlNP8YZJTmwqv5t2j6X0vSDpKquTnIOTYFt8kojVFW/ZpYuFearND6q6qgkK4Fd2kV79lPkmrPS6AzUB3kOz5htZZLlNH2qTp22fL8kK5OsvOyyy4YYjqRZzCtf23XmrDRkVXV2VX2gnQYucG1jpeEaZoGc1a5INgQ+CxxQVVf1rquqw6pqh6raYdNNNx1iOJJmMa98BXNWGje2sdLwDbNAnnHMxiTr0CTu0b5XXhob5qu0BjBnpdEY6RXkJKF5ucg5vhFIGivmqzThzFlpdPoukNthZ6Yvu3vP7HHT1wM7Ac8FdklyZjvtNniYkgZhvkpLgjkrjcggw7x9McmuU/2bkmwLHAvcD6Cq3jl9h6o6mVn6OkoaGfNVWsOZs9LoDNLF4p00je6GSR5McwXqOaMJS9ICma+SJM3TIOMgn9g+DPA1YCPgqVX185FFJmnezFdJkuavn1dZvp9Vn3jfGPgV8NIkVNXLRxWcpMGYr5IkLVw/V5BXTps/fRSBSBoK81WSpAWas0CuqiMBkmwA/KWqbm7n1wZuN9rwJA3CfJUkaeEGeUjvG8B6PfPrAV8fbjiShsR8lSRpngYpkNetqmumZtrP6w8/JElDYL5KkjRPgxTI1yZ50NRMO3TUn4cfkqQhMF8lSZqnQV4UcgBwXJJLaAYmvyuw1yiCkrRgB2C+SpI0L4OMg/yDJPcBtmkXnVtVN44mLEkLYb5KkjR//YyDvEtVfTPJntNW3bsdV/WEEcUmaUDmqyRJC9fPFeRHAd8E9phhXQE2uNL4MF8lSVqgfsZBPqj9uu/ow5G0EOarJEkL108Xi1fMtr6qDhleOJIWwnyVJGnh+ulisdEs62pYgUgaCvNVkqQF6qeLxVsAkhwJ7F9VV7bzdwQOHml0kgZivkqStHCDvChku6nGFqCq/ghsP/SIJA2D+SpJ0jwNUiCv1V6FAiDJnRjsRSOSFo/5KknSPA3SYB4MfD/Jce38M4B3DD8kSUNgvkqSNE+DvEnvqCQrgV3aRXtW1dmjCUvSQpivkiTN30C3XNsG1kZWmgDmqyRJ8zNIH2RJkiRpjWeBLEmSJPWwQJYkSZJ6WCBLkiRJPSyQJUmSpB4WyJIkSVIPC2RJkiSpx8gL5CRPSHJukl8med2ozydp/sxXabKYs9JojLRATrI28EFgV2BbYO8k247ynJLmx3yVJos5K43OqK8gPxT4ZVWdV1U3AJ8Bnjzic0qaH/NVmizmrDQioy6QNwcu7Jm/qF0mafyYr9JkMWelEblN1wEk2Q/Yr529Jsm5Xcajv9oEuLzrICZR3vNPwz7kVsM+4EKYs2PLnJ0H81UdMV/nabFydtQF8sXAlj3zW7TL/qqqDgMOG3EcGlCSlVW1Q9dxaFHNma9gzo4rc3ZJso2dUObr+Bt1F4sfAPdKcvcktwWeBXxhxOeUND/mqzRZzFlpREZ6BbmqbkryUuCrwNrAx6vqrFGeU9L8mK/SZDFnpdFJVXUdg8ZQkv3aW3OSJoA5K00O83X8WSBLkiRJPXzVtCRJktTDAlmSJEnqYYEsSZIk9bBA1oySHNV1DJJmluShSR7Sft42ySuS7NZ1XJJmluQ+SR6dZMNpy5/QVUyanQ/piSTTx80M8I/ANwGq6kmLHpSkGSU5CNiVZpjOk4AdgW8BjwW+WlXv6DA8SdMkeTnwEuAc4IHA/lX1+XbdGVX1oA7D02p0/qppjYUtgLOBjwJFUyDvABzcZVCSZvR0mkb2dsBvgS2q6qok7wFOBSyQpfHyIuDBVXVNkuXA8UmWV9WhNO2txpBdLARNMXw68AbgT1W1AvhzVX27qr7daWSSprupqm6uquuAX1XVVQBV9Wfglm5DkzSDtarqGoCq+jWwM7BrkkOwQB5bFsiiqm6pqv8E9gXekOQDeHdBGlc3JFm//fzgqYVJNsYCWRpHv0vywKmZtljeHdgEuH9X
QWl29kHWrSR5IrBTVb2+61gkrSrJ7arq+hmWbwLcrap+0kFYklYjyRY0d35+O8O6narqlA7C0hwskCVJkqQedrGQJEmSelggS5IkST0skJeAJN8bcPudk3xpVPFImp05K00O83XNZIG8BFTVI7qOQVL/zFlpcpivayYL5CUgyTXt152TrEhyfJKfJTk6Sdp1T2iXnQHs2bPvBkk+nuS0JD9M8uR2+aFJ3tR+fnyS7yTx90kaAnNWmhzm65rJsW6Xnu2B+wKXAKcAOyVZCXwE2AX4JXBMz/ZvAL5ZVS9IcgfgtCRfBw4EfpDku8D7gN2qyjFYpeEzZ6XJYb6uIfxrZOk5raouahPtTGA5cB/g/Kr6RTXj/n2yZ/vHAa9LciawAlgXWNa+xetFwEnAB6rqV4v2HUhLizkrTQ7zdQ3hFeSlp/cFAzcz9+9AgKdV1bkzrLs/cAWw2ZBik3Rr5qw0OczXNYRXkAXwM2B5kq3b+b171n0VeFlPP6rt269bAa+kuZ20a5IdFzFeaakzZ6XJYb5OIAtkUVV/AfYDTmwfIPh9z+q3AesAP05yFvC2NpE/Bryqqi4BXgh8NMm6ixy6tCSZs9LkMF8nk6+aliRJknp4BVmSJEnqYYEsSZIk9bBAliRJknpYIEuSJEk9LJAlSZKkHhbIkiRJUg8LZEmSJKmHBbIkSZLUwwJZkiRJ6mGBLEmSJPWwQJYkSZJ6WCBLkiRJPSyQx1CS5yc5uWf+miT3mGOf5UkqyW1GH6Gk2UxqDifZJ8nXujq/1I9xzq8kOyX5RRvTU0Z5Lo2WxdQEqKoNu45hWJIsB84H1qmqmzoOR1oUk5LDVXU0cHTXcUiDGLP8eivwgao6tOtApkvyZuCeVfWcrmOZBF5B1tjxKrg0P+aONDp95tdWwFnDOn6StRd6DM2PBXLHkmyZ5IQklyW5IskHZtimktyz/bxekoOTXJDkT0lOTrLeDPs8Lcmvk9xvjvM/Msn3klyZ5MIkz2+Xb5zkqDauC5K8Mcla7bo3J/lkzzFWuXWVZEWStyU5JcnVSb6WZJN28++0X69sb0E9vL1ddkqS/0xyBfDWJH9Icv+ec/xdkuuSbDrIz1catS5zuCf3XpjkN8A32+UvSHJOkj8m+WqSrXr2eVySc9tz/1eSbyf553bd9FvXj0jyg3bbHyR5RM+62fJcGopJyq8kvwLuAXyxbd9u17alH0tyaZKLk7w9bdE7Q9v35iRHJPlQki8nuRb4xySbJfls+zM4P8nLe2J8c5Ljk3wyyVXA81fzvTwBeD2wVxvbj5I8I8np07Z7RZLPt5+PSPLfSU5qc/zb0/4vuU+77g/t/ynPXN3PchJZIHeoTZIvARcAy4HNgc/Msdt7gAcDjwDuBLwGuGXacfcF3g08pqp+Osv5twL+B3g/sCnwQODMdvX7gY1pkv1RwPOAffv81gCe3W7/d8BtgVe1y/+h/XqHqtqwqr7fzu8InAfcBXgbzc+h9zbQ3sA3quqyAWKQRqrrHO7xKODvgccneTJNQ7gnTV5/F/h0e9xNgOOBA4E7A+e2ccz0vd0JOBF4X7vtIcCJSe7cs9nq8lxasEnLr6raGvgNsEfbvl0PHAHcBNwT2B54HPDPPcfubfve0S57dvt5I+B7wBeBH7Xf/6OBA5I8vucYT6bJ6zuwmi5SVfUV4J3AMW1sDwC+ANw9yd/3bPpc4Kie+X1o2uRNaOqDowGSbACcBHyKJv+fBfxXkm1n/AlOoqpy6mgCHg5cBtxm2vLnAyf3zBdNcq0F/Bl4wAzHWt5u9yrgbGCLPs5/IPC5GZavDdwAbNuz7F+AFe3nNwOfnOHct2nnVwBv7Fn/YuArM23b8/3+ZloMO9L8R5N2fiXwzK7/zZyceqcxyOGpfe7Rs+x/gBf2zK8FXEdz6/d5wPd71gW4EPjn6XHTNJSnTTvf94Hnt59Xm+dOTsOYJi2/2vlf0xTe0BS91wPr9Wy/N/Ctnu9jett3BHBUz/yO
M2xzIHB4+/nNwHf6/Hm+mZ62u132IeAd7ef7An8EbtcTy2d6tt0QuBnYEtgL+O60Y30YOKjr35thTV5B7taWwAXV/8NqmwDrAr+aZZtXAx+sqov6PP9Mx9oEWIfmr/YpF9D89dqv3/Z8vo4msWZzYe9MVZ3a7rdzkvvQ/Of3hQHOLy2GrnN4Sm/+bAUcmqbb1JXAH2gK4c2BzXq3raZVW915NmPV/wPg1v8PDJrn0iAmLb+m24qmLb20Z/sP01xxnenYqzvfZlP7t8d4PU3xPdsx+nUk8Owkofmj+Nhqrnzf6thVdQ3N97tZG9eO0+LaB7jrAmIZK3bm7taFwLIkt+nzP4DLgb8AW9PcbpnJ44CvJPltVX22j/M/dDXnuZEmAc5uly0DLm4/Xwus37P9IAlRAyw/kqabxW+B46vqLwOcR1oMXefwlN78uZDmitCtbrUmuRewRc98euenuYTm/4Bey4Cv9BmTtFATlV8zuJDmCvIms8Q/U9s3/XznV9W9+oxvNrfarqr+N8kNwP+h6drx7GmbbDn1IcmGNN1WLmnj+nZVPbbPc08cryB36zTgUuBdSTZIsm6SnVa3cVXdAnwcOKTttL92mofcbtez2VnAE4APJnnSHOc/GnhMkmcmuU2SOyd5YFXdDBwLvCPJRm1f5VcAUw/mnQn8Q5JlSTamud3Tr8to+oPNOmZl65PAU2mK5KPm2FbqQtc5PJP/Bg5Mcl/46wO3z2jXnQjcP8lT0jxU+xJW/wful4F7J3l2+//DXsC2NH1CpcUwafk1PZ5Lga8BBye5fZK1kmyd5FEDnO804Ookr03zAOLaSe6X5CHziP13wPK0D9z3OAr4AHBjVZ08bd1uaR7mvy1NX+T/raoLaf4fuHeS5yZZp50eMq0/80SzQO5QW4juQdN94Dc0tzr3mmO3VwE/AX5Ac6vj3Uz7d6yqHwG7Ax9Jsuss5/8NsBvwyvZYZwIPaFe/jOZK8XnAyTQd8T/e7ncScAzwY+B0Bmgwq+o6mocPTmlvyzxslm0vBM6g+av3u/2eQ1osXefwamL6XHvMz6R5qv2nwK7tusuBZwD/DlxBU/CupLnKNf04V7QxvLLd9jXA7u0xpJGbtPxajefRPMB6Nk3/3uOBuw1wvpvbWB9I8w6By4GP0jxEP6jj2q9XJDmjZ/kngPvxt4tgvT4FHETzs3ww7cPzVXU1zdX4Z9FcUf4tzc/ldjMcYyJNPQAljaUkHwcuqao3dh2LtKZpryRdBOxTVd/qOh5Jiy/NMHi/Bx5UVb/oWX4EcNFSbX/tg6yxleate3vSDI0jaQja4aFOpXna/9U0Dxj9b6dBSerS/wV+0Fscyy4Wa7wk+6QZFHz6NK83/SyWJG+juXX1H1V1ftfxSF0ZQQ4/nOYp/8tpbl8/par+PLSApQkyqW3k6iT5n9V8P69fzfa/Bvan6UqlHnaxkCRJknp4BVmSJEnqMVZ9kDfZZJNavnx512FIY+X000+/vKo27TqOmZiz0qrMV2myrC5nx6pAXr58OStXruw6DGmsJJn+NrOxYc5KqzJfpcmyupwdeReLJHdIcnySnyU5J8nDR31OSfNjvkqTpX1xxA+T+AIXaYgW4wryocBXqurp7ZtY1p9rB0mdMV+lybI/cA5w+64DkdYkI72C3L6G+B+AjwFU1Q1VdeUozylpfsxXabIk2QJ4Is2b1SQN0aivIN8duAw4PMkDaF5LvH9VXTu1QZL9gP0Ali1bNtSTP/jVRw31eFI/Tv+P53UdwnzNma9gzmrNMsH5CvBemleAb7S6DUaZrxq937z1/l2HMLGWveknC9p/1H2QbwM8CPhQVW0PXAu8rneDqjqsqnaoqh023XQsH/yVloo58xXMWWkcJNkd+H1VnT7bduarND+jLpAvonmP96nt/PE0DbCk8WO+SpNjJ+BJ7ZvQPgPskuST3YYkrTlGWiBX1W+BC5Ns0y56NHD2KM8paX7MV2lyVNWBVbVFVS0HngV8
s6qe03FY0hpjMUaxeBlwdPtE/HnAvotwTknzY75Kkpa8kRfIVXUmsMOozyNp4cxXafJU1QpgRcdhSGuUkb8oRJIkSZokFsiSJElSDwtkSZIkqYcFsiRJktTDAlmSJEnqYYEsSVKHktw7yTeS/LSd3y7JG7uOS1rKLJAlSerWR4ADgRsBqurHNC//kNQRC2RJkrq1flWdNm3ZTZ1EIgmwQJYkqWuXJ9kaKIAkTwcu7TYkaWlbjFdNS5Kk1XsJcBhwnyQXA+cD+3QbkrS0WSBLktShqjoPeEySDYC1qurqrmOSljq7WEiS1KEkd07yPuC7wIokhya5c9dxSUuZBbIkSd36DHAZ8DTg6e3nYzqNSFri7GIhSVK37lZVb+uZf3uSvTqLRpJXkCVJ6tjXkjwryVrt9Ezgq10HJS1lFsiSJHXrRcCngOvb6TPAvyS5OslVnUYmLVF2sZAkqUNVtVHXMUhaVd9XkJPcf5SBSJK0FCX5bJLdknhXVxoTgyTjfyU5LcmLk2w8sogkSVpaPkTzYpBfJHlXkm26Dkha6voukKvq/9Ak8JbA6Uk+leSxI4tMkqQloKq+XlX7AA8Cfg18Pcn3kuybZJ3V7Zdk3fbC1Y+SnJXkLYsVs7SmG+h2TlX9Angj8FrgUcD7kvwsyZ6jCE6SpKWgfTHI84F/Bn4IHEpTMJ80y27XA7tU1QOABwJPSPKw0UYqLQ19P6SXZDtgX+CJNAm7R1WdkWQz4PvACaMJUdJ8JNkc2IqePK+q73QXkaSZJPkcsA3wCZq29dJ21TFJVq5uv6oq4Jp2dp12qlHGKi0Vg4xi8X7go8Drq+rPUwur6pIkbxx6ZJLmLcm7gb2As4Gb28UFWCBL4+cjVfXl3gVJbldV11fVDrPtmGRt4HTgnsAHq+rUaev3A/YDWLZs2XCjltZgfRXIbQJeXFWfmGn96pZL6sxTgG2q6vquA5E0p7cDX5627Ps0XSxmVVU3Aw9Mcgfgc0nuV1U/7Vl/GHAYwA477ODVZalPfRXIVXVzki2T3Laqbhh1UJIW7Dya260WyNKYSnJXYHNgvSTbA2lX3R5Yf5BjVdWVSb4FPAH46VzbS5rdIF0szgdOSfIF4NqphVV1yNCjkrRQ1wFnJvkGPUVyVb28u5AkTfN4mgfztgAO5m8F8lXA6+faOcmmwI1tcbwe8Fjg3aMJVVpaBimQf9VOawFTb/3xdo00nr7QTpLGVFUdCRyZ5GlV9dnVbZfkn9ptp7tbu//aNG3zsVX1pRGFKy0pgxTIZ1fVcb0LkjxjyPFIGoKqOjLJbYF7t4vOraobu4xJ0sxmK45b+wO3KpCr6sfA9iMJSlriBhkH+cA+l91KkrWT/DCJf9lKiyDJzsAvgA8C/wX8PMk/9Lmv+SqNl8y9iaRhmvMKcpJdgd2AzZO8r2fV7YGb+jzP/sA57T6SRu9g4HFVdS5AknsDnwYe3Me+5qs0XuzOKC2yfq4gXwKsBP5CM9bi1PQFmgcMZpVkC5qXi3x0/mFKGtA6U8UxQFX9nGZUi1mZr9JY8gqytMjmvIJcVT8CfpTkU/Psw/he4DX87cE+SaO3MslHgU+28/vQ/KE7l/divkqLJslawNOr6thZNjtlseKR1BikD/JDk5yU5OdJzktyfpLzZtshye7A76vq9Fm22S/JyiQrL7vssgHCkTSL/0vzFr2Xt9PZ7bLV6idf2+3MWWlIquoWmj9KZ9vmpYsUjqTWIKNYfAz4f2i6V9w8x7ZTdgKelGQ3YF3g9kk+WVXPmdrAt/xIw9e+Qe+QdurXnPnaHtuclYbr60leBRzDqu8Z+EN3IUlL2yAF8p+q6n8GOXhVHUg70kX7VP2rpje2koYnybFV9cwkP2GGB3uqarvV7Wu+Sp3Zq/36kp5lBdyjg1gkMViB/K0k/wGcwKpv5jpj6FFJmq/926+7dxqFpL5V1d27jkHSqgYpkHdsv+7Qs6yAXfrZuapWACsGOJ+kAVXVpe3XCxZ4nBWY
r9KiSLI+8ApgWVXtl+RewDa+FU/qTt8FclX94ygDkTQ8Sa7m1l0s/kQzksUrq2rWB2wlLarDaZ7veUQ7fzFwHGCBLHWk7wI5yZtmWl5Vbx1eOJKG5L3ARcCnaMZQfRawNXAG8HFg564Ck3QrW1fVXkn2Bqiq65I49rHUoUGGebu2Z7oZ2BVYPoKYJC3ck6rqw1V1dVVd1Y488fiqOga4Y9fBSVrFDUnWo73rk2Rrep71kbT4BulicXDvfJL3AF8dekSShuG6JM8Ejm/nn07zNkzwtbXSuDkI+AqwZZKjaYZcfH6nEUlL3CAP6U23PrDFsAKRNFT7AIcC/0VTEP8v8Jz2KpUvHZDGRPsmvTsCewIPo+kStX9VXd5pYNISN0gf5N5xVdcGNgXsfyyNofYhvD1Ws/rkxYxF0upV1S1JXtO+avrEruOR1BjkCnLvuKo3Ab+rqpuGHI+kIUhyb+BDwF2q6n5JtqPpl/z2jkOTdGu+SU8aM30/pNeOq3oHmqtSTwW2HVFMkhbuIzRvxbsRoKp+TDOShaTxsxfNW/S+QzPc2+k0QzJK6kjfBXKS/YGjgb9rp6OTvGxUgUlakPWr6rRpy7zjI42Ztg/y66rq7tMmXzMtdWiQLhYvBHasqmsBkrwb+D7w/lEEJmlBLm+HipoaNurpwKXdhiRpurYP8qtpuldIGhODFMihGf94ys3tMknj5yXAYcB9klwMnE8zsoWk8TOvPshJtgSOAu5C88fwYVV16CgDlZaKQQrkw4FTk3yunX8K8LGhRyRpQZKsDby4qh6TZANgraq6uuu4JK3WXu3Xl/QsK2CubhY30bw6/owkGwGnJzmpqs4eRZDSUjLIi0IOSbICeGS7aN+q+uFIopI0b1V1c5JHtp+vnWt7Sd2qqrvPc79LabtOVdXVSc4BNgcskKUFGmQc5IcBZ1XVGe387ZPsWFWnjiw6SfP1wyRfAI5j1Vu2J3QXkqSZJFkfeAWwrKr2S3IvYJuq+tIAx1gObA+cOm35fsB+AMuWLevrWA9+9VH9nlbTnP4fz+s6BA1J36NY0Iypek3P/DXtMknjZ13gCmAXmqEZ92DVscwljY/DgRuAR7TzFwN9j1meZEPgs8ABVXVV77qqOqyqdqiqHTbddNNhxSut8QZ6SK+qpt6kN/Xk7UJeVS1pRKpq39nWJzmwqv5tseKRNKutq2qvJHsDVNV1Sfp6CD7JOjTF8dHeIZKGZ5AryOcleXmSddppf+C8UQUmaaSe0XUAkv7qhiTr8bdhGbcGrp9rp7aI/hhwTlUdMtoQpaVlkAL5X2lu/1wMXATsSNuvSdLEcYhGaXwcBHwF2DLJ0cA3gNf0sd9OwHOBXZKc2U67jTBOackYZBSL3zPLq2q9ZStNlJp7E0mjlGSnqjqF5hXTewIPo/njdf+qunyu/avqZPxjVxqJQa4gz8VbttLksFGVuve+9uv3q+qKqjqxqr7UT3EsabSG+ZCdDa40JpLcafpbuJLcvarOb2eP6yAsSau6MclhwBZJ3jd9ZVW9vIOYJDHcAtlbttL4+GKSXaeGfEqyLXAscD+Aqnpnl8FJApqhFx8DPB44veNYJPXwCrK0ZnonTZH8RGAb4Chgn25DktSr7UrxmSTnVNWPuo5H0t8M8iY9b9lKE6KqTmzHR/0asBHw1Kr6ecdhSZrZn5N8A7hLVd0vyXbAk6qq75eFSBquQR7S+2KS20/NtLdsvzg17y1bqXtJ3p/kfW1/xl2AjYHzgZfO1MdR0lj4CHAgcCNAVf2YWUaNkjR6g3Sx8JatNP5WTpu3X6M0/tavqtOmvTzvpq6CkTTYOMjespXGXFUdCZBkA+AvVXVzO782cLsuY5O0Wpe3b8+bepPe04FLuw1JWtrmLJCTvJ9VR6jYGPgVzS1bh6GRxtM3aJ6Ov6adX4/mj9tHdBaRpNV5CXAYcJ8kF9N0i/IOrdShfq4ge8tWmjzrVtVUcUxVXZNk/S4DknRr7d2dF1fVY9o7
P2tV1dVdxyUtdXMWyAu5ZZtkS5q+ynehuQp9WFUdutCgJc3p2iQPqqozAJI8GPjzbDuYr9Liq6qbkzyy/Xxt1/FIagzykN58btneBLyyqs5IshFwepKTqurseUUrqV8HAMcluYRmjPK7AnvNsY/5KnXjh0m+QDNc6l+L5Ko6obuQpKVtkAJ54Fu2VXUp7YMGVXV1knOAzQEbXGmEquoHSe5DM+IMwLlVdeMc+5ivUjfWBa6gGZpxSgEWyFJHBimQB75l2yvJcmB74NRpy/cD9gNYtmzZAOFImi7JLlX1zSR7Tlt17/ah2r4a3NXla7vOnJWGqKr2nW19kgOr6t8WKx5JgxXIBzD4LVsAkmwIfBY4oKqu6l1XVYfRPL3LDjvsUDPsLql/jwK+Cewxw7q+rkjNlq9gzkodeAZggSwtokHGQR74li1AO3byZ4Gj7U8ljVZVHdR+nfWK1OqYr9JYytybSBqmfsZBnvct2zSvBfoYcE5VHbLAWCXNIckrZls/Wx6ar9LY8k6NtMj6uYK8kFu2OwHPBX6S5Mx22eur6suDBCmpbxvNsm6uRtZ8lcaTV5ClRdbPOMjzvmVbVSdjYkuLpqreApDkSGD/qrqynb8jcPAc+5qvUgeS3Kmq/jBt2d2r6vx29rgOwpKWtH66WMz7lq2kzmw3VRwDVNUfk2zfYTySVu+LSXadeig2ybbAscD9AKrqnTPtlOTjwO7A76vqfosVrLQUrNXHNhvNMm04utAkLcBa7VVjoLlCxWCj1khaPO+kKZI3bIdQPQ54Th/7HQE8YZSBSUtVP10s5n3LVlJnDga+n2Tq1uwzgHd0GI+k1aiqE9sRZL5Gc/HpqVX18z72+047ZrmkIRvkipK3bKUJUVVHJVnJ397MtaevjJbGS5L3s+rDsxsDvwJe2o4S9fIhnMMX+0jzMEiBvFaSO1bVH8FbttK4awtii2JpfK2cNn/6sE/gi32k+RmkwPWWrSRJQ1JVRwIk2QD4S1Xd3M6vDdyuy9ikpa6fh/SA5pYtsCfwu3bas6o+MarAJElaIr4BrNczvx7w9Y5ikcSAXSS8ZStJ0tCtW1XXTM1U1TVJ1p9rpySfBnYGNklyEXBQVX1sdGFKS4d9iCVJ6ta1SR5UVWcAtEO9/Xmunapq75FHJi1RFsiSJHXrAOC4JJfQvM3yrsBenUYkLXEWyJIkdaiqfpDkPsA27aJzq+rGLmOSljoLZEmSOpBkl6r6ZpI9p626dzsO8gmdBCbJAlmSpI48CvgmsMcM6wqwQJY6YoEsSVIHquqg9uu+XcciaVUWyJIkdSDJK2ZbX1WHLFYsklZlgSxJUjc2mmWdr4WWOmSBLElSB6rqLQBJjgT2r6or2/k7Agd3GJq05PX9qmlJkjQS200VxwBV9Udg++7CkWSBLElSt9ZqrxoDkOROeIdX6pQJKElStw4Gvp/kuHb+GcA7OoxHWvIskCVJ6lBVHZVkJbBLu2jPqjq7y5ikpc4CWZKkjrUFsUWxNCbsgyxJkiT1sECWJEmSelggS5IkST0skCVJkqQeFsiSJElSDwtkSZIkqcfIC+QkT0hybpJfJnndqM8naf7MV2mymLPSaIy0QE6yNvBBYFdgW2DvJNuO8pyS5sd8lSaLOSuNzqivID8U+GVVnVdVNwCfAZ484nNKmh/zVZos5qw0IqN+k97mwIU98xcBO/ZukGQ/YL929pok5444JvVnE+DyroOYRHnPPw37kFsN+4CrMWe+gjk7xszZeZjgfIWl28aO7e/6CH6fxtnY/jsAcFD63XLGnO38VdNVdRhwWNdxaFVJVlbVDl3HofFjzo4nc1YzWRPz1d/18bCm/zuMuovFxcCWPfNbtMskjR/zVZos5qw0IqMukH8A3CvJ3ZPcFngW8IURn1PS/Jiv0mQxZ6URGWkXi6q6KclLga8CawMfr6qzRnlODc0adUtOczNfJ545u8Qs4Zz1d308rNH/DqmqrmOQJEmSxoZv0pMkSZJ6
WCBLkiRJPSyQJUmSpB4WyJIkSZpVkvskeXSSDactf0JXMY2SBbJmlWTfrmOQJGk626fFk+TlwOeBlwE/TdL7SvN3dhPVaDmKhWaV5DdVtazrOCRJ6mX7tHiS/AR4eFVdk2Q5cDzwiao6NMkPq2r7biMcvs5fNa3uJfnx6lYBd1nMWCTNLcldgYOAW4A30VzVeRpwDrB/VV3aYXjS0Ng+jY21quoagKr6dZKdgeOTbEXzb7HGsUAWNP/JPB7447TlAb63+OFImsMRwInABsC3gKOB3YCnAP8NPHl1O0oTxvZpPPwuyQOr6kyA9kry7sDHgft3GtmIWCAL4EvAhlO/+L2SrFj0aCTN5S5V9X6AJC+uqne3y9+f5IUdxiUNm+3TeHgecFPvgqq6CXhekg93E9Jo2QdZkiZMkh9V1QPaz2+vqjf2rPtJVa2RV3QkabE4ioUkTZ7PTw21NK04vidwbmdRSdIawivIkrQGSbJvVR3edRySNMm8grwEJBnoQYYkOyf50qjikTRSb+k6AGkpsY1dM/mQ3hJQVY/oOgZJw+PQV9L4sI1dM3kFeQlIck37deckK5Icn+RnSY5OknbdE9plZwB79uy7QZKPJzktyQ+n3p6T5NAkb2o/Pz7Jd5L4+yQtjrvQPFW+xwzTFR3GJS05trFrJq8gLz3bA/cFLgFOAXZKshL4CLAL8EvgmJ7t3wB8s6pekOQOwGlJvg4cCPwgyXeB9wG7VdUti/dtSEuaQ19J48k2dg3hXyNLz2lVdVGbaGcCy4H7AOdX1S+qeWrzkz3bPw54XZIzgRXAusCyqroOeBFwEvCBqvrVon0H0hJXVS+sqpNXs+7Zix2PpL+yjV1DeAV56bm+5/PNzP07EOBpVTXT0FH3p7mdu9mQYpMkaZLZxq4hvIIsgJ8By5Ns3c7v3bPuq8DLevpRbd9+3Qp4Jc3tpF2T7LiI8UqSNClsYyeQBbKoqr8A+wEntg8Q/L5n9duAdYAfJzkLeFubyB8DXlVVlwAvBD6aZN1FDl1aIzlslLTmsI2dTL4oRJImXJKdaRrT3TsORZLWCF5BlqQx47BRktQtH9KTpPHmsFGStMi8eiBJ481hoyRpkXkFWZLGm8NGSdIi8wqyJE0eh42SpBGyQJakCeOwUZI0Wg7zJkmSJPXwCrIkSZLUwwJZkiRJ6mGBLEmSJPWwQJYkSZJ6WCBLkiRJPSyQJUmSpB4WyJIkSVKP/x/dTDFqNvhNTgAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据分析" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户重复点击" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:30:20.899771Z", + "start_time": "2020-11-13T15:30:20.750817Z" + } + }, + "outputs": [], + "source": [ + "#####merge\n", + "user_click_merge = trn_click.append(tst_click)" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:30:26.290038Z", + "start_time": "2020-11-13T15:30:25.339579Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idcount
00307601
101575071
21637461
312891971
42361621
521684011
63361621
73506441
84398941
94425671
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id count\n", + "0 0 30760 1\n", + "1 0 157507 1\n", + "2 1 63746 1\n", + "3 1 289197 1\n", + "4 2 36162 1\n", + "5 2 168401 1\n", + "6 3 36162 1\n", + "7 3 50644 1\n", + "8 4 39894 1\n", + "9 4 42567 1" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#用户重复点击\n", + "user_click_count = user_click_merge.groupby(['user_id', 'click_article_id'])['click_timestamp'].agg({'count'}).reset_index()\n", + "user_click_count[:10]" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:34:27.418638Z", + "start_time": "2020-11-13T15:34:27.372761Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idcount
311242862957425410
311243862957626810
39376110323720594810
39376310323723568910
5769021348506946313
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id count\n", + "311242 86295 74254 10\n", + "311243 86295 76268 10\n", + "393761 103237 205948 10\n", + "393763 103237 235689 10\n", + "576902 134850 69463 13" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_click_count[user_click_count['count']>7]" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:32:53.298575Z", + "start_time": "2020-11-13T15:32:53.285611Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1, 2, 4, 3, 6, 5, 10, 7, 13])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_click_count['count'].unique()" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 1605541\n", + "2 11621\n", + "3 422\n", + "4 77\n", + "5 26\n", + "6 12\n", + "10 4\n", + "7 3\n", + "13 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#用户点击新闻次数\n", + "user_click_count.loc[:,'count'].value_counts() " ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###### 可以看出:有1605541条(约占99.2%)用户-文章点击组合只出现过一次,即绝大多数情况下用户不会重复点击同一篇文章,仅有极少数用户重复点击过某篇文章。 这个也可以单独制作成特征" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户点击环境变化分析" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:39:41.961797Z", + "start_time": "2020-11-13T15:39:41.949829Z" + } + }, + "outputs": [], + "source": [ + "def plot_envs(df, cols, r, c):\n", + " plt.figure()\n", + " plt.figure(figsize=(10, 5))\n", + " i = 1\n", + " for col in cols:\n", + " plt.subplot(r, c, i)\n", + " i += 1\n", + " v = df[col].value_counts().reset_index()\n", + " fig = sns.barplot(x=v['index'], y=v[col])\n", + " for item in fig.get_xticklabels():\n", + " item.set_rotation(90)\n", + " plt.title(col)\n", + " plt.tight_layout()\n", + " plt.show()" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:39:55.476626Z", + "start_time": "2020-11-13T15:39:48.764592Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAs8AAAFgCAYAAABE0JQRAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAABDJUlEQVR4nO3de9xtY73//9fbyplEdMA6SCJKZEWnvUlyKIdSQtqhg713FLt0UP0QZXfSQakoy6GUU/ruVZREdiGxSIr2ynJqWZTzWWJ5//4Y181Yc837vudc6573mHPd7+fjMR73HNcY15if++azxmeOeY1ryDYRERERETG6pZoOICIiIiJiUKR4joiIiIjoUIrniIiIiIgOpXiOiIiIiOhQiueIiIiIiA6leI6IiIiI6FCK54iILknaR9LFtfWHJL1glD7TJFnSMxbzvW+WtM3iHKMcZ9SYIyJiYSme+0CTJ+KxkhNxTGS2V7J9Y9NxdGOsYpb0Bkm/kvSgpLslXS3pY5KWG4s4IwbdknCOjwWleO5DE/lE3CRJW0m6tek4IgaFpN2As4AfAFNtPxvYHVgbmDxMnxQDMaEtCefLiS7Fc/ScpElNxxCxqCRNlnS2pDvLldVvtNnHkl5YXi8v6WhJt0i6X9LFkpZv0+etZQjGS0Z5/38rx7pb0idbti0l6eOSbijbz5C0Wtn2M0kHtOz/B0m7dhOzpFdKulTSfaX/VqVdwJeBI2x/x/Y9ALZn2/6A7evLfodLOkvS9yU9AOwjaU1JMyXdI2mOpPfVYjxJ0mdq6wt8qC1/s0MkXSfpXkkn5ip3RIynFM/jrA9OxG1PhGXbRZKOlHRJ+Qr2F5JWL9u6ORGfJOlbks6V9DDwOkkvLse/T9K1knauHeckScdKOqe87+8krdvy93i/pOvL9iMlrVt+jwdKwbBMbf8dVX11fF/ZZ+PatpslHSzpmvL3PF3ScpJWBH4GrKnqK7WHJK050t8ylnyqPvj9FLgFmAasBZw2SrcvAZsBrwZWAz4KPNly3H2BzwPb2P7TCO+/IfAt4N+ANYFnU13VHfIB4M3AlmX7vcCxZdsPgT1bjjUVOKfTmCWtVfb/TGk/GPiRpDWA9UssPxrhbzFkF6or1M8CTqX6G95aYn4bcJSkrTs4zpC9gO2AdYEXAZ/qom9Ez/TBOX7nco69r5xzX1zb9jFJ88p5dLak14/F7zwh2c4yTgswCfgD8BVgRWA54LXAPsDFtf0MvLC8Pha4iOqkPYnq5LYs1YncwDOAfYE5Q31GeP+1gLuBN1J9cHpDWV+jbL8IuIHqZLR8Wf9c2fYu4JLasTYE7gOWbRPzScD9wGvK+6xc4vsEsAywNfAgsH5t/7uBzcvvcypwWsvf43+AZwIbAY8BFwAvAFYBrgP2LvtuCtwBbFH+XnsDN9fivBm4nOqkvRrwZ+A/yratgFub/v8kS/8swKuAO4FntLS3zdny//ujwMvaHGsoZw8u/8+u3cH7H9qSCysC/6Qquin//76+tv35wOMlj1YGHqYaTgHwWWBGlzF/DPheS9t5Ja9eW46xXG3baeXfhUeAfytthwO/ru0zGZgPrFxr+2/gpPL6JOAztW0L5GXJ4f+orb8RuKHp/1eyZKH5c/yLSs6/AVia6kPwHKrz7vrAXGDNsu80YN2m/2aDuuTK8/janKpo+4jth23/w/bFw+0saSng3cCBtufZnm/7UtuP1XY7CPgIsJXtOaO8/zuBc22fa/tJ2+cDs6hOPkNOtP0X248CZwCblPYfA5tImlrW9wLOboml7n9sX2L7yXKMlagK8X/avpDqat6etf1/bPty209QFc+btBzvC7YfsH0t8CfgF7ZvtH0/1RXjTct++wHH2f5d+XudTFVsv7J2rGNs3+bqa+aftH
mviCGTgVvK/5edWJ3qhHnDCPt8BDjWdifj69ekOuEBYPthqg+aQ6YCPy5Xme6jKqbnA8+1/SDVVeM9yr57UuVWNzFPBXYbOn55j9dSFelDcTy/Ft8etp8FXEVVCAyZW3u9JnBPiW/ILVTFQ6fqx7ulHDOiaU2f43cHzrF9vu3Hqb5RWp6qIJ9PVZRvKGlp2zfbHunfqRhBiufx1fSJeKQT4ZC/1V4/QlX00sWJeEjryXJuKaSHtJ4s275vzd9rrx9tsz60/1Tgwy2/42QWPLmO9l4RQ+YCU9T5TW53Af+gGk4wnG2BT0l6awfHu53ajXeSVqAaulGPbwfbz6oty9meV7b/ENhT0quo/i35VZcxz6W68lw//oq2PwfMBuYBu3bwe7j2+jZgNUkr19qmlGNBdeVshdq257U5Xv1mxCnlmBFNa/ocvybVuRWAcs6dC6xVCu+DqL4JukPSaRmauOhSPI+vpk/EI50IO9HJiXhI68lycvmUPaR+shxLc4HPtvyOK9j+YQd9PfouMcFcTlXAfk7SimV8/GuG27mcrGYAX1Z1U9wkSa+StGxtt2uB7YFj62P/h3EWsKOk15Zx/Uew4L/b3wY+O/SNkKQ1JO1S234u1QfKI4DTWz7AdhLz94GdJG1X2pdTdQPf2qXfh4HDJL1P0qqqrAc8d4S/0VzgUuC/y/E2Bt5T3gvgauCNklaT9DyqE36r/SWtrermyE8Cp4/0R4wYJ02f42+jynfgqZt6J1POtbZ/YPu1ZR9T3XcRiyDF8/hq+kQ87Imww/hHPREP43dUV3g/KmlpVTcp7sToN14tiu8A/yFpi3IiX1HSm1qucg3n78CzJa3Sg7hiANmeT/X/6guBv1Ld5Lb7KN0OBv4IXAHcQ3WCWuDfWtt/AHYEviNphxHe/1pgf6qp4G6nuiGwfgXqa8BM4BeSHgQuoxrvP9T/MeBsYJtyjK5iLoXuLlT3K9xJVRx8ZOj3sX068HaqIWFzqYqBM4DjgTNHeL89qcZc3kY1JOww278s275HNW70ZuAXtC+Mf1C23Uh11e4zbfaJGG9Nn+PPAN4k6fWSlqb6cPsYcKmk9SVtXY79D6pvbDs9h0erpgZbT9SF6orr/6MaL3gXcAwj30ywPPBVqk+O9wO/Lm3Tyn7PKPtNpyr+dhjl/bcA/pfqBHkn1VCMKWXbRcB7a/suEFdpO6G87yta2ltvGPxMy/aNyvveT3Wz1Ftq2xbYn4VvEHrq2GX9YmCf2vpngO/W1renKgLuo/qH7EzKzUlUJ+RtavseDny/tj6j/Le5j3JjRZYsWfpnac3hLFn6aemDc/xbyjn2/nLO3ai0b0xV3D9Yzv8/zTlu0ReVP2pERETfk3Qz1Yf8X462b0REL2TYRkREgyTtpafnFq8v1zYdW0RELCxXnpcwkvYCjmuz6RbbG413PBERETE2co7vDymeIyIiIiI61Ol0KgNh9dVX97Rp05oOI2JcXHnllXfZXqPpOBZHcjYmkkHP2eRrTCQj5esSVTxPmzaNWbNmNR1GxLiQdMvoe/W35GxMJIOes8nXmEhGytfcMBgRERER0aGeFs+SZki6Q9KfhtkuScdImiPpGkkvr23bW9L1Zdm7l3FGRCU5GzE4kq8Rzej1leeTqB5YMZwdgPXKsh/wLYDyyNXDqB7osTnV419X7WmkEQHJ2YhBchLJ14hx19Pi2favqZ5kM5xdgFNcuQx4lqTnA9sB59u+x/a9wPmM/A9ERIyB5GzE4Ei+RjSj6RsG1wLm1tZvLW3DtS9E0n5Un6iZMmXKmAW22UdOGbNjRQy58ovvajqExZWcjQllwHM2+RoTynjl68DfMGj7eNvTbU9fY42BnQEoYsJIzkYMjuRrxMKaLp7nAZNr62uXtuHaI6JZydmIwZF8jeiBpovnmcC7yh3BrwTut307cB6wraRVy00M25a2iGhWcjZicCRfI3qgp2OeJf0Q2ApYXdKtVHf3Lg1g+9vAucAbgTnAI8
C+Zds9ko4EriiHOsL2SDdFRMQYSM5GDI7ka0Qzelo8295zlO0G9h9m2wxgRi/iioj2krMRgyP5GtGMpodtREREREQMjBTPEREREREdSvEcEREREdGhjotnSd/rpC0i+oekZ0pauek4IiIilhTdXHneqL4iaRKw2diGExFjQdIrJP0RuAb4k6Q/SEq+RkRELKZRi2dJh0h6ENhY0gNleRC4A/ifnkcYEYviBOD9tqfZnkp1x/2JDccUEREx8EYtnm3/t+2VgS/afmZZVrb9bNuHjEOMEdG9+bZ/M7Ri+2LgiQbjiYiIWCJ0PM+z7UMkrQVMrfez/eteBBYRi+V/JR0H/BAwsDtwkaSXA9i+qsngIiIiBlXHxbOkzwF7ANcB80uzgRTPEf3nZeXnYS3tm1Ll7dbjG05ERMSSoZsnDL4FWN/2Y70KJiLGhu3XNR1DRETEkqib4vlGYGkgxXNEn5N0aLt220eMdywRERFLkm6K50eAqyVdQK2Atv3BMY8qIhbXw7XXywE7An9uKJaIiIglRjfF88yyRESfs310fV3Sl4DzGgonIiJiidHNbBsnS1oemGJ7dg9jioixtwKwdtNBREREDLpuHs+9E3A18POyvomkXImO6EOS/ijpmrJcC8wGvtpwWBEREQOvm2EbhwObAxcB2L5a0gt6EFNELL4da6+fAP5uOw9JiYiIWEwdX3kGHrd9f0vbk2MZTESMDdu3AM8CdqKaZnLDRgOKiIhYQnRTPF8r6R3AJEnrSfo6cGmP4oqIxSDpQOBU4DllOVXSB5qNKiIiYvB1Uzx/ANiIapq6HwIPAAeN1knS9pJmS5oj6eNttn9F0tVl+Yuk+2rb5te2ZXx1ROfeA2xh+1DbhwKvBN43Wqfka8RgSc5GjL9uZtt4BPhkWToiaRJwLPAG4FbgCkkzbV9XO+5/1fb/ANXjg4c8anuTTt8vIp4iYH5tfX5pG75D8jVioCRnI5rRcfEsaTrwCWBavZ/tjUfotjkwx/aN5RinAbsA1w2z/57AYZ3GFBHDOhH4naQfl/U3AyeM0if5GjFYkrMRDehm2MapwEnAW6luQhpaRrIWMLe2fmtpW4ikqcA6wIW15uUkzZJ0maQ3D9Nvv7LPrDvvvLOT3yNiiSZpKeAyYF/gnrLsa/uro3Tteb6WvsnZiLGRc2xEA7qZqu5O270cE7UHcJbt+lfNU23PK1PiXSjpj7ZvqHeyfTxwPMD06dPdw/giBoLtJyUda3tT4Koevc0i5WuJLzkbMf5yjo0YI91ceT5M0ncl7Slp16FllD7zgMm19bVLWzt7UN2I+BTb88rPG6nml9504W4R0cYFkt4qacRxzi2SrxGDJTkb0YBuiud9gU2A7Xl6yMaOI3UArgDWk7SOpGWoknehq9eSNgBWBX5ba1tV0rLl9erAaxh+HFdELOjfgTOBxyQ9IOlBSQ+M0if5GjFYkrMRDehm2MYrbK/fzcFtPyHpAOA8YBIww/a1ko4AZtWGgewBnGa7/pXQi4HjJD1JVeR/rn4HcUQMz/bKi9An+RoxQJKzEc3opni+VNKG3SaX7XOBc1vaDm1ZP7xNv0uBl3bzXhETXZm6annbD5X1VwLLlM2/t/3gSP2TrxGDJTkbMf66KZ5fCVwt6SaqB6UI8ChT1UXE+Po8cAfwhbL+Q+BPwHJUNw9+rKG4IiIilgjdFM/b9yyKiBgrrwdeUVu/z/ZO5cbB3zQUU0RExBKj4xsGbd9CNYfk44BrS0T0j6VsP1Fb/xhUXxEBKzUTUkR0qtzIl290I/pYN08Y/ADVk4n+DjxZmg0kySP6xzKSVh4a22z7FwCSVqEauhERfUbSRcDOVOfkK4E7JF1i+0ONBhYRbXUzVd2BwPq2N7L90rKkcI7oL98BTpc0ZaihPFnsh8B3G4sqIkayiu0HgF2BU2xvAWzTcEwRMYxuxjzPBe7vVSARsfhsf1nSI8DFklakurH3QappqL7VbHQRMYxnSHo+8Hbgk0
0HExEj66Z4vhG4SNI5VLNtANXJesyjiohFZvvbwLclrVzWR5yeLiIadwTVXM2X2L6iPC77+oZjiohhdFM8/7Usy/D0vLER0YckPRc4ClgT2EHShsCrbJ/QbGQR0cr2mVRPBB1avxF4a3MRRcRIOi6ebX8aQNJKZf2hXgUVEYvtJOBEnv4K+C/A6UCK54g+I2lt4OtUj8iGalrJA23f2lxUETGcjm8YlPQSSb8HrgWulXSlpI16F1pELIbVbZ9BmRmnTF83v9mQImIYJwIzqb4pWhP4SWmLiD7UzWwbxwMfsj3V9lTgw1R39kdE/3lY0rMpc7GXx3Tnht+I/rSG7RNtP1GWk4A1mg4qItrrZszzirZ/NbRi+6JyN39E9J8PUV3JWlfSJVQn4rc1G1JEDONuSe+kmlISYE/g7gbjiYgRdDXbhqT/D/heWX8n1QwcEdFnbF8laUtgfarp6mbbfrzhsCKivXdTjXn+CtW3RZcC+zYaUUQMq5thG++munp1NvAjYPXSFhF9RtL+wEq2r7X9J2AlSe9vOq6IWJjtW2zvbHsN28+x/Wbbfx3aLumQJuOLiAV1VDxLmgScbfuDtl9uezPbB9m+t8fxRcSieZ/t+4ZWSq6+r7lwImIx7NZ0ABHxtI6KZ9vzgSclrdLjeCJibEySpKGV8gE487NHDCaNvktEjJduxjw/BPxR0vnAw0ONtj845lFFxOL6OXC6pOPK+r+XtogYPG46gIh4WjfF89lliYj+9zGqgvk/y/r5wHebCyciFkOuPEf0kY5vGLR9crtltH6Stpc0W9IcSR9vs30fSXdKuros761t21vS9WXZu/NfK2Jis/2k7W/ZfltZjivDr0aUfI0Yf5JWa9O2Tm31zNbttf2SsxHjbNQrz5LOsP12SX+kzVdHtjceoe8k4FjgDcCtwBWSZtq+rmXX020f0NJ3NeAwYHp53ytL39ykGDGM5GvEQPqJpB1sPwAgaUPgDOAlALaPatcpORvRjE6GbRxYfu64CMffHJhj+0YASacBuwCtid3OdsD5tu8pfc8HtufpSeQjYmHJ14jBcxRVAf0mqrnZTwH26qBfcjaiAaMO27B9e3n5VuDxMh/lU8so3dcC5tbWby1trd4q6RpJZ0ma3E1fSftJmiVp1p133jnarxOxROv3fIXkbEQr2+dQPSDlF8BJwFtsX91B15xjIxrQzUNSVgbOl/QbSQdIeu4YxfATYFr5Ovl8YNRx1HW2j7c93fb0NdZYY4xCihh4fZmvkJyNGCLp65KOkXQMsDWwCnATcEBpGws5x0aMsW5uGPy07Y2A/YHnA/8r6ZejdJsHTK6tr13a6se92/ZjZfW7wGad9o2I9pKvEQNhFnBlbfkC1RN8h9ZHk5yNaEA3U9UNuQP4G3A38JxR9r0CWK/cNTwP2AN4R30HSc+vfdW8M/Dn8vo84ChJq5b1bYE8ojSiO8nXiD41NGOVpBWBfwzNiFNuBFy2g0MkZyMa0HHxLOn9wNuBNaimzXlfmzt6F2D7CUkHUCXpJGCG7WslHQHMsj0T+KCknYEngHuAfUrfeyQdSfWPA8ARQzc2RMTIkq8RA+UCYBuqh5EBLE81/vnVI3VKzkY0o5srz5OBgzq8ieEpts8Fzm1pO7T2+hCG+bRrewYwo5v3iwgg+RoxSJazPVQ4Y/shSSt00jE5GzH+uhnzfAjV47nXlDRlaOlhbBGxiEq+riRpXwBJa7Q8dCEi+sfDkl4+tCJpM+DRBuOJiBF0M2zjAOBw4O/Ak6XZwLAPXYiIZkgaevjB+sCJwNLA94HXNBlXRLR1EHCmpNuoHsX9PGD3RiOKiGF1M2zjIGB923f3KJaIGDtvATYFrgKwfZuklZsNKSLasX2FpA2oPuwCzLb9eJMxRcTwuime5wL39yqQiBhT/7RtSYan7uaPiD4iaWvbF0ratWXTiyRh++xGAouIEXVTPN8IXCTpHGBozkhsf3nMo4qIxXWGpOOAZ0l6H/Bu4D
sNxxQRC9oSuBDYqc02AymeI/pQN8XzX8uyTFkiok/Z/pKkNwAPUH0VfKjt8xsOKyJqbB9Wfu7bdCwR0bmOi2fbnwaQtILtR3oXUkSMhVIsp2CO6FOSPjTS9nyzG9Gfuplt41XACcBKwBRJLwP+3fb7exVcRHRH0oNUX/e2ZfuZ4xhORIxspJt4h83jiGhWN8M2vgpsB8wEsP0HSf/ai6AiYtHYXhmgPDnsduB7VFNf7QU8v8HQIqJF7Rvdk4EDbd9X1lcFjm4wtIgYQccPSQGwPbelaf4YxhIRY2dn29+0/aDtB2x/C9il6aAioq2NhwpnANv3Uk01GRF9qJviea6kVwOWtLSkg4E/9yiuiFg8D0vaS9IkSUtJ2gt4uOmgIqKtpcrVZgAkrUZ33wxHxDjqJjn/A/gasBYwD/gFsH8vgoqIxfYOqnz9GtXYyUtKW0T0n6OB30o6s6zvBny2wXgiYgTdzLZxF9W4ybYkHWL7v8ckqohYLLZvZoRhGsnXiP5h+xRJs4CtS9Outq9rMqaIGF5XY55HsdsYHisieiv5GtFHbF9n+xtlSeEc0cfGsnjWGB4rInor+RoREbEIxrJ4zpyUEYMj+RoREbEIcuU5YmJKvkZERCyCjovnMnVOa9s6tdUzW7dHRDOSrxEREb3RzZXnn0h66tG+kjYEfjK0bvuodp0kbS9ptqQ5kj7eZvuHJF0n6RpJF0iaWts2X9LVZZnZRawRE13yNWICSM5GjL9uiuejqE7IK0najOrK1TtH6iBpEnAssAOwIbBnOYnX/R6Ybntj4CzgC7Vtj9repCw7dxFrxESXfI1YwiVnI5rRzTzP50hamurhKCsDb7H9l1G6bQ7MsX0jgKTTqOaefWoaHtu/qu1/GaOc4CNidMnXiAkhORvRgFGLZ0lfZ8E781cBbgAOkITtD47QfS1gbm39VmCLEfZ/D/Cz2vpyZeL4J4DP2f5/beLbD9gPYMqUKSMcOmLJ1+/5WmJMzkaMjZxjIxrQyZXnWS3rV/YiEEnvBKYDW9aap9qeJ+kFwIWS/mj7hno/28cDxwNMnz4902/FRNfX+QrJ2Ygm5BwbMXZGLZ5tnwwgaUXgH7bnl/VJwLKjdJ8HTK6tr13aFiBpG+CTwJa2H6u997zy80ZJFwGbUl1Fi4g2kq8RE0pyNqIB3dwweAGwfG19eeCXo/S5AlhP0jqSlgH2ABa4o1fSpsBxwM6276i1rypp2fJ6deA11MZxRcSIkq8RS77kbEQDOr5hEFjO9kNDK7YfkrTCSB1sPyHpAOA8YBIww/a1ko4AZtmeCXwRWAk4UxLAX8tdvy8GjpP0JFWR/znbSeyIziRfI5ZwydmIZnRTPD8s6eW2rwIo0189Olon2+cC57a0HVp7vc0w/S4FXtpFfBHxtORrxASQnI0Yf90UzwdRfXK9jerRvs8Ddu9FUBGx2A4i+RoRETHmupnn+QpJGwDrl6bZth/vTVgRsTiSrxEREb3RyTzPW9u+UNKuLZteVOaNPbtHsUVEl5KvERERvdXJlectgQuBndpsM5CTcUT/SL5GRET0UCfzPB9Wfu7b+3AiYnEkXyMiInqrk2EbHxppu+0vj104EbE4kq8RERG91cmwjZVH2JZHdUb0l+RrRERED3UybOPTAJJOBg60fV9ZXxU4uqfRRURXkq8RERG91c3juTceOhED2L4X2HTMI4qIsZB8jYiI6IFuiuelytUrACStRncPWYmI8ZN8jYiI6IFuTqZHA7+VdGZZ3w347NiHFBFjIPkaERHRA908YfAUSbOArUvTrrav601YEbE4kq8RERG90dXXuOXkmxNwxABIvkZERIy9bsY8R0RERERMaCmeIyIiIiI6lOI5IiIiIqJDKZ4jIiIiIjqU4jkiIiIiokM9L54lbS9ptqQ5kj7eZvuykk4v238naVpt2yGlfbak7Xoda8REl3yNGCzJ2Yjx19PiWdIk4FhgB2BDYE9JG7bs9h7gXtsvBL4CfL
703RDYA9gI2B74ZjleRPRA8jVisCRnI5rR6yvPmwNzbN9o+5/AacAuLfvsApxcXp8FvF6SSvtpth+zfRMwpxwvInoj+RoxWJKzEQ3o6iEpi2AtYG5t/VZgi+H2sf2EpPuBZ5f2y1r6rtX6BpL2A/Yrqw9Jmj02oUcXVgfuajqIQaAv7T2Wh5s6lgdjHPIVkrN9IjnboYmes8nXvpB87dB45Wuvi+ees308cHzTcUxkkmbZnt50HDEYkrPNS85Gp5KvzUu+9p9eD9uYB0yura9d2truI+kZwCrA3R32jYixk3yNGCzJ2YgG9Lp4vgJYT9I6kpahujlhZss+M4Gh6+xvAy607dK+R7lTeB1gPeDyHscbMZElXyMGS3I2ogE9HbZRxlcdAJwHTAJm2L5W0hHALNszgROA70maA9xDlfyU/c4ArgOeAPa3Pb+X8cYiy1d6S4Dk64SSnF0CJGcnjORrn1H1ATQiIiIiIkaTJwxGRERERHQoxXNERERERIdSPEdEREREdCjFcyw2Sac0HUNERETEeBj4h6TE+JLUOg2SgNdJehaA7Z3HPaiIiIiIcZLiObq1NtXURt8FTFU8TweObjKoiOiepH1tn9h0HBERgyRT1UVXJC0FHAi8EfiI7asl3Wj7BQ2HFhFdkvRX21OajiMiKpKuAs4Gfmj7hqbjifZy5Tm6YvtJ4CuSziw//07+P4roW5KuGW4T8NzxjCUiRrUq8CzgV5L+BvwQON32bY1GFQvIledYLJLeBLzG9ieajiUiFlY+4G4H3Nu6CbjU9prjH1VEtCPpKtsvL6//BdgT2BX4M9XV6DxtsA+keI6IWIJJOgE40fbFbbb9wPY7GggrItqoF8+1tknAG4Ddbe/bTGRRl+I5IiIiog9IOs32Hk3HESPLPM8RERERfWCkwllSrjr3iRTPsRBJl3a5/1aSftqreCIiIoJPNx1AVDJLQizE9qubjiEiOiPp0m5yVtJWwMG2d+xZUBGxSDI7zmBI8RwLkfSQ7ZXKSfZw4C7gJcCVwDttW9L2wFeBR4CLa31XBL5e9l8aONz2/0j6GnC37SMkbQd8EtiqTH0XEYsoH3YjlijPZYTZccY/nGgnwzZiNJsCBwEbAi8AXiNpOeA7wE7AZsDzavt/ErjQ9ubA64AvloL6EGB3Sa8DjgH2TeEcsfgkPVR+biXpIklnSfo/SadKUtm2fWm7imraq6G+K0qaIelySb+XtEtp/5qkQ8vr7ST9ujwgKSJ666fASrZvaVluBi5qNrQYkivPMZrLbd8KIOlqYBrwEHCT7etL+/eB/cr+2wI7Szq4rC8HTLH9Z0nvA34N/FeenBTRE5sCGwG3AZdQfdidRfVhd2tgDnB6bf+hD7vvlvQs4HJJv6T6sHuFpN9Qfdh9Yz7sRvSe7feMsC3TSvaJFM8xmsdqr+cz+v8zAt5qe3abbS8F7gbyUIaI3siH3YiIHsvXcLEo/g+YJmndsr5nbdt5wAdqXxdvWn5OBT5MdWVsB0lbjGO8ERPFon7Y3aQsU2z/uWzLh92IiDZSPEfXbP+D6srVOWUM5R21zUdS3Sh4jaRrgSNLIX0C1R3+twHvAb5bxk5HRG/lw25En8hUsEuGDNuIhdheqfy8iNoNCrYPqL3+ObBBm76PAv/e5rDb1Pa5kuqqVkT0mO1/SBr6sPsI8Btg5bL5SKpZc64pNwTeJGknah92Jb0HOEnSK8oH54hYRJkdZ8mQx3NHREREjINFnAr2BbZ3zFSw/SNXniMiIiLGX2bHGVAZ8xwREREx/i63fWspdK+mmh1nA8rsOK6GBny/tv+2wMfLTDoX8fTsOI8A7wPOB76R2XF6L1eeIyIiIsZfpoIdULnyHBEREdEfMjvOAEjxHBEREdEHMhXsYMhsGxERERERHcqV54iIiIiIDqV4joiIiIjoUIrniIiIiIgOpXiOiIiIiOhQiueIiIiIiA6leI6IiIiI6FCK54iIiIiIDqV4joiIiIjoUI
rniIiIiIgOpXiOiIiIiOhQiueIiIiIiA6leI6IiIiI6FCK5yWApH0kXVxbf0jSC0bpM02SJT2j9xFGxHAGNX8l7SXpF029f0Qn+jm/JL1G0vUlpjf38r1ibKVwWgLZXqnpGMaKpGnATcDStp9oOJyInhuU/LV9KnBq03FEdKPP8usI4Bu2v9Z0IK0kHQ680PY7m46lH+XKcwy8XD2P6F7yJqJ3OsyvqcC1Y3V8SZMW9xjRmRTPA0bSZElnS7pT0t2SvtFmH0t6YXm9vKSjJd0i6X5JF0tavk2ft0q6WdJLRnn/10q6VNJ9kuZK2qe0ryLplBLXLZI+JWmpsu1wSd+vHWOBr8QkXSTpSEmXSHpQ0i8krV52/3X5eV/5autV5Wu4SyR9RdLdwBGS7pH00tp7PEfSI5LW6ObvG9FLTeZvLe/eI+mvwIWl/d2S/izpXknnSZpa67OtpNnlvb8p6X8lvbdsa/06/NWSrij7XiHp1bVtI+V4xJgYpPySdAPwAuAn5dy2bDmPniDpdknzJH1GpSBuc947XNJJkr4l6VxJDwOvk7SmpB+Vv8FNkj5Yi/FwSWdJ+r6kB4B9hvldtgc+AexeYvuDpN0kXdmy34ck/U95fZKkb0s6v+T4/7b8W7JB2XZP+Tfl7cP9LQdBiucBUpLop8AtwDRgLeC0Ubp9CdgMeDWwGvBR4MmW4+4LfB7YxvafRnj/qcDPgK8DawCbAFeXzV8HVqH6x2BL4F3Avh3+agDvKPs/B1gGOLi0/2v5+SzbK9n+bVnfArgReC5wJNXfof710p7ABbbv7CKGiJ5pOn9rtgReDGwnaReqk+SuVDn9G+CH5birA2cBhwDPBmaXONr9bqsB5wDHlH2/DJwj6dm13YbL8YjFNmj5ZXtd4K/ATuXc9hhwEvAE8EJgU2Bb4L21Y9fPe58tbe8or1cGLgV+Avyh/P6vBw6StF3tGLtQ5fWzGGbYle2fA0cBp5fYXgbMBNaR9OLarv8GnFJb34vqfLw6VW1wKoCkFYHzgR9Q5f8ewDclbdj2LzgIbGcZkAV4FXAn8IyW9n2Ai2vrpkq+pYBHgZe1Oda0st/BwHXA2h28/yHAj9u0TwL+CWxYa/t34KLy+nDg+23e+xll/SLgU7Xt7wd+3m7f2u/715YYtqD6h0hlfRbw9qb/m2XJMrT0Qf4O9XlBre1nwHtq60sBj1B9nfwu4Le1bQLmAu9tjZvqJHp5y/v9FtinvB42x7NkGYtl0PKrrN9MVZRDVRA/Bixf239P4Fe136P1vHcScEptfYs2+xwCnFheHw78usO/5+HUztul7VvAZ8vrjYB7gWVrsZxW23clYD4wGdgd+E3LsY4DDmv6/5tFXXLlebBMBm5x5zfOrQ4sB9wwwj4fAY61fWuH79/uWKsDS1N94h9yC9Un3079rfb6EarEG8nc+ort35V+W0nagOofx5ldvH9ErzWdv0PquTMV+JqqYVj3AfdQFclrAWvW93V1xhvufdZkwfyHhf8N6DbHI7oxaPnVairVefT22v7HUV2pbXfs4d5vzaH+5RifoCrMRzpGp04G3iFJVB+Yz3B1xXyhY9t+iOr3XbPEtUVLXHsBz1uMWBqVweKDZS4wRdIzOvwH4i7gH8C6VF/jtLMt8HNJf7P9ow7ef/Nh3udxqgS5rrRNAeaV1w8DK9T27yZh3EX7yVRDN/4GnGX7H128T0SvNZ2/Q+q5M5fqStJCX99KWg9Yu7au+nqL26jyv24K8PMOY4pYXAOVX23MpbryvPoI8bc777W+30221+swvpEstJ/tyyT9E/gXquEi72jZZfLQC0krUQ2Fua3E9b+239Dhe/e9XHkeLJcDtwOfk7SipOUkvWa4nW0/CcwAvlxuIpik6oa7ZWu7XQtsDxwraedR3v9UYBtJb5f0DEnPlrSJ7fnAGcBnJa1cxkZ/CBi6SfBq4F8lTZG0CtXXSJ26k2oM2ojzchbfB95CVUCfMsq+EeOt6fxt59
vAIZI2gqdu/N2tbDsHeKmkN6u6uXd/hv/gey7wIknvKP827A5sSDUGNWI8DFp+tcZzO/AL4GhJz5S0lKR1JW3ZxftdDjwo6WOqboacJOklkl6xCLH/HZimcuN/zSnAN4DHbV/csu2NqiYVWIZq7PNltudS/TvwIkn/JmnpsryiZfz0QEnxPEBKkboT1ZCEv1J9hbr7KN0OBv4IXEH1FcrnafnvbvsPwI7AdyTtMML7/xV4I/DhcqyrgZeVzR+gusJ8I3Ax1Y0BM0q/84HTgWuAK+nihGr7EaqbIS4pX/e8coR95wJXUX1i/k2n7xExHprO32Fi+nE55mmq7r7/E7BD2XYXsBvwBeBuqmJ4FtXVsdbj3F1i+HDZ96PAjuUYET03aPk1jHdR3Ux7HdV44rOA53fxfvNLrJtQPR/hLuC7VDfzd+vM8vNuSVfV2r8HvISnL47V/QA4jOpvuRnlJn7bD1Jdxd+D6kr036j+Lsu2OcZAGLq5KmKJIGkGcJvtTzUdS8SSpFyBuhXYy/avmo4nIsafqqn87gBebvv6WvtJwK0T5dybMc+xxFD1NMJdqab4iYjFVKa4+h3VrAQfobrZ6bJGg4qIJv0ncEW9cJ6IMmwjFiBpL1WTorcui/QUpPEi6Uiqr8S+aPumpuOJaEIP8vdVVLMR3EX1lfibbT86ZgFHDJBBPT8OR9LPhvl9PjHM/jcDB1INz5rQMmwjIiIiIqJDufIcEREREdGhJWrM8+qrr+5p06Y1HUbEuLjyyivvsr1G03EsjuRsTCSDnrPJ15hIRsrXJap4njZtGrNmzWo6jIhxIan1iW4DJzkbE8mg52zyNSaSkfK1p8M2JM2QdIekPw2zXZKOkTRH0jWSXl7btrek68uydy/jjIhKcjZicCRfI5rR6zHPJ1E9nWc4OwDrlWU/4FsAklajmmh7C6rHQR8madWeRhoRkJyNGCQnkXyNGHc9LZ5t/5rqSTPD2QU4xZXLgGdJej6wHXC+7Xts3wucz8j/QETEGEjORgyO5GtEM5oe87wWMLe2fmtpG659IZL2o/pEzZQpU8YssM0+csqYHStiyJVffFfTISyu5GxMKAOes8nXmFDGK18Hfqo628fbnm57+hprDOxNzBETRnI2YnAkXyMW1nTxPA+YXFtfu7QN1x4RzUrORgyO5GtEDzRdPM8E3lXuCH4lcL/t24HzgG0lrVpuYti2tEVEs5KzEYMj+RrRAz0d8yzph8BWwOqSbqW6u3dpANvfBs4F3gjMAR4B9i3b7pF0JHBFOdQRtke6KSIixkByNmJwJF8jmtHT4tn2nqNsN7D/MNtmADN6EVdEtJecjRgcydeIZjQ9bCMiIiIiYmCkeI6IiIiI6FCK54iIiIiIDqV4joiIaJCkF0m6QNKfyvrGkj7VdFwR0V6K54iIiGZ9BzgEeBzA9jXAHo1GFBHDSvEcERHRrBVsX97S9kQjkUTEqFI8R0RENOsuSesCBpD0NuD2ZkOKiOH0dJ7niIiIGNX+wPHABpLmATcBezUbUkQMJ8VzREREg2zfCGwjaUVgKdsPNh1TRAwvwzYiIiIaJOnZko4BfgNcJOlrkp7ddFwR0V6K54iIiGadBtwJvBV4W3l9eqMRRcSwMmwjIiKiWc+3fWRt/TOSdm8smogYUa48R0RENOsXkvaQtFRZ3g6c13RQEdFeiueIiIhmvQ/4AfBYWU4D/l3Sg5IeaDSyiFhIhm1EREQ0yPbKTccQEZ3LleeIiIgGSfqRpDdKyjk5YgB0nKiSXtrLQCIiIiaob1E9FOV6SZ+TtH7TAUXE8Lr5lPtNSZdLer+kVXoWUURExARi+5e29wJeDtwM/FLSpZL2lbR0s9FFRKuOi2fb/0L1yXgycKWkH0h6w2j9JG0vabakOZI+3mb7VyRdXZa/SLqvtm1+bdvMTmONiEWTfI1oRnkoyj7Ae4HfA1+jKqbPH6VfcjZinHV1w6Dt6yV9CpgFHANsKknAJ2yf3bq/pE
nAscAbgFuBKyTNtH1d7Zj/Vdv/A8CmtUM8anuTbmKMiIqktYCp1PLc9q9H2D/5GtEAST8G1ge+B+xk+/ay6XRJs0bol5yNaEDHxbOkjYF9gTdRfRLeyfZVktYEfgssVDwDmwNzbN9YjnEasAtwXZt9AfYEDus8/IhoR9Lngd2pcm1+aTYwbPFM8jWiKd+xfW69QdKyth+zPX2EfsnZiAZ0M+b568BVwMts72/7KgDbtwGfGqbPWsDc2vqtpW0hkqYC6wAX1pqXkzRL0mWS3txFrBET3ZuB9W2/0fZOZdl5lD7J14hmfKZN22876JecjWhAR1eey1dD82x/r9324dq7tAdwlu35tbaptudJegFwoaQ/2r6hJbb9gP0ApkyZMgZhRCwRbgSWpnrgQi8sUr5CcjZiiKTnURW7y0vaFFDZ9ExghTF+u5xjI8ZIR8Wz7fmSJktaxvY/uzj+PKobDIesXdra2QPYv+V955WfN0q6iGqs1g0t+xwPHA8wffp0dxFbxJLsEeBqSRdQK6Btf3CEPj3P17I9ORtR2Y7qJsG1gaN5unh+APhEB/1zjo1oQDc3DN4EXFLuyH14qNH2l0focwWwnqR1qBJ6D+AdrTtJ2gBYldrXVJJWBR6x/Zik1YHXAF/oIt6IiWxmWbqRfI0YR7ZPBk6W9FbbPxpuP0l7l31bJWcjGtBN8XxDWZYChh4lOuKnUNtPSDoAOA+YBMywfa2kI4BZtodO7nsAp9muH+/FwHGSnizv+bn6HcQRMTzbJ0taBnhRaZpt+/FR+iRfIxowUuFcHAgsVDwnZyOa0U3xfJ3tM+sNknYbrVO5g/jclrZDW9YPb9PvUiBPNYxYBJK2ojrZ3kz1VfDkcvVqpNk2kq8R/UnDbUjORoy/bmbbOKTDtoho3tHAtra3tP2vVGMrv9JwTBGxaDLWOKKPjHrlWdIOwBuBtSQdU9v0TOCJXgUWEYtladuzh1Zs/yWP+Y0YWMNeeY6I8dfJsI3bqJ4ouDNwZa39QeC/2vaIiKbNkvRd4PtlfS+qPI6IPiJpKeBtts8YYbdLxiueiBjdqMWz7T8Af5D0g9FuOIqIvvGfVNNSDU1N9xvgm82FExHt2H5S0keBYYtn2weMY0gRMYpubhjcXNLhwNTST4Btv6AXgUXEorP9GPDlskREf/ulpIOB01lwKth7mgspIobTTfF8AtUwjSuB+aPsGxENkHSG7bdL+iNtbjKyvXEDYUXEyHYvP+sPMTGQi1MRfaib4vl+2z/rWSQRMRYOLD93bDSKiOiY7XWajiEiOtdN8fwrSV8EzmbBx/1eNeZRRcQisX17+XlL07FERGckrQB8CJhiez9J6wHr2/5pw6FFRBvdFM9blJ/Ta20Gth67cCJiLEh6kIWHbdxPNePGh23fOP5RRcQwTqQaEvnqsj4POBNI8RzRhzounm2/rpeBRMSY+ipwK/ADqpt79wDWBa4CZgBbNRVYRCxkXdu7S9oTwPYjkjK3c0Sf6rh4lnRou3bbR4xdOBExRna2/bLa+vGSrrb9MUmfaCyqiGjnn5KWp3xbJGldasMjI6K/dPN47odry3xgB2BaD2KKiMX3iKS3S1qqLG8H/lG25VG/Ef3lMODnwGRJpwIXAB9tNqSIGE43wzaOrq9L+hJw3phHFBFjYS/ga1QPRjFwGfDOcnUrD1yI6BPlCYOrArsCr6QaZnWg7bsaDSwihtXNDYOtVgDWHqtAImLslBsCdxpm88XjGUtEDG/oCYPl8dznNB1PRIyu42Ebkv4o6ZqyXAvMpropKSL6jKQXSbpA0p/K+saSPtV0XBHR1i8lHSxpsqTVhpamg4qI9rq58lx/6MITwN9tPzHG8UTE2PgO8BHgOADb10j6AfCZRqOKiHbyhMGIAdLNmOdbJL0M+JfS9Gvgmp5EFRGLawXbl7fMdpUPuxF9pox5/rjt05uOJSI6082wjQOBU4HnlOVUSR/oVWARsVjuKtNdDU199Tbg9mZDio
hWtp+k+pYoIgZEN8M23gNsYfthAEmfB34LfL0XgUXEYtkfOB7YQNI84CaqGTgiov/8UtLBwOlU08ECYPue5kKKiOF0M8+zqOZ3HjK/tI3cSdpe0mxJcyR9vM32fSTdKenqsry3tm1vSdeXZe8uYo2YsCRNAt5vextgDWAD26+1fUsHfZOvEeNvd6oPvL+mekz3lcCsTjomZyPGXzdXnk8Efifpx2X9zcAJI3UoJ/FjgTdQPSr4CkkzbV/Xsuvptg9o6bsa1cTx06m+er6y9L23i5gjJhzb8yW9trx+eLT9hyRfI5phe51F6ZecjWhGx1eebX8Z2Be4pyz72v7qKN02B+bYvtH2P4HTgF06fMvtgPNt31OS+Xxg+07jjZjgfi9ppqR/k7Tr0DJKn+RrRAMkrSDpU5KOL+vrSdpxtH4kZyMa0c0Ng68Errd9jO1jgBskbTFKt7WAubX1W0tbq7eW+aPPkjS5m76S9pM0S9KsO++8s9NfJ2JJtxxwN7A11cNSdmLB6Sbb6Xm+QnI2oo0TgX8Cry7r8+hsWsmcYyMa0M2Y528BD9XWHypti+snwDTbG1N98j25m862j7c93fb0NdZYYwzCiRh8tvdts7x7aLukQxbx0IuVryW25GzEgta1/QXgcQDbj9DBPUUdyjk2Yox1dcOgbQ+tlOl1RhszPQ+YXFtfu7Q9xfbdth8rq98FNuu0b0Qsst3atCVfI5rxT0nL8/TUkusCj43cBUjORjSim+L5RkkflLR0WQ4EbhylzxXAepLWkbQMsAcws76DpOfXVncG/lxenwdsK2lVSasC25a2iFh87a5qJV8jmnEY8HNgsqRTgQuAj3bQLzkb0YBuZtv4D+AY4FNUn44vAPYbqYPtJyQdQJWQk4AZtq+VdAQwy/ZM4IOSdqZ6+tk9wD6l7z2SjqT6xwHgiMx5GTFmvFBD8jViXEl6je1LqKao2xV4JdUH2wNt3zVa/+RsRDO6eTz3HVSfatuSdIjt/27T71zg3Ja2Q2uvDwHajr+0PQOY0WmMEdGxtuMpk68R4+oYqmEUv7X9cuCcbg+QnI0Yf91ceR7NbsBCxXNEjD9Jq7VeRZK0ju2byuqZDYQVEQt6vExPt7akY1o32v5gAzFFxCjGsngeqzuDI2Lx/UTSDrYfAJC0IXAG8BIA20c1GVxEANX0kdtQzbl8ZcOxRESHxrJ4XmgMZUQ05iiqAvpNwPrAKcBezYYUEXVlXPNpkv5s+w9NxxMRncmV54glkO1zJC0N/AJYGXiL7b80HFZEtPeopAuA59p+iaSNgZ1td/KglIgYZx0XzxlDGdH/JH2dBb8FWgW4AThAUsZQRvSn7wAfAY4DsH2NpB/Q2VMGI2KcdXPlOWMoI/rfrJb1jKOM6H8r2L5cWuAL3CeaCiYiRtZN8ZwxlBF9zvbJAJJWBP5he35ZnwQs22RsETGsu8pTBYeeMPg24PZmQ4qI4XQzz3PGUEYMjguo7uJ/qKwvT5W7r24soogYzv7A8cAGkuYBN5GLUxF9a9TiOWMoIwbScraHCmdsPyRphSYDioiFlW+F3m97m/KN0VK2H2w6rogYXidXnjOGMmLwPCzp5bavApC0GfBowzFFRAvb8yW9trx+uOl4ImJ0oxbPGUMZMZAOAs6UdBvVNJLPA3ZvNKKIGM7vJc2kmrXqqQLa9tnNhRQRw+nmhsGMoYwYELavkLQB1c29ALNtP95kTBExrOWAu4Gta20GUjxH9KFuiueMoYzoc5K2tn2hpF1bNr2o3KOQk3FEn7G970jbJR1i+7/HK56IGFk3xXPGUEb0vy2BC4Gd2mzLlayIwbQbkOI5ok90UzwfRMZQRvQ124eVnyNeyYqIgaLRd4mI8dLNPM8ZQxnR5yR9aKTttr88XrFExJjx6LtExHjpZJ7njKGMGBwrj7AtJ+CIwZQrzxF9pJMrzxlDGTEgbH8aQNLJwIG27yvrqwJHNxhaRAxD0mq272lpW8f2TWX1zAbCio
hhLDXaDvUxlG2Wd4/WX9L2kmZLmiPp4222f0jSdZKukXSBpKm1bfMlXV2Wmd3+chET2MZDhTOA7XuBTUfrlHyNaMRPJD1zaEXShsBPhtZtHzVcx+RsxPjrZNjGIo+hLA9SORZ4A3ArcIWkmbavq+32e2C67Uck/SfwBZ6+EfFR25uMFmNELGQpSauWohlJqzFKvidfIxpzFFUB/Saq+4pOAfYarVNyNqIZnQzbWJwxlJsDc2zfCCDpNGAX4KnEtv2r2v6XAe/sIKaIGNnRwG8lDX3duxvw2VH6JF8jGmD7HElLUz14bGXgLbb/0kHX5GxEAzp5PPfijKFcC5hbW78V2GKE/d8D/Ky2vpykWcATwOds/7/WDpL2A/YDmDJlyijhREwMtk8puTP0xLJdW65GtdPzfIXkbMQQSV9nwYtQqwA3AAeUG/I/OMohco6NaEA38zwvNIZS0qhjKDsl6Z3AdKobFIdMtT1P0guACyX90fYN9X62jweOB5g+fXpmE4goSrE8WsG8SBY1X0tcydmIyqyW9St79UY5x0aMnW6K567HUALzgMm19bVL2wIkbQN8EtjS9mND7bbnlZ83SrqI6oanhU7GETEmkq8R48j2yQCSVgT+YXt+WZ8ELNvBIZKzEQ0YdbaNmqExlEdKOhK4lOrGg5FcAawnaR1JywB7AAvc0VuuXh8H7Gz7jlr7qpKWLa9XB15Dj66iRQSQfI1oygXA8rX15YFfdtAvORvRgG6eMNj1GErbT0g6ADgPmATMsH2tpCOAWbZnAl8EVqJ69DfAX23vDLwYOE7Sk1RF/uc6GLMZEYso+RrRmOVsPzS0YvshSSuM1ik5G9GMboZtLNIYStvnAue2tB1ae73NMP0uBV7azXtFxOJJvkY04mFJL7d9FYCkzYBHO+mYnI0Yf10VzxERETHmDqK6Mnwb1aO4n8fTczFHRJ9J8RwREdEg21dI2oDqASkAs20/3mRMETG8FM8RERENkLS17Qsl7dqy6UVlnuezGwksIkaU4jkiIqIZWwIXAju12WYgxXNEH0rxHBER0QDbh5Wf+zYdS0R0LsVzREREAyR9aKTttr88XrFEROdSPEdERDRj5RG25VHYEX0qxXNEREQDbH8aQNLJwIG27yvrq1I91Tci+lA3j+eOiIiIsbfxUOEMYPteYNPmwomIkaR4joiIaNZS5WozAJJWI98MR/StJGdERESzjgZ+K+nMsr4b8NkG44mIEaR4joiIaJDtUyTNArYuTbvavq7JmCJieCmeIyIiGlaK5RTMEQMgY54jIiIiIjqU4jkiIiIiokMpniMiIiIiOpTiOSIiIiKiQymeIyIiIiI61PPiWdL2kmZLmiPp4222Lyvp9LL9d5Km1bYdUtpnS9qu17FGTHTJ14jBkpyNGH89LZ4lTQKOBXYANgT2lLRhy27vAe61/ULgK8DnS98NgT2AjYDtgW+W40VEDyRfIwZLcjaiGb2+8rw5MMf2jbb/CZwG7NKyzy7AyeX1WcDrJam0n2b7Mds3AXPK8SKiN5KvEYMlORvRgF4/JGUtYG5t/VZgi+H2sf2EpPuBZ5f2y1r6rtX6BpL2A/Yrqw9Jmj02oUcXVgfuajqIQaAv7T2Wh5s6lgdjHPIVkrN9IjnboYmes8nXvpB87dB45evAP2HQ9vHA8U3HMZFJmmV7etNxxGBIzjYvORudSr42L/naf3o9bGMeMLm2vnZpa7uPpGcAqwB3d9g3IsZO8jVisCRnIxrQ6+L5CmA9SetIWobq5oSZLfvMBIaus78NuNC2S/se5U7hdYD1gMt7HG/ERJZ8jRgsydmIBvR02EYZX3UAcB4wCZhh+1pJRwCzbM8ETgC+J2kOcA9V8lP2OwO4DngC2N/2/F7GG4ssX+ktAZKvE0pydgmQnJ0wkq99RtUH0IiIiIiIGE2eMBgRERER0aEUzxERERERHUrxHBERERHRoRTPEREREX1C0gaSXi9ppZb27ZuKKRaU4jnGjK
R9m44hIiJiUEn6IPA/wAeAP0mqP279qGaiilYpnmMsfbrpACLiafUrVZJWkXSCpGsk/UDSc5uMLSLaeh+wme03A1sB/5+kA8s2NRVULGjgH88d40vSNcNtAnIyjugvRwE/L6+PBm4HdgJ2BY4D3txMWBExjKVsPwRg+2ZJWwFnSZpKiue+keI5uvVcYDvg3pZ2AZeOfzgR0aHptjcpr78iae+Rdo6IRvxd0ia2rwaw/ZCkHYEZwEsbjSyekuI5uvVTYKWhxK6TdNG4RxMRI3mOpA9Rfbh9piT56SdjZdheRP95F9UTH59i+wngXZKOayakaJUnDEZELKEkHdbS9E3bd0p6HvAF2+9qIq6IiEGW4jkiYgkmaQNgLeB3Q2MpS/v2tn8+fM+IiGgnX9tFRCyhJH2ATHsVETGmUjzHQiR1deOfpK0k/bRX8UTEItuPTHsV0Tdyfl0y5IbBWIjtVzcdQ0SMiUx7FdFHcn5dMuTKcyxE0kPl51aSLpJ0lqT/k3SqJJVt25e2q6jmjB3qu6KkGZIul/T7oa+JJX1N0qHl9XaSfi0p//9F9NbfJW0ytFIK6R2B1cm0VxHjLufXJUOuPMdoNgU2Am4DLgFeI2kW8B1ga2AOcHpt/08CF9p+t6RnAZdL+iVwCHCFpN8AxwBvtP3k+P0aERNSpr2K6F85vw6ofDKJ0Vxu+9aSiFcD04ANgJtsX1/mjP1+bf9tgY9Luhq4CFgOmGL7EarHjp4PfMP2DeP2G0RMUCV3/zbMtkvGO56IWEDOrwMqV55jNI/VXs9n9P9nBLzV9uw2214K3A2sOUaxRUREDKqcXwdUrjzHovg/YJqkdcv6nrVt5wEfqI3d2rT8nAp8mOprqh0kbTGO8UZERAyCnF8HQIrn6Jrtf1BNgXVOuaHhjtrmI4GlgWskXQscWRL9BOBg27cB7wG+K2m5cQ49YomTqa8ilhw5vw6GPGEwImICKdPVHWx7x4ZDiYgYSLnyHBExwDL1VUTE+MoNgxERS45MfRUR0WO5khARseTI1FcRET2WK88REUuOTH0VEdFjufIcEbFky9RXERFjKMVzRMQSLFNfRUSMrUxVFxERERHRoVx5joiIiIjoUIrniIiIiIgOpXiOiIiIiOhQiueIiIiIiA6leI6IiIiI6FCK54iIiIiIDqV4joiIiIjo0P8PTSiMo7XMJNQAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# 分析用户点击环境变化是否明显,这里随机采样10个用户分析这些用户的点击环境分布\n", + "sample_user_ids = np.random.choice(tst_click['user_id'].unique(), size=10, replace=False)\n", + "sample_users = user_click_merge[user_click_merge['user_id'].isin(sample_user_ids)]\n", + "cols = ['click_environment','click_deviceGroup', 'click_os', 'click_country', 'click_region','click_referrer_type']\n", + "for _, user_df in sample_users.groupby('user_id'):\n", + " plot_envs(user_df, cols, 2, 3)" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# 分析用户点击环境变化是否明显,这里随机采样10个用户分析这些用户的点击环境分布\n", - "sample_user_ids = np.random.choice(tst_click['user_id'].unique(), size=10, replace=False)\n", - "sample_users = user_click_merge[user_click_merge['user_id'].isin(sample_user_ids)]\n", - "cols = ['click_environment','click_deviceGroup', 'click_os', 'click_country', 'click_region','click_referrer_type']\n", - "for _, user_df in sample_users.groupby('user_id'):\n", - " plot_envs(user_df, cols, 2, 3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以看出绝大多数数的用户的点击环境是比较固定的。思路:可以基于这些环境的统计特征来代表该用户本身的属性" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户点击新闻数量的分布" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:40:04.296033Z", - "start_time": "2020-11-13T15:40:03.980868Z" - } - }, - "outputs": [ + }, { - "data": { - "text/plain": [ - "[]" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以看出绝大多数数的用户的点击环境是比较固定的。思路:可以基于这些环境的统计特征来代表该用户本身的属性" ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" }, { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAASw0lEQVR4nO3da4yc1X3H8e8fr+/GVxZj1nZsgnshVCl0RYyS8iLkBm1qKpGIqCpWimSpJU1SWjXQvEjUV0nUQEMTkTghFamilIRQYVW0gQJRlRdxsgbCNYSNa8CLsZeLL/EFbHz6Yo6dsbPjZ9be2Znn+PuRrH2e85yZ55x9xr+ZOXP2TKSUkCSV64xuN0CS1FkGvSQVzqCXpMIZ9JJUOINekgrX1+0GAJx11llpxYoV3W6GJNXKpk2bXk4p9VfV64mgX7FiBUNDQ91uhiTVSkQ81049h24kqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSpcrYP+p1te5eb7nuGNQ4e73RRJ6lm1DvqHn3uNWx8c5tBhg16SWql10EuSqhn0klQ4g16SCmfQS1Lhigh6v99cklqrddBHdLsFktT7ah30kqRqBr0kFc6gl6TCGfSSVLgigt5JN5LUWq2DPnDajSRVqXXQS5KqGfSSVDiDXpIKV0TQJ9dAkKSWah30LoEgSdVqHfSSpGoGvSQVzqCXpMIZ9JJUuCKC3jk3ktRaEUEvSWrNoJekwhn0klS4toI+Iv4mIp6MiCci4jsRMSMiVkbExogYjog7I2Jarjs97w/n4ys62gNJ0glVBn1EDAAfBwZTShcCU4BrgM8Dt6SUzgdeA67LN7kOeC2X35LrSZK6pN2hmz5gZkT0AbOAbcC7gbvy8TuAq/L2mrxPPn55RGcXK3CpG0lqrTLoU0ojwD8Bz9MI+F3AJmBnSulQrrYVGMjbA8AL+baHcv1Fx99vRKyLiKGIGBodHT2pxnf4+UOSitDO0M0CGq/SVwLnArOBD5zqiVNK61NKgymlwf7+/lO9O0lSC+0M3bwH+L+U0mhK6SBwN/BOYH4eygFYCozk7RFgGUA+Pg94ZUJbLUlqWztB/zywOiJm5bH2y4GngIeAq3OdtcA9eXtD3icffzC5YLwkdU07Y/QbaXyo+jDweL7NeuBTwA0RMUxjDP72fJPbgUW5/Abgxg60W5LUpr7qKpBS+gzwmeOKNwOXjFH3APChU2/aOPh+QZJaqvVfxjrnRpKq1TroJUnVDHpJKpxBL0mFM+glqXBFBH1y2o0ktVTroHepG0mqVuuglyRVM+glqXAGvSQVzqCXpMIVEfSujSlJrdU66J10I0nVah30kqRqBr0kFc6gl6TCFRH0fhYrSa3VOujDNRAkqVKtg16SVM2gl6TCGfSSVDiDXpIKV0TQJ9dAkKSWah30TrqRpGq1DnpJUjWDXpIKZ9BLUuEMekkqXBFB75wbSWqt1kHvpBtJqlbroJckVTPoJalwBr0kFa6toI+I+RFxV0T8PCKejohLI2JhRNwfEc/mnwty3YiIWyNiOCIei4iLO9sFSdKJtPuK/kvAf6eUfgd4O/A0cCPwQEppFfBA3ge4AliV/60DbpvQFo/BpW4kqbXKoI+IecBlwO0AKaU3Uko7gTXAHbnaHcBVeXsN8K3U8GNgfkQsmeB2H2lcR+5WkkrSziv6lcAo8K8R8UhEfCMiZgOLU0rbcp2XgMV5ewB4oen2W3OZJKkL2gn6PuBi4LaU0kXAXn49TANAaqwTPK4BlIhYFxFDETE0Ojo6nptKksahnaDfCmxNKW3M+3fRCP7tR4Zk8s8d+fgIsKzp9ktz2TFSSutTSoMppcH+/v6Tbb8kqUJl0KeUXgJeiIjfzkWXA08BG4C1uWwtcE/e3gBcm2ffrAZ2NQ3xSJImWV+b9f4a+HZETAM2Ax+l8STx3Yi4DngO+HCuey9wJTAM7Mt1Oyq52o0ktdRW0KeUHgUGxzh0+Rh1E3D9qTWrPc6
5kaRq/mWsJBXOoJekwhn0klQ4g16SCldG0DvpRpJaqnXQu9SNJFWrddBLkqoZ9JJUOINekgpXRND7WawktVbroA8XQZCkSrUOeklSNYNekgpn0EtS4Qx6SSpcEUGfnHYjSS3VOuhdAkGSqtU66CVJ1Qx6SSqcQS9JhTPoJalwRQR9crUbSWqp1kHvpBtJqlbroJckVTPoJalwBr0kFc6gl6TCFRH0rnUjSa3VOuhd60aSqtU66CVJ1Qx6SSqcQS9JhTPoJalwRQS9k24kqbW2gz4ipkTEIxHxn3l/ZURsjIjhiLgzIqbl8ul5fzgfX9GhthOudiNJlcbziv4TwNNN+58HbkkpnQ+8BlyXy68DXsvlt+R6kqQuaSvoI2Ip8EfAN/J+AO8G7spV7gCuyttr8j75+OW5viSpC9p9Rf/PwN8Dh/P+ImBnSulQ3t8KDOTtAeAFgHx8V65/jIhYFxFDETE0Ojp6cq2XJFWqDPqI+GNgR0pp00SeOKW0PqU0mFIa7O/vn8i7liQ16WujzjuBP4mIK4EZwFzgS8D8iOjLr9qXAiO5/giwDNgaEX3APOCVCW95k+RiN5LUUuUr+pTSTSmlpSmlFcA1wIMppT8DHgKuztXWAvfk7Q15n3z8wdSpJHbkX5Iqnco8+k8BN0TEMI0x+Ntz+e3Aolx+A3DjqTVRknQq2hm6OSql9EPgh3l7M3DJGHUOAB+agLZJkiZAEX8ZK0lqrYig97NYSWqt1kHvZ7GSVK3WQS9JqmbQS1LhDHpJKpxBL0mFM+glqXC1DnpXP5akarUOeklSNYNekgpn0EtS4Qx6SSpcEUHvWjeS1Fqtg945N5JUrdZBL0mqZtBLUuEMekkqnEEvSYUrIugTTruRpFZqHfQudSNJ1Wod9JKkaga9JBXOoJekwhn0klS4IoLetW4kqbVaB72zbiSpWq2DXpJUzaCXpMIZ9JJUOINekgpXRNA76UaSWqt10IffMSVJlSqDPiKWRcRDEfFURDwZEZ/I5Qsj4v6IeDb/XJDLIyJujYjhiHgsIi7udCckSa2184r+EPC3KaULgNXA9RFxAXAj8EBKaRXwQN4HuAJYlf+tA26b8FZLktpWGfQppW0ppYfz9h7gaWAAWAPckavdAVyVt9cA30oNPwbmR8SSiW64JKk94xqjj4gVwEXARmBxSmlbPvQSsDhvDwAvNN1say47/r7WRcRQRAyNjo6Ot93HSK6BIEkttR30ETEH+D7wyZTS7uZjqZG040rblNL6lNJgSmmwv79/PDdtatNJ3UySTittBX1ETKUR8t9OKd2di7cfGZLJP3fk8hFgWdPNl+YySVIXtDPrJoDbgadTSjc3HdoArM3ba4F7msqvzbNvVgO7moZ4JEmTrK+NOu8E/hx4PCIezWX/AHwO+G5EXAc8B3w4H7sXuBIYBvYBH53IBkuSxqcy6FNKP4KWf5l0+Rj1E3D9KbZLkjRBav2XsUc450aSWisi6CVJrRn0klQ4g16SCmfQS1LhDHpJKlwRQe9SN5LUWq2DPlzsRpIq1TroJUnVDHpJKpxBL0mFM+glqXC1Dvq9rx8C4I1Dh7vcEknqXbUO+gWzpgJw2PmVktRSrYN+xtQpALzuK3pJaqnWQT+9rxH0Dt1IUmv1Dvqpjea//KvXu9wSSepdtQ76aVMazT/Dv5CVpJZqHfQLZ08DYMsre7vcEknqXbUO+vl51s3BNx2jl6RWah30s6Y1vtv82R2/6nJLJKl31TroARbNnsbTL+7udjMkqWfVPujPmTeDzS87Ri9JrdQ+6N+xchEAv9i+p8stkaTeVPug/+DblwDw9f/d3OWWSFJvqn3QX7R8AQDf27T16CJnkqRfq33QA/zjmrcB8MEv/6jLLZGk3lNE0F976QrOP3sOm0f3ctkXHmLPgYPdbpIk9Ywigh7g3o//IcsXzuL5V/fxe5+9jy/e94yLnUkSEKkH1nIfHBxMQ0NDE3JfN9//C2594Nmj++/53bN
5/9vO4f0XnsPcGVMn5ByS1AsiYlNKabCyXmlBD7DvjUP8y4PD3P3wVrbv/vXKlufMncFFy+dz4cA8Llo2n7cNzGPO9D6mnOGiaJLq57QO+mbbdx9gw6Mv8pMtr/LY1p3HBP8R5/XPZmD+TM6aM53zz57DnOl9rDxrNnNnTmXZgpksmjO9I22TpFNh0Ldw8M3DPDGyi0ee38nW1/azY88Bntq2m9cPHmZk5/4T3nbx3OmcM3cG0Fg5c/nCWUePLVs46+hqmkDjiePMY58gBubPPPqtWJJ0qtoN+r4OnfwDwJeAKcA3Ukqf68R5TsbUKWdw0fIFR+ffN9v/xpvsP/gmW17Zy659B9m26wDbdx8AGn95u//gmwBseXkvz726j0de2ElKsGt/+7N8zpz+m7/y/jOnc+78mWPWnzltCr+1eM4J73PZgmOfZE7kwoF59I1jqGp63xTmzfKzDanOJjzoI2IK8BXgvcBW4KcRsSGl9NREn2uizZw2hZnTprQdmkfs2neQnfvfOLo/snM/o3uOHSJ6/pV9vLbvN58Qhkd/xd7XDx19Emm2bed+dux5nYd+vqPluQ8d7vw7sgWzpk7IO5HxPCFVmTV9CqvOPnNC7msiNJ6sZ0zqOd/aP4czZ3TktZpOwYy+KZzRY5/7deJRcgkwnFLaDBAR/w6sAXo+6E/WvFlTj3nV+5ZFsyft3Dt2H2DHnva+SvGpF3dz8HD7U04PvZl4YmQXE/EFXk+M7GbX/oPjevfTyos797PHv4JWj4qA8/tP/C682ccvX8UH335uB1vUmaAfAF5o2t8KvOP4ShGxDlgHsHz58g404/Rw9twZnD23vVeSFw7M63BrJs+BMd4Bdcure9/g+Vf3Teo5n3tlLzvHeIeo7npx535Gx/kd1vNmdn5otGvv+1JK64H10PgwtlvtUD310ofa586f2fIzlk5Zfd6iST2f6q0Tfxk7Aixr2l+ayyRJXdCJoP8psCoiVkbENOAaYEMHziNJasOED92klA5FxMeAH9CYXvnNlNKTE30eSVJ7OjJGn1K6F7i3E/ctSRqfYlavlCSNzaCXpMIZ9JJUOINekgrXE6tXRsQo8NxJ3vws4OUJbE4d2OfTg30+PZxKn9+SUuqvqtQTQX8qImKonWU6S2KfTw/2+fQwGX126EaSCmfQS1LhSgj69d1uQBfY59ODfT49dLzPtR+jlySdWAmv6CVJJ2DQS1Lhah30EfGBiHgmIoYj4sZut2e8ImJLRDweEY9GxFAuWxgR90fEs/nnglweEXFr7utjEXFx0/2szfWfjYi1TeV/kO9/ON920r/IMiK+GRE7IuKJprKO97HVObrY589GxEi+1o9GxJVNx27K7X8mIt7fVD7m4zsvAb4xl9+ZlwMnIqbn/eF8fMUkdZmIWBYRD0XEUxHxZER8IpcXe61P0Ofeu9YppVr+o7EE8i+B84BpwM+AC7rdrnH2YQtw1nFlXwBuzNs3Ap/P21cC/wUEsBrYmMsXApvzzwV5e0E+9pNcN/Jtr+hCHy8DLgaemMw+tjpHF/v8WeDvxqh7QX7sTgdW5sf0lBM9voHvAtfk7a8Cf5m3/wr4at6+BrhzEvu8BLg4b58J/CL3rdhrfYI+99y1ntT/9BP8S74U+EHT/k3ATd1u1zj7sIXfDPpngCVND6Rn8vbXgI8cXw/4CPC1pvKv5bIlwM+byo+pN8n9XMGxodfxPrY6Rxf73Oo//zGPWxrf43Bpq8d3DrmXgb5cfrTekdvm7b5cL7p0ze8B3ns6XOsx+txz17rOQzdjfQn5QJfacrIScF9EbIrGl6UDLE4pbcvbLwGL83ar/p6ofOsY5b1gMvrY6hzd9LE8TPHNpuGF8fZ5EbAzpXTouPJj7isf35XrT6o8jHARsJHT5Fof12fosWtd56AvwbtSShcDVwDXR8RlzQdT4+m66Pmvk9HHHvk93ga8Ffh9YBvwxa62pkMiYg7wfeCTKaXdzcdKvdZj9LnnrnW
dg772X0KeUhrJP3cA/wFcAmyPiCUA+eeOXL1Vf09UvnSM8l4wGX1sdY6uSCltTym9mVI6DHydxrWG8ff5FWB+RPQdV37MfeXj83L9SRERU2kE3rdTSnfn4qKv9Vh97sVrXeegr/WXkEfE7Ig488g28D7gCRp9ODLTYC2NcT9y+bV5tsJqYFd+u/oD4H0RsSC/RXwfjXG8bcDuiFidZydc23Rf3TYZfWx1jq44EkTZn9K41tBo5zV5FsVKYBWNDx3HfHznV6wPAVfn2x//+zvS56uBB3P9jsu//9uBp1NKNzcdKvZat+pzT17rbnxoMYEfflxJ45PuXwKf7nZ7xtn282h8uv4z4Mkj7acxzvYA8CzwP8DCXB7AV3JfHwcGm+7rL4Dh/O+jTeWD+UH2S+DLdOGDOeA7NN6+HqQxxnjdZPSx1Tm62Od/y316LP8nXdJU/9O5/c/QNDOq1eM7P3Z+kn8X3wOm5/IZeX84Hz9vEvv8LhpDJo8Bj+Z/V5Z8rU/Q55671i6BIEmFq/PQjSSpDQa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKtz/A1/NmoIeUlAfAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户点击新闻数量的分布" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "user_click_item_count = sorted(user_click_merge.groupby('user_id')['click_article_id'].count(), reverse=True)\n", - "plt.plot(user_click_item_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以根据用户的点击文章次数看出用户的活跃度" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAfX0lEQVR4nO3de3Rd5Xnn8e9zbrpLlmxZsWUbO7aAEAgGBCHNHSddQDIxawYIaWfiMm49bck0lMxqSGemWZm1shrWakug6WLFE5KYyQUICbUnQ7PiMSRtmoEggwMOkFg4NrbwRb7Jtm5Hl2f+2K/kYyOjI+tyrL1/n7W0zt7vfvc57wb5Oa/e/e7nNXdHRETiJVXqBoiIyNRTcBcRiSEFdxGRGFJwFxGJIQV3EZEYypS6AQDz5s3zpUuXlroZIiKzytatWw+5e+NYx86L4L506VLa2tpK3QwRkVnFzHaf7ZiGZUREYkjBXUQkhhTcRURiSMFdRCSGFNxFRGJIwV1EJIYU3EVEYmhWB/e2XUe450evoLTFIiKnm9XBfXtHFw/85FU6T/aXuikiIueVWR3cV8yvAaD9wMkSt0RE5Pwyy4N7NQDtnQruIiKFZnVwb6oto6YsQ/tBBXcRkUKzOribGcvnV7NDwzIiIqeZ1cEdoGV+tYZlRETOUFRwN7NPm9l2M/uVmd0ZyhrMbLOZ7Qiv9aHczOx+M2s3sxfM7MppbD8r5lfTeaKfrp6B6fwYEZFZZdzgbmaXAn8EXANcDnzUzFYAdwNb3L0F2BL2AW4AWsLPOuCBaWj3qJamkZuqJ6bzY0REZpVieu5vA55x9x53HwR+CvxbYDWwIdTZANwUtlcDD3nkaWCOmS2Y2mafsqIxmg6pcXcRkVOKCe7bgfea2VwzqwRuBBYDTe6+L9TZDzSF7WZgT8H5e0PZtGiur6A8m9KMGRGRAuMus+fuL5vZPcCPgW5gGzB0Rh03swnlADCzdUTDNixZsmQip54mnTLeOq+aHQruIiKjirqh6u4PuvtV7v4+4CjwG+DAyHBLeD0YqncQ9exHLAplZ77nendvdffWxsYx13ctWktTtXruIiIFip0tMz+8LiEab/8OsAlYE6qsATaG7U3AJ8OsmWuBroLhm2mxorGajmO9dPcPTufHiIjMGuMOywTfN7O5wABwh7sfM7MvAY+a2VpgN3BrqPsE0bh8O9AD3D7FbX6DkTQEOzu7uWxR3XR/nIjIea+o4O7u7x2j7DCwaoxyB+6YfNOKNzIdcsfBEwruIiLE4AlVgAvmVpFJmcbdRUSCW
AT3bDrF0nlVmjEjIhLEIrhDdFP1VQV3EREgRsG9pama3Ud66B8cGr+yiEjMxSa4r5hfzdCws+tQT6mbIiJScrEK7oBuqoqIEKPgvryxGrNoOqSISNLFJriXZ9Msrq9Uz11EhBgFd4iGZhTcRURiGNx3HupmcGi41E0RESmp2AX3/OAwe472lropIiIlFbvgDpoxIyISy+CuGTMiknSxCu615VmaasvUcxeRxItVcAdomV+j4C4iiRe74L5ifpRALEorLyKSTMUus/fnZvYrM9tuZt81s3IzW2Zmz5hZu5k9Yma5ULcs7LeH40un9QrOsGJ+Nd35IfZ19c3kx4qInFfGDe5m1gz8GdDq7pcCaeA24B7gXndfQbRo9tpwylrgaCi/N9SbMaduqmpoRkSSq9hhmQxQYWYZoBLYB1wHPBaObwBuCturwz7h+CozsylpbRFaNB1SRGT84O7uHcDfAK8RBfUuYCtwzN0HQ7W9QHPYbgb2hHMHQ/25Z76vma0zszYza+vs7JzsdYyaW11GfWWWdk2HFJEEK2ZYpp6oN74MWAhUAddP9oPdfb27t7p7a2Nj42Tf7jTKMSMiSVfMsMyHgN+6e6e7DwA/AN4NzAnDNACLgI6w3QEsBgjH64DDU9rqcayYX8MOzZgRkQQrJri/BlxrZpVh7HwV8BLwFHBzqLMG2Bi2N4V9wvEnfYaj7Ir51RzrGeBwd34mP1ZE5LxRzJj7M0Q3Rp8DXgznrAc+C9xlZu1EY+oPhlMeBOaG8ruAu6eh3W9qeWMVADs7u2f6o0VEzguZ8auAu38e+PwZxTuBa8ao2wfcMvmmnbt51WUAHOtRz11Ekil2T6gC1JRH31nH+wbHqSkiEk+xDO615VkATvQNlLglIiKlEcvgXh167ifUcxeRhIplcM+mU1Rk0xzvVc9dRJIplsEdoLYio567iCRWbIN7TXmW4xpzF5GEinFwV89dRJIrtsG9tjyr2TIiklixDe415RnNcxeRxIpxcFfPXUSSK7bBvbZCPXcRSa74BvfyLPnBYfoGhkrdFBGRGRfj4K6nVEUkuWIb3GuUX0ZEEizGwV2ZIUUkuWIb3Gsr1HMXkeQqZoHsi8xsW8HPcTO708wazGyzme0Ir/WhvpnZ/WbWbmYvmNmV038ZbzTac+9Vz11EkqeYZfZ+7e4r3X0lcBXQAzxOtHzeFndvAbZwajm9G4CW8LMOeGAa2j0ujbmLSJJNdFhmFfCqu+8GVgMbQvkG4KawvRp4yCNPA3PMbMFUNHYiNFtGRJJsosH9NuC7YbvJ3feF7f1AU9huBvYUnLM3lJ3GzNaZWZuZtXV2dk6wGeOrymUwQ5khRSSRig7uZpYDPgZ878xj7u6AT+SD3X29u7e6e2tjY+NETi1KKmVUlykzpIgk00R67jcAz7n7gbB/YGS4JbweDOUdwOKC8xaFshlXq5zuIpJQEwnun+DUkAzAJmBN2F4DbCwo/2SYNXMt0FUwfDOjasozmi0jIomUKaaSmVUBHwb+U0Hxl4BHzWwtsBu4NZQ/AdwItBPNrLl9ylo7QbUVygwpIslUVHB3925g7hllh4lmz5xZ14E7pqR1k1RbnuH1Y32lboaIyIyL7ROqoHVURSS5Yh3ca7WOqogkVKyD+8hqTNFIkYhIcsQ8uGcYdujOa8EOEUmWWAd3ZYYUkaSKdXBXZkgRSapYB/daZYYUkYSKdXCvUWZIEUmomAf3qOeuue4ikjSxDu61FVpHVUSSKd7BXWPuIpJQsQ7uZZkU2bRptoyIJE6sg7uZUVuuzJAikjyxDu4QcrprzF1EEiYBwV09dxFJnqKCu5nNMbPHzOwVM3vZzN5lZg1mttnMdoTX+lDXzOx+M2s3sxfM7MrpvYQ3V1uhzJAikjzF9tzvA37k7hcDlwMvA3cDW9y9BdgS9iFaa7Ul/KwDHpjSFk9QTVmW473quYtIsowb3M2sDngf8CCAu+fd/RiwGtgQqm0Abgrbq4GHPPI0M
GdkIe1SUM9dRJKomJ77MqAT+IaZPW9mXwtrqjYVLHy9H2gK283AnoLz94ay05jZOjNrM7O2zs7Oc7+CcWjMXUSSqJjgngGuBB5w9yuAbk4NwQCj66ZOaEUMd1/v7q3u3trY2DiRUyekpjxDd36IwaHhafsMEZHzTTHBfS+w192fCfuPEQX7AyPDLeH1YDjeASwuOH9RKCuJkadUT/ZraEZEkmPc4O7u+4E9ZnZRKFoFvARsAtaEsjXAxrC9CfhkmDVzLdBVMHwz45QZUkSSKFNkvf8MfNvMcsBO4HaiL4ZHzWwtsBu4NdR9ArgRaAd6Qt2SGckM2dU7cNqfEyIicVZUcHf3bUDrGIdWjVHXgTsm16ypM5IZUj13EUmS2D+hWquc7iKSQLEP7hpzF5Ekin1wV053EUmi2Af36tBzV053EUmS2Af3bDpFZS6tnruIJErsgztE4+4acxeRJElIcM9qtoyIJEoignuteu4ikjCJCO7KDCkiSZOQ4K51VEUkWRIR3Gsr1HMXkWRJRHCvKc9onruIJEoigntteZb80DB9A0OlboqIyIxISHBXfhkRSZZEBPcaZYYUkYRJRHBXTncRSZqigruZ7TKzF81sm5m1hbIGM9tsZjvCa30oNzO738zazewFM7tyOi+gGDXKDCkiCTORnvsH3X2lu4+syHQ3sMXdW4AtYR/gBqAl/KwDHpiqxp6rGmWGFJGEmcywzGpgQ9jeANxUUP6QR54G5pjZgkl8zqQpp7uIJE2xwd2BH5vZVjNbF8qa3H1f2N4PNIXtZmBPwbl7Q9lpzGydmbWZWVtnZ+c5NL14oz13BXcRSYiiFsgG3uPuHWY2H9hsZq8UHnR3NzOfyAe7+3pgPUBra+uEzp2oqlwGM91QFZHkKKrn7u4d4fUg8DhwDXBgZLglvB4M1TuAxQWnLwplJZNKGTVlygwpIskxbnA3syozqxnZBn4X2A5sAtaEamuAjWF7E/DJMGvmWqCrYPimZGrKsxzv1bCMiCRDMcMyTcDjZjZS/zvu/iMzexZ41MzWAruBW0P9J4AbgXagB7h9ylt9DmorssoMKSKJMW5wd/edwOVjlB8GVo1R7sAdU9K6KRQttaeeu4gkQyKeUIUov4x67iKSFAkK7srpLiLJkZjgXqN1VEUkQRIU3KOee3RLQEQk3hIT3GsrMgw7dOe1YIeIxF9igvtoTnfNdReRBEhQcFdOdxFJjsQEd2WGFJEkSUxwV2ZIEUmSxAT32oqRnruGZUQk/hIT3E/13BXcRST+EhPcazVbRkQSJDHBvSyTIpdOaVhGRBIhMcHdzJQZUkQSIzHBHaJxd425i0gSFB3czSxtZs+b2Q/D/jIze8bM2s3sETPLhfKysN8eji+dprZPWG2FMkOKSDJMpOf+aeDlgv17gHvdfQVwFFgbytcCR0P5vaHeeaGmPKMbqiKSCEUFdzNbBHwE+FrYN+A64LFQZQNwU9heHfYJx1eF+iVXU5bVDVURSYRie+5fBv4CGA77c4Fj7j4SKfcCzWG7GdgDEI53hfolV1uhnO4ikgzjBncz+yhw0N23TuUHm9k6M2szs7bOzs6pfOuzqinPKv2AiCRCMT33dwMfM7NdwMNEwzH3AXPMbGSB7UVAR9juABYDhON1wOEz39Td17t7q7u3NjY2TuoiilVbnqUnP8Tg0PD4lUVEZrFxg7u7f87dF7n7UuA24El3/33gKeDmUG0NsDFsbwr7hONP+nmy/NFICoKT/RqaEZF4m8w8988Cd5lZO9GY+oOh/EFgbii/C7h7ck2cOqP5ZXoV3EUk3jLjVznF3X8C/CRs7wSuGaNOH3DLFLRtyo1khtS4u4jEXeKeUAWl/RWR+EtUcB/NDKmeu4jEXCKDu3ruIhJ3iQruI8Myx3ryJW6JiMj0SlRwr6vIcsHcSr75810amhGRWEtUcE+ljHs/vpJ9XX38t8e3c55MvxcRmXKJCu4AVy6p588/1MKmX77OD57rGP8EE
ZFZKHHBHeBPPrCCdy5r4K82bmfXoe5SN0dEZMolMrinw/BMJp3izx5+nvygcs2ISLwkMrgDLJxTwT3/7jJe2NvF327+dambIyIypRIb3AGuv3QBn7hmCV/96U5+tuNQqZsjIjJlEh3cAf7qo5ewYn41dz26jcMn+0vdHBGRKZH44F6RS3P/bVdwrGeALz7x8vgniIjMAokP7gCXLKzl41cv5v+8sI8uLaAtIjGg4B7c0rqI/sFhfvjC66VuiojIpCm4B5c113FRUw3fa9tb6qaIiExaMQtkl5vZL8zsl2b2KzP7QihfZmbPmFm7mT1iZrlQXhb228PxpdN8DVPCzLj5qkVs23OM9oMnSt0cEZFJKabn3g9c5+6XAyuB683sWuAe4F53XwEcBdaG+muBo6H83lBvVrjpimbSKeN7W9V7F5HZrZgFst3dT4bdbPhx4DrgsVC+AbgpbK8O+4Tjq8zMpqrB06mxpowPXtTI4891MDikp1ZFZPYqaszdzNJmtg04CGwGXgWOufvIqhd7geaw3QzsAQjHu4gW0D7zPdeZWZuZtXV2dk7qIqbSzVct5uCJfv5FDzWJyCxWVHB39yF3XwksIloU++LJfrC7r3f3VndvbWxsnOzbTZnrLp5PQ1WOxzQ0IyKz2IRmy7j7MeAp4F3AHDPLhEOLgJH8uR3AYoBwvA44PBWNnQm5TIrVKxey+aUDWrFJRGatYmbLNJrZnLBdAXwYeJkoyN8cqq0BNobtTWGfcPxJn2WrYtx81SLyQ8Ns+qXmvIvI7FRMz30B8JSZvQA8C2x29x8CnwXuMrN2ojH1B0P9B4G5ofwu4O6pb/b0evvCOi5ZUKs57yIya2XGq+DuLwBXjFG+k2j8/czyPuCWKWldCd3Suogv/O+XeGX/cS5+S22pmyMiMiF6QvUsVq9sJps2HlPvXURmIQX3s2ioyrHq4ib+cVsHA5rzLiKzjIL7m7j5qkUcOpnnJ78+f+bhi4gUQ8H9Tbz/okbmVZex4ee76BsYKnVzRESKpuD+JrLpFH/43mX8rP0QH/ybn/D9rXsZHp5VszpFJKEU3Mfxx+9fziPrrmV+TRmf+d4v+Tdf+Rk/b1dqAhE5vym4F+Gdb53L43/6bu67bSXHegb4va89w3/85rNs7+iiN6/hGhE5/9j58PBoa2urt7W1lboZRekbGOKh/7eLv3+ynRN9Ud608myKuVVl1Fdlqa/MUV+Zo6osQ0U2TUUuRUU2TXk2TUUuTUU2TWUu7BeULZlbSVkmXeKrE5HZxMy2unvrWMfGfYhJTleeTbPufcu5+arF/N+XDnCou5+j3XkOd+c52p3nSM8Auw/30JMfom9giJ78IMUM018wt5Jv/MHVvLWxevovQkRiTz33aebuDAw5vQND9IaA3xt++vJD9OSHONKd50s/egV3539+spXWpQ2lbraIzALquZeQmZHLGLlMirqK7FnrXbOsgdu/+Sy/97VnuPfWlXzkHQtmsJUiEje6oXqeWDqviu//ye/wjuY67vjOc3z1p69yPvxVJSKzk4L7eaShKse3/vCdfOSyBfz1P73Cf9+4Xcv9icg50bDMeaY8m+bvP3EFi+or+Oo/72TbnmNcOL+G+qocDVXRTJyGqhyNNWVc9JYaqsv0v1BE3kiR4TyUShmfu/FtLJtXxXd+8Rq/2HWEI915es6YU28Gy+ZVcVlzHZcurOPS5jre3lxLbfnZx/ZFJBk0W2YW6RsY4mhPniPdefYd6+Olfcd5saOL7R1d7OvqAyBl8KnrWrhzVQuplJW4xSIynSY1W8bMFgMPAU2AA+vd/T4zawAeAZYCu4Bb3f2omRlwH3Aj0AP8gbs/NxUXknTl2TQL6ipYUFfB2xfW8aFLmkaPHTrZz/aOLh5/voP7t+yg/eAJ/vaWlVTk9GCUSBIVc0N1EPiMu18CXAvcYWaXEC2ft8XdW4AtnFpO7wagJfysAx6Y8lbLG8yrLuMDF83nyx9fyX+98W380
/b93PLVn7Ovq7fUTROREhg3uLv7vpGet7ufIFocuxlYDWwI1TYAN4Xt1cBDHnkamGNmmrQ9Q8yMP3rfW/n6mqvZdaiHj33lX3n+taOlbpaIzLAJTYU0s6VE66k+AzS5+75waD/RsA1EgX9PwWl7Q9mZ77XOzNrMrK2zU4thTLUPXjyfH/zp71CeTfHx9U+zcVtHqZskIjOo6NkyZlYNfB+4092PR0PrEXd3M5vQnVl3Xw+sh+iG6kTOleJc2FTDxjvewx9/ayuffngb//BUOxW5DBXZFJW5zGhCs6baMi4NM24WN1RQ+P9WRGanooK7mWWJAvu33f0HofiAmS1w931h2OVgKO8AFhecviiUSQk0VOX41tp38pWn2vn1/uP0DgzTlx+i80T/aL6bgyf6GBiKvl/rKrJc2lzLpQvrWN5YPeaMG4OQ5TJ1WnbLymyGt9SVk8vo2TiRUitmtowBDwIvu/vfFRzaBKwBvhReNxaUf8rMHgbeCXQVDN9ICeQyKe768IVnPd4/OMRv9p/kxY4uXuzo4levd/GNf91F/hyejk2njCUNlSxvrGb5/CpWNFazfH41FzbpgSuRmVTMv7Z3A/8BeNHMtoWyvyQK6o+a2VpgN3BrOPYE0TTIdqKpkLdPZYNl6pVl0ly2qI7LFtWNluUHhzlwvG/M+sPu9A8O05s/PcPlif5B9hzpof3gSV7tPMlPf3Nw9C+CwgeuLmsOD1wtrKVGD1yJTAs9xCTTZnBomD1He2k/eJKXXj/O9tdPf+AKomGjiY7wl2fTlGdTo0NB5bk0FdkUFzbVcPXSBq68oF5/JUgiKOWvlEQmnWLZvCqWzaviw2M8cLW9o4v9Z/nr4GzcoW9g+FRe/PwQx3sHeP3YIJtfOsCwR0/pvn1hHa1L67lmaQPLGqtOW/WqIpsmk9Z9AYk39dwlNk72D/L8a0d59rdH+MWuIzz/2jH6B8e+b5BNG3MqcyxvrGLF/GqWN1aPvi6oK9eMIZkV1HOXRKguy/Delkbe29IIRPcNtr/exf6uvtH7A32ht98zMMShE/282nmSTdte53hYDxdOrYnbUJWLsnFWZqkPGTkrc+loGmnh2rjZNJn02F8GZZk0DSGjZ3lWqSBk5ii4S2zlMimuXFI/bj1359DJ/OiN4N2HuznSPcCR7n6O9Ayw61A3R7rznOwfHPe93kxlLj2asrmmPMNYfxwYRnk2mmJaGYaRyguGk0YXVy8oKzxekTt1LKuhp0RTcJfEMzMaa8porCnjXcvnnrVefnD49N5/wV8DQ2dZBb13IFoj98joAurR9sm+sb8oht050j182lq7vfmhsw4vnYuUMfpFUPicQkU2+vKpr8oxd+Svlqos9ZU5cmf5osikU+H8VHhALnzhZFMa2ioxBXeRIuUyqXHXwp0uw8NO3+DQqemn4bUnLLo+8mXQkz+1EHt+aOwvnOFhP32h9vB+3fkhdh46yZHdeY72DJz1C6sY6ZRRX5k9bYGZ+qocb6kt58ol9VyxZA5VmtE0rfRfV2QWSKWMylyGytzM/JMdHnaO9w1Ef3H05Bkc44vCgcEhP+1Zh5HtE30DHOkeGP1rZcfBk6Pb7lHwf/vCWq5e2sDVSxu46oJ65lXn1NufQgruIvIGqVQ0m2hOZW5K3/dE3wDPvXZsdEbT/3p6Nw/+7LcA5NIp6sMw0MhN6MKe/5n7NeXRMJAWpRmbgruIzJia8izvv7CR918YzWjqHxxie0cXz792jEMnT92XONqd56XXj3O4O09X78CbvmdZJnX6DeaRm9GF9xQK7jGMdaN6JE9SZeF9g1yKTGpqbkqP5GMqy6Rm7MtIwV1ESqYsk+aqCxq46oKGs9YZHBrmWG8Y4gk/h7vzdPcPjt5zKLwP0Zsfom9wiO7+QTpP9J92P6J/YPicciZNpfJs6rSZTnd+6EI+dvnCKf8cBXcROa9l0inmVZcxr7psSt5vcGiYvsFhevKD9OWH6RkYpG9gePRGdG/Bs
xDDk7ipXKgwH1PfwKmZVr0DQ9RXTs8NegV3EUmUTDpFdToV+/xDespBRCSGFNxFRGJIwV1EJIbGDe5m9nUzO2hm2wvKGsxss5ntCK/1odzM7H4zazezF8zsyulsvIiIjK2Ynvs3gevPKLsb2OLuLcCWsA9wA9ASftYBD0xNM0VEZCLGDe7u/s/AkTOKVwMbwvYG4KaC8oc88jQwJyyeLSIiM+hcx9ybCha93g+MLLPTDOwpqLc3lL2Bma0zszYza+vs7DzHZoiIyFgmfUPVo6WcJjzT393Xu3uru7c2NjZOthkiIlLgXGfxHzCzBe6+Lwy7HAzlHcDignqLQtmb2rp16yEz232ObZkHHDrHc2ezpF43JPfadd3JUsx1X3C2A+ca3DcBa4AvhdeNBeWfMrOHgXcCXQXDN2fl7ufcdTeztrOtIRhnSb1uSO6167qTZbLXPW5wN7PvAh8A5pnZXuDzREH9UTNbC+wGbg3VnwBuBNqBHuD2c22YiIicu3GDu7t/4iyHVo1R14E7JtsoERGZnDg8obq+1A0okaReNyT32nXdyTKp67aosy0iInESh567iIicQcFdRCSGZnVwN7PrzezXIVHZ3eOfMTtNJHlbnJjZYjN7ysxeMrNfmdmnQ3msr93Mys3sF2b2y3DdXwjly8zsmfD7/oiZTe3q1ecJM0ub2fNm9sOwH/vrNrNdZvaimW0zs7ZQNqnf81kb3M0sDfwDUbKyS4BPmNklpW3VtPkmxSdvi5NB4DPufglwLXBH+H8c92vvB65z98uBlcD1ZnYtcA9wr7uvAI4Ca0vXxGn1aeDlgv2kXPcH3X1lwdz2Sf2ez9rgDlwDtLv7TnfPAw8TJS6LnQkmb4sNd9/n7s+F7RNE/+Cbifm1h8R7J8NuNvw4cB3wWCiP3XUDmNki4CPA18K+kYDrPotJ/Z7P5uBedJKymDpb8rZYMrOlwBXAMyTg2sPQxDai1B6bgVeBY+4+GKrE9ff9y8BfAMNhfy7JuG4HfmxmW81sXSib1O95vFeITQh3dzOL7ZxWM6sGvg/c6e7Ho85cJK7X7u5DwEozmwM8Dlxc2hZNPzP7KHDQ3bea2QdK3JyZ9h537zCz+cBmM3ul8OC5/J7P5p77OSUpi5EDI7nyz0jeFitmliUK7N929x+E4kRcO4C7HwOeAt5FtD7CSIcsjr/v7wY+Zma7iIZZrwPuI/7Xjbt3hNeDRF/m1zDJ3/PZHNyfBVrCnfQccBtR4rKkGEneBqcnb4uNMN76IPCyu/9dwaFYX7uZNYYeO2ZWAXyY6H7DU8DNoVrsrtvdP+fui9x9KdG/5yfd/feJ+XWbWZWZ1YxsA78LbGeSv+ez+glVM7uRaIwuDXzd3b9Y2hZNj8LkbcABouRt/wg8CiwhJG9z9zNvus5qZvYe4F+AFzk1BvuXROPusb12M3sH0Q20NFEH7FF3/x9m9laiHm0D8Dzw7929v3QtnT5hWOa/uPtH437d4foeD7sZ4Dvu/kUzm8skfs9ndXAXEZGxzeZhGREROQsFdxGRGFJwFxGJIQV3EZEYUnAXEYkhBXcRkRhScBcRiaH/DyTuWfBm9aPjAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#点击次数在前50的用户\n", - "plt.plot(user_click_item_count[:50])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "点击次数排前50的用户的点击次数都在100次以上。思路:我们可以定义点击次数大于等于100次的用户为活跃用户,这是一种简单的处理思路, 判断用户活跃度,更加全面的是再结合上点击时间,后面我们会基于点击次数和点击时间两个方面来判断用户活跃度。" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXEAAAD4CAYAAAAaT9YAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAARV0lEQVR4nO3dfYxc1X3G8eexd7ExEDAYjEPYrkOQFZekKUxT2lKgJQHHSuWGphJIDaRYWaUBKUitKJQqRWlTNYnaSFWippvaMonASZsUGSVtg4tSXKkYYqd+WQqYlwLxSzAvcYgIBYxP/5i7u6Nl786dmTt7z5n7/UjWzt6Z3fmdnfGjM+ece65DCAIApGlB1QUAALpHiANAwghxAEgYIQ4ACSPEASBhQ/P5ZMuWLQujo6Pz+ZQAkLydO3c+H0I4fbb75jXER0dHtWPHjvl8SgBInu2n8+5jOAUAEkaIA0DCCHEASBghDgAJI8QBIGFtQ9z2RtuHbU/Mct8f2g62l/WnPADAXIr0xDdJWjPzoO2zJV0u6ZmSawIAFNR2nXgIYZvt0Vnu+oKkmyRtKbuome59+Fnt/uGRfj/Nm5yy5Dh99FdHtWCB5/25AaCIrk72sb1O0oEQwm577oCzPSZpTJJGRka6eTrdt+85fW177lr3vpjcZv2SVafrnNNPnNfnBoCiOg5x20sk/YmaQylthRDGJY1LUqPR6OoKFJ9ed54+ve68bn60a9/ec1A33PnfeuMYF80AEK9uVqecI2mlpN22n5L0Nkk/sH1mmYXFggsfAYhZxz3xEMJeSWdMfp8FeSOE8HyJdVXOYhwcQPyKLDHcLOl+Sats77e9vv9lVW9yqD+IrjiAeBVZnXJ1m/tHS6smIpP9cIZTAMSMMzZzTPXECXEAESPE22A4BUDMCPFcTGwCiB8hnoPhFAApIMRz0A8HkAJCvA164gBiRojnaLcnDADEgBDPMbVOnNUpACJGiOdgYhNACgjxHNOn3QNAvAhxAEgYIZ5jchfDwHgKgIgR4nkYTgGQAEI8B7sYAkgBIZ5jep04KQ4gXoQ4ACSMEM/BcAqAFBDiOVgnDiAFhHiO6SWGFRcCAHMgxHOw/xWAFBDibXCyD4CYEeI5WGAIIAWEeB52MQSQAEI8x9TEJn1xABEjxHMwsQkgBYR4O3TEAUSMEM/BxCaAFBDiOSY3wGJiE0DMCPEcjIkDSAEhnoOr3QNIASHeBsMpAGLWNsRtb7R92PZEy7E/t73H9i7b99h+a3/LnH/sYgggBUV64pskrZlx7PMhhHeHEN4j6duSPlVyXRFgUBxA/IbaPSCEsM326IxjL7V8e4IGsMM6tKAZ4us3fV8LIp/lPGHRQm25/iK
NnLak6lIAzLO2IZ7H9mckXSPpJ5J+Y47HjUkak6SRkZFun27erX7rW3TTmlX66f8drbqUOR088oq27DqoA0deIcSBGuo6xEMIt0q61fYtkm6Q9Gc5jxuXNC5JjUYjmR778MIF+sSl76i6jLa2P/mCtuw6yCoaoKbKWJ1yh6TfKeH3oAtTAz1kOFBLXYW47XNbvl0n6ZFyykG3yHCgntoOp9jeLOlSScts71dz2GSt7VWSjkl6WtLH+1kk8rE9AFBvRVanXD3L4Q19qAVdiHzhDIA+44zNxLE9AFBvhHjizGXkgFojxAcEGQ7UEyGevMmJTWIcqCNCPHFMbAL1RognjsvIAfVGiCfO7JkL1BohPiBYYgjUEyGeuKnhFDIcqCVCPHFMbAL1RognzmLvFKDOCPHEMa8J1BshPiA42QeoJ0J8QBDhQD0R4oljYhOoN0I8cUxsAvVGiCfOXGQTqDVCfEDQEwfqiRBPHEsMgXojxBNnMbMJ1BkhnjguzwbUGyGeOC6UDNQbIT4g6IkD9USIJ46JTaDeCPHkMbEJ1BkhnrjpiU364kAdEeKJox8O1BshPiDoiAP1RIgnbvJq9ywxBOqJEE8cwylAvbUNcdsbbR+2PdFy7PO2H7G9x/Zdtk/pa5XIxRmbQL0V6YlvkrRmxrGtks4LIbxb0j5Jt5RcFwpiP3Gg3obaPSCEsM326Ixj97R8u13Sh0uuCx26b99zOvLK61WX0bMVJy/W2netqLoMIBltQ7yA6yR9I+9O22OSxiRpZGSkhKdDq5OXDOvERUO6e/dB3b37YNXllGLvbZfrpMXDVZcBJKGnELd9q6Sjku7Ie0wIYVzSuCQ1Gg0+9Jfs5OOHteNP36dXjx6rupSe3fnAM/rsvz2io2/wNgGK6jrEbX9U0gclXRY4XbBSi4cXavHwwqrL6Nnxw80pGt5MQHFdhbjtNZJuknRJCOFn5ZaEurJZMAl0qsgSw82S7pe0yvZ+2+slfVHSSZK22t5l+8t9rhM1wD4wQOeKrE65epbDG/pQC2pu+gIXAIrijE1Eh444UBwhjniwDwzQMUIc0WBaE+gcIY5omEFxoGOEOKJDhgPFEeKIBpt5AZ0jxBGNqXXi9MWBwghxRIOJTaBzhDiiwQUugM4R4ogOGQ4UR4gjGtMTm8Q4UBQhjngwnAJ0jBBHNJjYBDpHiANAwghxRGPyohAMpwDFEeKIxvTWKaQ4UBQhjmiwThzoHCEOAAkjxBGN6b1TABRFiCManOwDdI4QRzToiQOdI8QRHTriQHGEOAAkjBBHNMxFNoGOEeKIxlSEk+FAYYQ4osHEJtA5QhzRoScOFEeIIxpmM1qgY4Q4osHV7oHOEeKIBhObQOcIcUSDXQyBzrUNcdsbbR+2PdFy7HdtP2T7mO1Gf0tE3TCcAhRXpCe+SdKaGccmJF0paVvZBaHOmNgEOjXU7gEhhG22R2cce1hqPcMO6N2C7O30e//wgIYWDv5I37lnnKg7P3Zh1WUgcW1DvFe2xySNSdLIyEi/nw4J++WVp+m6X1upV15/o+pS+m7vgSP6rydeqLoMDIC+h3gIYVzSuCQ1Gg0GO5Hr5CXD+tRvra66jHnxha37NHHgparLwAAY/M+sQISmV+LQr0FvCHGgQmQ4elVkieFmSfdLWmV7v+31tj9ke7+kX5H0Hdvf7XehwCCZuhRdxXUgfUVWp1ydc9ddJdcC1AYLu1AWhlOACkxvMUBfHL0hxIEKsHc6ykKIAxWiI45eEeJABSbPdmafGPSKEAeAhBHiQAXYdhdlIcSBCnApOpSFEAcqRE8cvSLEgQpwPVGUhRAHKsBgCspCiAMVYGITZSHEgQqwARbKQogDFWLvFPSKEAcqwN4pKAshDgAJI8SBCkztnUJXHD0ixIEKTC0xJMTRI0IcqBAn+6BXhDhQAdaJoyyEOFABzthEWQhxoALTF4UAekOIAxWYHk4hxtEbQhyoEBGOXhHiQAUmx8TpiKNXhDhQBTO
1iXIQ4kAFpnriDKigR4Q4UAFPpzjQE0IcqBAZjl4R4kAFpi4KQYqjR4Q4UAHmNVGWtiFue6Ptw7YnWo6danur7ceyr0v7WyYwWJjYRFmK9MQ3SVoz49jNku4NIZwr6d7sewAFsQEWyjLU7gEhhG22R2ccXifp0uz27ZL+Q9Ifl1kYUAf/sveQli45ruoyorRoeIHev3q5Fg0trLqUqLUN8RzLQwiHsts/krQ874G2xySNSdLIyEiXTwcMlmUnLpIk/cV3Hq64krj9/Ucu0BU/f2bVZUSt2xCfEkIItnM/FIYQxiWNS1Kj0eDDIyDpsncu1/ZbLtNrR49VXUqUnn7xZX1kw4N6lb9PW92G+LO2V4QQDtleIelwmUUBdXDmyYurLiFar73RDG92eWyv2yWGd0u6Nrt9raQt5ZQDACzB7ESRJYabJd0vaZXt/bbXS/orSe+3/Zik92XfA0ApyPDiiqxOuTrnrstKrgUAJLVc+YjRlLY4YxNAtDgZqj1CHEB0uGhGcYQ4gOgwsVkcIQ4gOuzyWBwhDiA6U3vLVFtGEghxANHiZJ/2CHEA0SLC2yPEAUSHic3iCHEA0TGD4oUR4gCiw5WPiiPEAUSLec32CHEA0WE0pThCHEB0zD6GhRHiAKLDhaSLI8QBRIeJzeIIcQDRoifeHiEOID5MbBZGiAOIDhObxRHiAKJjrgpRGCEOIDrTE5tohxAHEC064u0R4gCiM321e1K8HUIcQHSY1iyOEAcQHfZOKY4QBxAdLpRcHCEOIFpkeHuEOID4TG2ARYy3Q4gDiA7X2CyOEAcQHTK8OEIcQHSm14lXXEgCCHEA0WI/8fZ6CnHbn7Q9Yfsh2zeWVBOAmmP/q+K6DnHb50n6mKT3SvoFSR+0/Y6yCgNQX0xsFjfUw8++U9IDIYSfSZLt+yRdKelzZRQGoL4mT/b5yn8+qW/u3F9xNeX4yyvfpV8aPbX039tLiE9I+ozt0yS9ImmtpB0zH2R7TNKYJI2MjPTwdADqYvHwAn38knP0zIsvV11KaY4fXtiX3+teFtPbXi/pE5JelvSQpFdDCDfmPb7RaIQdO96U8wCAOdjeGUJozHZfTxObIYQNIYQLQggXS/qxpH29/D4AQGd6GU6R7TNCCIdtj6g5Hn5hOWUBAIroKcQlfSsbE39d0vUhhCO9lwQAKKqnEA8h/HpZhQAAOscZmwCQMEIcABJGiANAwghxAEhYTyf7dPxk9nOSnu7yx5dJer7EclJAm+uBNtdDL23+uRDC6bPdMa8h3gvbO/LOWBpUtLkeaHM99KvNDKcAQMIIcQBIWEohPl51ARWgzfVAm+uhL21OZkwcAPBmKfXEAQAzEOIAkLAkQtz2GtuP2n7c9s1V19ML20/Z3mt7l+0d2bFTbW+1/Vj2dWl23Lb/Nmv3Htvnt/yea7PHP2b72qraMxvbG20ftj3Rcqy0Ntq+IPsbPp79bOVXZMxp8222D2Sv9S7ba1vuuyWr/1HbV7Qcn/W9bnul7Qey49+wfdz8tW52ts+2/T3b/5NdLP2T2fGBfa3naHN1r3UIIep/khZKekLS2yUdJ2m3pNVV19VDe56StGzGsc9Jujm7fbOkz2a310r6VzUv/n2hmtc0laRTJT2ZfV2a3V5addta2nOxpPMlTfSjjZIezB7r7Gc/EGmbb5P0R7M8dnX2Pl4kaWX2/l4413td0j9Kuiq7/WVJfxBBm1dIOj+7fZKaF4VZPciv9Rxtruy1TqEn/l5Jj4cQngwhvCbp65LWVVxT2dZJuj27fbuk3245/tXQtF3SKbZXSLpC0tYQwoshhB9L2ippzTzXnCuEsE3SizMOl9LG7L63hBC2h+a7/Kstv6syOW3Os07S10MIr4YQ/lfS42q+z2d9r2e9z9+U9M3s51v/fpUJIRwKIfwgu/1TSQ9LOksD/FrP0eY8fX+tUwjxsyT9sOX7/Zr7jxa7IOke2zvdvIi0JC0PIRzKbv9I0vLsdl7bU/yblNX
Gs7LbM4/H6oZs6GDj5LCCOm/zaZKOhBCOzjgeDdujkn5R0gOqyWs9o81SRa91CiE+aC4KIZwv6QOSrrd9ceudWY9joNd91qGNmb+TdI6k90g6JOmvK62mT2yfKOlbkm4MIbzUet+gvtaztLmy1zqFED8g6eyW79+WHUtSCOFA9vWwpLvU/Fj1bPbRUdnXw9nD89qe4t+krDYeyG7PPB6dEMKzIYQ3QgjHJH1Fzdda6rzNL6g59DA043jlbA+rGWZ3hBD+OTs80K/1bG2u8rVOIcS/L+ncbMb2OElXSbq74pq6YvsE2ydN3pZ0uaQJNdszOSN/raQt2e27JV2TzepfKOkn2cfU70q63PbS7GPb5dmxmJXSxuy+l2xfmI0fXtPyu6IyGWSZD6n5WkvNNl9le5HtlZLOVXMCb9b3etab/Z6kD2c/3/r3q0z2998g6eEQwt+03DWwr3Vemyt9rauc6S36T81Z7X1qzubeWnU9PbTj7WrOQu+W9NBkW9QcB7tX0mOS/l3SqdlxS/pS1u69khotv+s6NSdJHpf0+1W3bUY7N6v5kfJ1Ncf01pfZRkmN7D/JE5K+qOzM4wjb/LWsTXuy/8wrWh5/a1b/o2pZcZH3Xs/eOw9mf4t/krQogjZfpOZQyR5Ju7J/awf5tZ6jzZW91px2DwAJS2E4BQCQgxAHgIQR4gCQMEIcABJGiANAwghxAEgYIQ4ACft/AbwTsfQSxAYAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#点击次数排名在[25000:50000]之间\n", - "plt.plot(user_click_item_count[25000:50000])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以看出点击次数小于等于两次的用户非常的多,这些用户可以认为是非活跃用户" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 新闻点击次数分析" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:42:14.526476Z", - "start_time": "2020-11-13T15:42:14.463642Z" - } - }, - "outputs": [], - "source": [ - "item_click_count = sorted(user_click_merge.groupby('click_article_id')['user_id'].count(), reverse=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:42:16.198000Z", - "start_time": "2020-11-13T15:42:16.044455Z" - } - }, - "outputs": [ + }, { - "data": { - "text/plain": [ - "[]" + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:40:04.296033Z", + "start_time": "2020-11-13T15:40:03.980868Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAASw0lEQVR4nO3da4yc1X3H8e8fr+/GVxZj1nZsgnshVCl0RYyS8iLkBm1qKpGIqCpWimSpJU1SWjXQvEjUV0nUQEMTkTghFamilIRQYVW0gQJRlRdxsgbCNYSNa8CLsZeLL/EFbHz6Yo6dsbPjZ9be2Znn+PuRrH2e85yZ55x9xr+ZOXP2TKSUkCSV64xuN0CS1FkGvSQVzqCXpMIZ9JJUOINekgrX1+0GAJx11llpxYoV3W6GJNXKpk2bXk4p9VfV64mgX7FiBUNDQ91uhiTVSkQ81049h24kqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSpcrYP+p1te5eb7nuGNQ4e73RRJ6lm1DvqHn3uNWx8c5tBhg16SWql10EuSqhn0klQ4g16SCmfQS1Lhigh6v99cklqrddBHdLsFktT7ah30kqRqBr0kFc6gl6TCGfSSVLgigt5JN5LUWq2DPnDajSRVqXXQS5KqGfSSVDiDXpIKV0TQJ9dAkKSWah30LoEgSdVqHfSSpGoGvSQVzqCXpMIZ9JJUuCKC3jk3ktRaEUEvSWrNoJekwhn0klS4toI+Iv4mIp6MiCci4jsRMSMiVkbExogYjog7I2Jarjs97w/n4ys62gNJ0glVBn1EDAAfBwZTShcCU4BrgM8Dt6SUzgdeA67LN7kOeC2X35LrSZK6pN2hmz5gZkT0AbOAbcC7gbvy8TuAq/L2mrxPPn55RGcXK3CpG0lqrTLoU0ojwD8Bz9MI+F3AJmBnSulQrrYVGMjbA8AL+baHcv1Fx99vRKyLiKGIGBodHT2pxnf4+UOSitDO0M0CGq/SVwLnArOBD5zqiVNK61NKgymlwf7+/lO9O0lSC+0M3bwH+L+U0mhK6SBwN/BOYH4eygFYCozk7RFgGUA+Pg94ZUJbLUlqWztB/zywOiJm5bH2y4GngIeAq3OdtcA9eXtD3icffzC5YLwkdU07Y/QbaXyo+jDweL7NeuBTwA0RMUxjDP72fJPbgUW5/Abgxg60W5LUpr7qKpBS+gzwmeOKNwOXjFH3APChU2/aOPh+QZJaqvVfxjrnRpKq1TroJUnVDHpJKpxBL0mFM+glqXBFBH1y2o0ktVTroHepG0mqVuuglyRVM+glqXAGvSQVzqCXpMIVEfSujSlJrdU66J10I0nVah30kqRqBr0kFc6gl6TCFRH0fhYrSa3VOujDNRAkqVKtg16SVM2gl6TCGfSSVDiDXpIKV0TQJ9dAkKSWah30TrqRpGq1DnpJUjWDXpIKZ9BLUuEMekkqXBFB75wbSWqt1kHvpBtJqlbroJckVTPoJalwBr0kFa6toI+I+RFxV0T8PCKejohLI2JhRNwfEc/mnwty3YiIWyNiOCIei4iLO9sFSdKJtPuK/kvAf6eUfgd4O/A0cCPwQEppFfBA3ge4AliV/60DbpvQFo/BpW4kqbXKoI+IecBlwO0AKaU3Uko7gTXAHbnaHcBVeXsN8K3U8GNgfkQsmeB2H2lcR+5WkkrSziv6lcAo8K8R8UhEfCMiZgOLU0rbcp2XgMV5ewB4oen2W3OZJKkL2gn6PuBi4LaU0kXAXn49TANAaqwTPK4BlIhYFxFDETE0Ojo6nptKksahnaDfCmxNKW3M+3fRCP7tR4Zk8s8d+fgIsKzp9ktz2TFSSutTSoMppcH+/v6Tbb8kqUJl0KeUXgJeiIjfzkWXA08BG4C1uWwtcE/e3gBcm2ffrAZ2NQ3xSJImWV+b9f4a+HZETAM2Ax+l8STx3Yi4DngO+HCuey9wJTAM7Mt1Oyq52o0ktdRW0KeUHgUGxzh0+Rh1E3D9qTWrPc6
5kaRq/mWsJBXOoJekwhn0klQ4g16SCldG0DvpRpJaqnXQu9SNJFWrddBLkqoZ9JJUOINekgpXRND7WawktVbroA8XQZCkSrUOeklSNYNekgpn0EtS4Qx6SSpcEUGfnHYjSS3VOuhdAkGSqtU66CVJ1Qx6SSqcQS9JhTPoJalwRQR9crUbSWqp1kHvpBtJqlbroJckVTPoJalwBr0kFc6gl6TCFRH0rnUjSa3VOuhd60aSqtU66CVJ1Qx6SSqcQS9JhTPoJalwRQS9k24kqbW2gz4ipkTEIxHxn3l/ZURsjIjhiLgzIqbl8ul5fzgfX9GhthOudiNJlcbziv4TwNNN+58HbkkpnQ+8BlyXy68DXsvlt+R6kqQuaSvoI2Ip8EfAN/J+AO8G7spV7gCuyttr8j75+OW5viSpC9p9Rf/PwN8Dh/P+ImBnSulQ3t8KDOTtAeAFgHx8V65/jIhYFxFDETE0Ojp6cq2XJFWqDPqI+GNgR0pp00SeOKW0PqU0mFIa7O/vn8i7liQ16WujzjuBP4mIK4EZwFzgS8D8iOjLr9qXAiO5/giwDNgaEX3APOCVCW95k+RiN5LUUuUr+pTSTSmlpSmlFcA1wIMppT8DHgKuztXWAvfk7Q15n3z8wdSpJHbkX5Iqnco8+k8BN0TEMI0x+Ntz+e3Aolx+A3DjqTVRknQq2hm6OSql9EPgh3l7M3DJGHUOAB+agLZJkiZAEX8ZK0lqrYig97NYSWqt1kHvZ7GSVK3WQS9JqmbQS1LhDHpJKpxBL0mFM+glqXC1DnpXP5akarUOeklSNYNekgpn0EtS4Qx6SSpcEUHvWjeS1Fqtg945N5JUrdZBL0mqZtBLUuEMekkqnEEvSYUrIugTTruRpFZqHfQudSNJ1Wod9JKkaga9JBXOoJekwhn0klS4IoLetW4kqbVaB72zbiSpWq2DXpJUzaCXpMIZ9JJUOINekgpXRNA76UaSWqt10IffMSVJlSqDPiKWRcRDEfFURDwZEZ/I5Qsj4v6IeDb/XJDLIyJujYjhiHgsIi7udCckSa2184r+EPC3KaULgNXA9RFxAXAj8EBKaRXwQN4HuAJYlf+tA26b8FZLktpWGfQppW0ppYfz9h7gaWAAWAPckavdAVyVt9cA30oNPwbmR8SSiW64JKk94xqjj4gVwEXARmBxSmlbPvQSsDhvDwAvNN1say47/r7WRcRQRAyNjo6Ot93HSK6BIEkttR30ETEH+D7wyZTS7uZjqZG040rblNL6lNJgSmmwv79/PDdtatNJ3UySTittBX1ETKUR8t9OKd2di7cfGZLJP3fk8hFgWdPNl+YySVIXtDPrJoDbgadTSjc3HdoArM3ba4F7msqvzbNvVgO7moZ4JEmTrK+NOu8E/hx4PCIezWX/AHwO+G5EXAc8B3w4H7sXuBIYBvYBH53IBkuSxqcy6FNKP4KWf5l0+Rj1E3D9KbZLkjRBav2XsUc450aSWisi6CVJrRn0klQ4g16SCmfQS1LhDHpJKlwRQe9SN5LUWq2DPlzsRpIq1TroJUnVDHpJKpxBL0mFM+glqXC1Dvq9rx8C4I1Dh7vcEknqXbUO+gWzpgJw2PmVktRSrYN+xtQpALzuK3pJaqnWQT+9rxH0Dt1IUmv1Dvqpjea//KvXu9wSSepdtQ76aVMazT/Dv5CVpJZqHfQLZ08DYMsre7vcEknqXbUO+vl51s3BNx2jl6RWah30s6Y1vtv82R2/6nJLJKl31TroARbNnsbTL+7udjMkqWfVPujPmTeDzS87Ri9JrdQ+6N+xchEAv9i+p8stkaTeVPug/+DblwDw9f/d3OWWSFJvqn3QX7R8AQDf27T16CJnkqRfq33QA/zjmrcB8MEv/6jLLZGk3lNE0F976QrOP3sOm0f3ctkXHmLPgYPdbpIk9Ywigh7g3o//IcsXzuL5V/fxe5+9jy/e94yLnUkSEKkH1nIfHBxMQ0NDE3JfN9//C2594Nmj++/53bN
5/9vO4f0XnsPcGVMn5ByS1AsiYlNKabCyXmlBD7DvjUP8y4PD3P3wVrbv/vXKlufMncFFy+dz4cA8Llo2n7cNzGPO9D6mnOGiaJLq57QO+mbbdx9gw6Mv8pMtr/LY1p3HBP8R5/XPZmD+TM6aM53zz57DnOl9rDxrNnNnTmXZgpksmjO9I22TpFNh0Ldw8M3DPDGyi0ee38nW1/azY88Bntq2m9cPHmZk5/4T3nbx3OmcM3cG0Fg5c/nCWUePLVs46+hqmkDjiePMY58gBubPPPqtWJJ0qtoN+r4OnfwDwJeAKcA3Ukqf68R5TsbUKWdw0fIFR+ffN9v/xpvsP/gmW17Zy659B9m26wDbdx8AGn95u//gmwBseXkvz726j0de2ElKsGt/+7N8zpz+m7/y/jOnc+78mWPWnzltCr+1eM4J73PZgmOfZE7kwoF59I1jqGp63xTmzfKzDanOJjzoI2IK8BXgvcBW4KcRsSGl9NREn2uizZw2hZnTprQdmkfs2neQnfvfOLo/snM/o3uOHSJ6/pV9vLbvN58Qhkd/xd7XDx19Emm2bed+dux5nYd+vqPluQ8d7vw7sgWzpk7IO5HxPCFVmTV9CqvOPnNC7msiNJ6sZ0zqOd/aP4czZ3TktZpOwYy+KZzRY5/7deJRcgkwnFLaDBAR/w6sAXo+6E/WvFlTj3nV+5ZFsyft3Dt2H2DHnva+SvGpF3dz8HD7U04PvZl4YmQXE/EFXk+M7GbX/oPjevfTyos797PHv4JWj4qA8/tP/C682ccvX8UH335uB1vUmaAfAF5o2t8KvOP4ShGxDlgHsHz58g404/Rw9twZnD23vVeSFw7M63BrJs+BMd4Bdcure9/g+Vf3Teo5n3tlLzvHeIeo7npx535Gx/kd1vNmdn5otGvv+1JK64H10PgwtlvtUD310ofa586f2fIzlk5Zfd6iST2f6q0Tfxk7Aixr2l+ayyRJXdCJoP8psCoiVkbENOAaYEMHziNJasOED92klA5FxMeAH9CYXvnNlNKTE30eSVJ7OjJGn1K6F7i3E/ctSRqfYlavlCSNzaCXpMIZ9JJUOINekgrXE6tXRsQo8NxJ3vws4OUJbE4d2OfTg30+PZxKn9+SUuqvqtQTQX8qImKonWU6S2KfTw/2+fQwGX126EaSCmfQS1LhSgj69d1uQBfY59ODfT49dLzPtR+jlySdWAmv6CVJJ2DQS1Lhah30EfGBiHgmIoYj4sZut2e8ImJLRDweEY9GxFAuWxgR90fEs/nnglweEXFr7utjEXFx0/2szfWfjYi1TeV/kO9/ON920r/IMiK+GRE7IuKJprKO97HVObrY589GxEi+1o9GxJVNx27K7X8mIt7fVD7m4zsvAb4xl9+ZlwMnIqbn/eF8fMUkdZmIWBYRD0XEUxHxZER8IpcXe61P0Ofeu9YppVr+o7EE8i+B84BpwM+AC7rdrnH2YQtw1nFlXwBuzNs3Ap/P21cC/wUEsBrYmMsXApvzzwV5e0E+9pNcN/Jtr+hCHy8DLgaemMw+tjpHF/v8WeDvxqh7QX7sTgdW5sf0lBM9voHvAtfk7a8Cf5m3/wr4at6+BrhzEvu8BLg4b58J/CL3rdhrfYI+99y1ntT/9BP8S74U+EHT/k3ATd1u1zj7sIXfDPpngCVND6Rn8vbXgI8cXw/4CPC1pvKv5bIlwM+byo+pN8n9XMGxodfxPrY6Rxf73Oo//zGPWxrf43Bpq8d3DrmXgb5cfrTekdvm7b5cL7p0ze8B3ns6XOsx+txz17rOQzdjfQn5QJfacrIScF9EbIrGl6UDLE4pbcvbLwGL83ar/p6ofOsY5b1gMvrY6hzd9LE8TPHNpuGF8fZ5EbAzpXTouPJj7isf35XrT6o8jHARsJHT5Fof12fosWtd56AvwbtSShcDVwDXR8RlzQdT4+m66Pmvk9HHHvk93ga8Ffh9YBvwxa62pkMiYg7wfeCTKaXdzcdKvdZj9LnnrnW
dg772X0KeUhrJP3cA/wFcAmyPiCUA+eeOXL1Vf09UvnSM8l4wGX1sdY6uSCltTym9mVI6DHydxrWG8ff5FWB+RPQdV37MfeXj83L9SRERU2kE3rdTSnfn4qKv9Vh97sVrXeegr/WXkEfE7Ig488g28D7gCRp9ODLTYC2NcT9y+bV5tsJqYFd+u/oD4H0RsSC/RXwfjXG8bcDuiFidZydc23Rf3TYZfWx1jq44EkTZn9K41tBo5zV5FsVKYBWNDx3HfHznV6wPAVfn2x//+zvS56uBB3P9jsu//9uBp1NKNzcdKvZat+pzT17rbnxoMYEfflxJ45PuXwKf7nZ7xtn282h8uv4z4Mkj7acxzvYA8CzwP8DCXB7AV3JfHwcGm+7rL4Dh/O+jTeWD+UH2S+DLdOGDOeA7NN6+HqQxxnjdZPSx1Tm62Od/y316LP8nXdJU/9O5/c/QNDOq1eM7P3Z+kn8X3wOm5/IZeX84Hz9vEvv8LhpDJo8Bj+Z/V5Z8rU/Q55671i6BIEmFq/PQjSSpDQa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKtz/A1/NmoIeUlAfAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "user_click_item_count = sorted(user_click_merge.groupby('user_id')['click_article_id'].count(), reverse=True)\n", + "plt.plot(user_click_item_count)" ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以根据用户的点击文章次数看出用户的活跃度" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(item_click_count)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(item_click_count[:100])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以看出点击次数最多的前100篇新闻,点击次数大于1000次" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(item_click_count[:20])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "点击次数最多的前20篇新闻,点击次数大于2500。思路:可以定义这些新闻为热门新闻, 这个也是简单的处理方式,后面我们也是根据点击次数和时间进行文章热度的一个划分。" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(item_click_count[3500:])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以发现很多新闻只被点击过一两次。思路:可以定义这些新闻是冷门新闻" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 新闻共现频次:两篇新闻连续出现的次数" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
count
count433597.000000
mean3.184139
std18.851753
min1.000000
25%1.000000
50%1.000000
75%2.000000
max2202.000000
\n", - "
" + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " count\n", - "count 433597.000000\n", - "mean 3.184139\n", - "std 18.851753\n", - "min 1.000000\n", - "25% 1.000000\n", - "50% 1.000000\n", - "75% 2.000000\n", - "max 2202.000000" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tmp = user_click_merge.sort_values('click_timestamp')\n", - "tmp['next_item'] = tmp.groupby(['user_id'])['click_article_id'].transform(lambda x:x.shift(-1))\n", - "union_item = tmp.groupby(['click_article_id','next_item'])['click_timestamp'].agg({'count'}).reset_index().sort_values('count', ascending=False)\n", - "union_item[['count']].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "由统计数据可以看出,平均共现次数3.18,最高为2202。\n", - "\n", - "说明用户看的新闻,相关性是比较强的。" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#画个图直观地看一看\n", - "x = union_item['click_article_id']\n", - "y = union_item['count']\n", - "plt.scatter(x, y)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD4CAYAAADvsV2wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAATdElEQVR4nO3df6xkZX3H8fe37Aq2EPmxN7pd9nKhmhgxuOB1hUANISHlV+CPYrqkRUTNNoopVlsrmiCamIhNlSpG3ApF1Cr4syuFWFqwahuW7OKy/BK9KgYQ3AVkkarU1W//mLMwd5hzZ+7MmTt3znm/ksmeOeeZOd89s/dzn32ec85EZiJJqr/fG3cBkqSlYeBLUkMY+JLUEAa+JDWEgS9JDbFiXDtetWpVzszMjGv3kjSRtm3b9mhmTg3y2rEF/szMDFu3bh3X7iVpIkXETwZ9rUM6ktQQBr4kNYSBL0kNYeBLUkMY+JLUEH0HfkTsExHfjYjru2zbNyKujYi5iNgSETOVVilJGtpievgXAveWbHsj8PPMfDHwEeDSYQuTJFWrr/PwI+JQ4HTgA8DbuzQ5C7ikWP4ScHlERI7g3sv3PfIL/m3HT0u3n/CSKdYffnDVu5WkidfvhVeXAe8EDijZvgZ4ACAz90TEbuAQ4NH2RhGxEdgIMD09PUC5MLfzKT52y1zXbZlw648f57q/PG6g95akOusZ+BFxBrAzM7dFxInD7CwzNwGbAGZnZwfq/Z9+1GpOP+r0rtv+/FO38vRvfjd4gZJUY/2M4R8PnBkR9wNfAE6KiM92tHkIWAsQESuAFwCPVVhn3/z+LknqrmfgZ+ZFmXloZs4AG4CbM/MvOpptBs4rls8u2pi9krSMDHzztIh4P7A1MzcDVwKfiYg54HFavxiWXBD4e0aSultU4GfmN4FvFssXt63/NfDaKguTJFWrVlfaRoy7AklavmoV+OCkrSSVqV3gS5K6q13gO2crSd3VLvAlSd3VKvAjwjF8SSpRq8CXJJWrVeB7VqYklatV4APO2kpSiVoFvhdeSVK5WgU+eOGVJJWpXeBLkrqrVeAHDuFLUplaBb4kqVytAj+ctZWkUrUKfIB02laSuqpV4Nu/l6RytQp8cNJWksrULvAlSd3VKvAj7OFLUplaBb4kqVzNAt9pW0kqU7PA9146klSmVoHvdVeSVK5n4EfEfhFxW0TcERF3R8T7urR5fUTsiojtxeNNoym3t3TWVpK6WtFHm6eBkzLzqYhYCXwnIm7MzFs72l2bmW+tvkRJUhV6Bn62usxPFU9XFo9l2Y12REeSyvU1hh8R+0TEdmAncFNmbunS7E8jYkdEfCki1pa8z8aI2BoRW3ft2jV41ZKkResr8DPzt5m5DjgUWB8RL+9o8nVgJjOPAm4CPl3yPpsyczYzZ6empoYouzsnbSWp3KLO0snMJ4BbgFM61j+WmU8XTz8FvLKS6gbgnK0kddfPWTpTEXFgsfx84GTgex1tVrc9PRO4t8Ia+xaO
4ktSqX7O0lkNfDoi9qH1C+K6zLw+It4PbM3MzcBfRcSZwB7gceD1oyq4F++HL0nd9XOWzg7g6C7rL25bvgi4qNrSJElVqt2Vto7hS1J3tQp8SVK5WgW+p2VKUrlaBT4s00uAJWkZqFXge1qmJJWrVeCDd8uUpDK1C3xJUnf1CvxwDF+SytQr8CVJpWoV+E7ZSlK5WgU+4JiOJJWoVeCHV15JUqlaBT7YwZekMrULfElSd7UK/MALrySpTK0CX5JUrlaB75ytJJWrVeCDk7aSVKZWgW8HX5LK1Srwwa84lKQytQt8SVJ3tQr8iCAdxZekrmoV+JKkcrUKfCdtJalcz8CPiP0i4raIuCMi7o6I93Vps29EXBsRcxGxJSJmRlJtH5y0laTu+unhPw2clJmvANYBp0TEsR1t3gj8PDNfDHwEuLTSKvtlF1+SSq3o1SBbN6d5qni6snh09qPPAi4plr8EXB4RkWO4sc3j//t//O0X7xj49RvWr+WVhx1cYUWStDz0DHyAiNgH2Aa8GPh4Zm7paLIGeAAgM/dExG7gEODRjvfZCGwEmJ6eHq7yLtbPHMytP3yM/557tHfjLh558tckGPiSaqmvwM/M3wLrIuJA4KsR8fLMvGuxO8vMTcAmgNnZ2cp7/xvWT7Nh/eC/SI7/4M0VViNJy8uiztLJzCeAW4BTOjY9BKwFiIgVwAuAxyqob8k56Suprvo5S2eq6NkTEc8HTga+19FsM3BesXw2cPM4xu8lSeX6GdJZDXy6GMf/PeC6zLw+It4PbM3MzcCVwGciYg54HNgwsopHzCt1JdVVP2fp7ACO7rL+4rblXwOvrbY0SVKVanWl7bD8AhVJdWbgd3JER1JNGfht7OFLqjMDv4MdfEl1ZeBLUkMY+G2CwMsHJNWVgS9JDWHgt3HSVlKdGfgdHNCRVFcGfhs7+JLqzMDv4JytpLoy8CWpIQz8NhHhGL6k2jLwJakhDPw2TtpKqjMDv4NX2kqqKwO/nV18STVm4Hewfy+prgx8SWoIA79NgF18SbVl4EtSQxj4bcLbZUqqMQO/QzqmI6mmDPw29u8l1VnPwI+ItRFxS0TcExF3R8SFXdqcGBG7I2J78bh4NOWOntddSaqrFX202QO8IzNvj4gDgG0RcVNm3tPR7tuZeUb1JUqSqtCzh5+ZD2fm7cXyL4B7gTWjLmwcIuzhS6qvRY3hR8QMcDSwpcvm4yLijoi4MSKOLHn9xojYGhFbd+3atfhqJUkD6zvwI2J/4MvA2zLzyY7NtwOHZeYrgI8BX+v2Hpm5KTNnM3N2ampqwJJHJ5y2lVRjfQV+RKykFfafy8yvdG7PzCcz86li+QZgZUSsqrTSJeJpmZLqqp+zdAK4Erg3Mz9c0uZFRTsiYn3xvo9VWehS8LorSXXWz1k6xwPnAndGxPZi3buBaYDMvAI4G3hzROwBfgVsyAm9sfxkVi1JvfUM/Mz8Dj2uScrMy4HLqypKklQ9r7TtYAdfUl0Z+JLUEAZ+G++WKanODPwOTtpKqisDv439e0l1ZuA/h118SfVk4EtSQxj4bbxbpqQ6M/AlqSEM/DYRjuBLqi8DX5IawsBv4/3wJdWZgd9hQm/yKUk9GfiS1BAGfhsnbSXVmYEvSQ1h4LcJvPBKUn0Z+JLUEAZ+O++HL6nGDPwOjuhIqisDX5IawsBv05q0tY8vqZ4MfElqCAO/jXO2kuqsZ+BHxNqIuCUi7omIuyPiwi5tIiI+GhFzEbEjIo4ZTbmSpEGt6KPNHuAdmXl7RBwAbIuImzLznrY2pwIvKR6vBj5R/DlR7OBLqrOegZ+ZDwMPF8u/iIh7gTVAe+CfBVyTrRnPWyPiwIhYXbx2otz50G7OvXLLuMt4jvOPn+Gkl75w3GVImmD99PCfEREzwNFAZyKuAR5oe/5gsW5e4EfERmAjwPT09CJLHb0zjvpDvr7jpzz19J5xlzLP3Q89ydQB+xr4kobSd+BHxP7Al4G3ZeaTg+ws
MzcBmwBmZ2eX3fmPbzjhcN5wwuHjLuM5Trj05nGXIKkG+jpLJyJW0gr7z2XmV7o0eQhY2/b80GKdqrLsfj1KmjT9nKUTwJXAvZn54ZJmm4HXFWfrHAvsnsTxe0mqs36GdI4HzgXujIjtxbp3A9MAmXkFcANwGjAH/BI4v/JKG8wvZpFUhX7O0vkOPc5YLM7OuaCqoiRJ1fNK2wkQXiEgqQIG/oTwpm6ShmXgTwDv8SOpCgb+hLB/L2lYBr4kNYSBPwFaX8wy7iokTToDX5IawsCfABHhGL6koRn4ktQQBv4E8KxMSVUw8CeEF15JGpaBL0kNYeBPAu+WKakCBr4kNYSBPwEC7OJLGpqBL0kNYeBPgPB2mZIqYOBPiHRMR9KQDHxJaggDfwJ4t0xJVTDwJakhDPwJEGEPX9LwDHxJaggDfwKE98uUVIGegR8RV0XEzoi4q2T7iRGxOyK2F4+Lqy9TnpYpaVgr+mhzNXA5cM0Cbb6dmWdUUpEkaSR69vAz81vA40tQi0o4aSupClWN4R8XEXdExI0RcWRZo4jYGBFbI2Lrrl27Ktq1JKkfVQT+7cBhmfkK4GPA18oaZuamzJzNzNmpqakKdt0cdvAlDWvowM/MJzPzqWL5BmBlRKwaujJJUqWGDvyIeFEUt3OMiPXFez427PvqWd4tU1IVep6lExGfB04EVkXEg8B7gZUAmXkFcDbw5ojYA/wK2JB+43blPKKShtUz8DPznB7bL6d12qYkaRnzStsJ0BrQsYsvaTgGviQ1hIE/AbzwSlIVDHxJaggDfwJ4VqakKhj4E8IRHUnDMvAngPfDl1QFA39CeC2bpGEZ+JLUEAb+BIhwDF/S8Ax8SWoIA38COGUrqQoG/oRwzlbSsAz8SeCVV5IqYOBPCDv4koZl4EtSQxj4EyDwwitJwzPwJakhDPwJ4JytpCoY+JLUEAb+BLCDL6kKBv6EcM5W0rAMfElqCAN/AkQE6aVXkobUM/Aj4qqI2BkRd5Vsj4j4aETMRcSOiDim+jIlScPqp4d/NXDKAttPBV5SPDYCnxi+LLVz0lZSFVb0apCZ34qImQWanAVck61LQW+NiAMjYnVmPlxVkYLbf/IEJ3/4v8ZdhqQK/Nmr1vKmPz5iyffbM/D7sAZ4oO35g8W65wR+RGyk9b8ApqenK9h1M5x73GF84+5Hxl2GpIqs2n/fsey3isDvW2ZuAjYBzM7OOgvZp7PWreGsdWvGXYakCVfFWToPAWvbnh9arJMkLSNVBP5m4HXF2TrHArsdv5ek5afnkE5EfB44EVgVEQ8C7wVWAmTmFcANwGnAHPBL4PxRFStJGlw/Z+mc02N7AhdUVpEkaSS80laSGsLAl6SGMPAlqSEMfElqiBjXl2NHxC7gJwO+fBXwaIXlVMnaFm+51gXWNojlWhfUo7bDMnNqkB2MLfCHERFbM3N23HV0Y22Lt1zrAmsbxHKtC6zNIR1JaggDX5IaYlIDf9O4C1iAtS3ecq0LrG0Qy7UuaHhtEzmGL0lavEnt4UuSFsnAl6SmyMyJetD6ft37aN2d810j3M/9wJ3AdmBrse5g4CbgB8WfBxXrA/hoUdMO4Ji29zmvaP8D4Ly29a8s3n+ueG0sUMtVwE7grrZ1I6+lbB991HYJre9E2F48TmvbdlGxn/uAP+n1uQKHA1uK9dcCzyvW71s8nyu2z3TUtRa4BbgHuBu4cLkctwVqG+txA/YDbgPuKOp63xDvVUm9fdR2NfDjtmO2bkw/B/sA3wWuXy7HrGuWjCowR/EoDuoPgSOA5xUf/stGtK/7gVUd6z6094AD7wIuLZZPA24s/pEdC2xp+4fyo+LPg4rlvQFzW9E2iteeukAtrwGOYX6ojryWsn30UdslwN90afuy4jPbt/jH+sPiMy39XIHrgA3F8hXAm4vltwBXFMsbgGs79rWa4occOAD4frH/sR+3BWob63Er/h77F8sraYXJsYt9ryrr7aO2q4Gzuxyzpf45
eDvwLzwb+GM/Zl2zZBRhOaoHcBzwjbbnFwEXjWhf9/PcwL8PWN32Q3tfsfxJ4JzOdsA5wCfb1n+yWLca+F7b+nntSuqZYX6ojryWsn30UdsldA+ueZ8X8I3iM+36uRY/eI8CKzo//72vLZZXFO0W+l/SvwInL6fj1qW2ZXPcgN8Hbgdevdj3qrLekuPVXtvVdA/8Jfs8aX3L338CJwHXD3L8R33M9j4mbQy/7AvTRyGBf4+IbcWXrwO8MJ/9Nq9HgBf2qGuh9Q92Wb8YS1FL2T768daI2BERV0XEQQPWdgjwRGbu6VLbM68ptu8u2j9HRMwAR9PqFS6r49ZRG4z5uEXEPhGxndYw3U20epeLfa8q620/VvNqy8y9x+wDxTH7SETs/Xbwpfw8LwPeCfyueD7I8R/JMes0aYG/lE7IzGOAU4ELIuI17Ruz9Ws1x1JZh6WoZZH7+ATwR8A64GHgH0ZUVk8RsT/wZeBtmflk+7ZxH7cutY39uGXmbzNzHa1e63rgpUtdQ5nO2iLi5bR6uy8FXkVrmObvRlzDvM8zIs4AdmbmtlHutyqTFvhL9oXpmflQ8edO4Ku0/vH/LCJWAxR/7uxR10LrD+2yfjGWopayfSwoM39W/HD+DvgnWsdukNoeAw6MiBUd6+e9V7H9BUX7Z0TESlqB+rnM/EqPv9OSHrdutS2X41bU8gStieXjBnivKut9jrbaTsnMh7PlaeCfGfyYDfp5Hg+cGRH3A1+gNazzjwv8fcZyzJ7Ra8xnOT1ojXn9iNakxt4JjCNHsJ8/AA5oW/4fWjPlf8/8yZsPFcunM3+C6LZi/cG0ziA4qHj8GDi42NY5QXRaj5pmmD9OPvJayvbRR22r25b/GvhCsXwk8yemfkRrUqr0cwW+yPyJqbcUyxcwf/Lruo6aArgGuKxj/diP2wK1jfW4AVPAgcXy84FvA2cs9r2qrLeP2la3HdPLgA+O8efgRJ6dtB37MetaY9VhOeoHrdn379MaW3zPiPZxRHFg954C9p5i/SG0Jmd+APxH2z+UAD5e1HQnMNv2Xm+gddrUHHB+2/pZ4K7iNZez8ITj52n9F/83tMbq3rgUtZTto4/aPlPsewewmflB9p5iP/fRdmZS2edafBa3FTV/Edi3WL9f8Xyu2H5ER10n0Pqv9w7aTnNcDsdtgdrGetyAo2idWrij+HtdPMR7VVJvH7XdXByzu4DP8uyZPEv6c1C0O5FnA3/sx6zbw1srSFJDTNoYviRpQAa+JDWEgS9JDWHgS1JDGPiS1BAGviQ1hIEvSQ3x/4tppPoWqYdUAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(union_item['count'].values[40000:])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "大概有75000个pair至少共现一次" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 新闻文章信息" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#不同类型的新闻出现的次数\n", - "plt.plot(user_click_merge['category_id'].value_counts().values)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#出现次数比较少的新闻类型, 有些新闻类型,基本上就出现过几次\n", - "plt.plot(user_click_merge['category_id'].value_counts().values[150:])" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 1.630633e+06\n", - "mean 2.043012e+02\n", - "std 6.382198e+01\n", - "min 0.000000e+00\n", - "25% 1.720000e+02\n", - "50% 1.970000e+02\n", - "75% 2.290000e+02\n", - "max 6.690000e+03\n", - "Name: words_count, dtype: float64" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#新闻字数的描述性统计\n", - "user_click_merge['words_count'].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(user_click_merge['words_count'].values)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户点击的新闻类型的偏好\n", - "\n", - "此特征可以用于度量用户的兴趣是否广泛。" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUlUlEQVR4nO3dfZBc1Xnn8e8zM3pBaCwkNBJCAiQbsKwEy8CYwoEihTG2wXGwY5dDditWHGrZsp3EjpNdw9q1dtXGu3YqNvFWsomJIaESyoGAMSQFwRhjezeJJY+MAAsEEuJFEnoZAXpBGAlJZ//oK2UkzfRtzfR097nz/VRNze3Tt/s+Z27rp9unT98bKSUkSfnrancBkqTmMNAlqSIMdEmqCANdkirCQJekiuhp5cZmz56dFi5c2MpNSlL2Vq5cuT2l1Fe2XksDfeHChQwMDLRyk5KUvYh4rpH1HHKRpIow0CWpIgx0SaoIA12SKsJAl6SKMNAlqSIMdEmqiCwC/a6HN/J3P25oGqYkTVhZBPo9q17g9oEN7S5DkjpaFoEuSSpnoEtSRWQT6F4pT5LqyyLQI6LdJUhSx8si0CVJ5Qx0SaqIbAI94SC6JNWTRaA7gi5J5bIIdElSOQNdkioim0B3Hrok1ZdFoDsNXZLKZRHokqRy2QS6Qy6SVF8mge6YiySVySTQJUllDHRJqohsAt0hdEmqL4tAd9qiJJXLItAlSeUMdEmqiGwCPTkRXZLqyiLQHUKXpHJZBLokqZyBLkkVYaBLUkVkEejOQ5ekclkEuiSpXEOBHhG/HxGrI+JnEfGtiJgaEYsiYnlErIuI2yJi8ngXK0kaWWmgR8R84PeA/pTSLwLdwNXAV4AbUkpnAi8D14xnoU5Dl6T6Gh1y6QFOiIgeYBqwGXgncEdx/y3AB5peXSGciS5JpUoDPaW0CfgT4HlqQb4TWAnsSCntL1bbCMwf7vERcW1EDETEwODgYHOqliQdo5Ehl5nAVcAi4FTgROC9jW4gpXRjSqk/pdTf19c36kIlSfU1MuTyLuCZlNJgSul14NvARcBJxRAMwAJg0zjVCEDyjOiSVFcjgf48cGFETIuIAC4DHgceAj5crLMMuHt8SnQeuiQ1opEx9OXUPvz8KfBY8Zgbgc8Cn4mIdcDJwE3jWKckqURP+SqQUvoC8IWjmtcDFzS9IknSqGTzTVHnoUtSfVkEumPoklQui0CXJJUz0CWpIrIJdIfQJam+LALdc7lIUrksAl2SVC6bQE/OW5SkuvIIdEdcJKlUHoEuSSploEtSRWQT6I6gS1J9WQS6Q+iSVC6LQJcklTPQJaki8gl0B9Elqa4sAj08f64klcoi0CVJ5Qx0SaqIbALdIXRJqi+LQHcEXZLKZRHokqRyBrokVUQ2ge750CWpviwC3WnoklQui0CXJJUz0CWpIrIJdEfQJam+
LALdIXRJKpdFoEuSyhnoklQR2QS609Alqb4sAt3zoUtSuYYCPSJOiog7ImJNRDwREe+IiFkR8UBErC1+zxzvYiVJI2v0CP3rwD+nlBYDS4EngOuAB1NKZwEPFrclSW1SGugRMQO4BLgJIKW0L6W0A7gKuKVY7RbgA+NTYk1yJrok1dXIEfoiYBD464h4OCK+GREnAnNTSpuLdbYAc4d7cERcGxEDETEwODg4qiIdQZekco0Eeg9wHvAXKaVzgT0cNbySaqdCHPYQOqV0Y0qpP6XU39fXN9Z6JUkjaCTQNwIbU0rLi9t3UAv4rRExD6D4vW18Sqxx2qIk1Vca6CmlLcCGiHhz0XQZ8DhwD7CsaFsG3D0uFYJjLpLUgJ4G1/td4NaImAysBz5G7T+D2yPiGuA54CPjU6IkqRENBXpKaRXQP8xdlzW1GknSqGXxTVFwDF2SymQR6OEguiSVyiLQJUnlDHRJqggDXZIqIotA9+y5klQui0CXJJUz0CWpIrIJ9OREdEmqK4tAdwhdksplEeiSpHIGuiRVRDaB7gi6JNWXRaA7D12SymUR6JKkcga6JFVENoHuNHRJqi+LQPd86JJULotAlySVM9AlqSKyCfTkTHRJqiuLQHceuiSVyyLQJUnlsgl0py1KUn1ZBLpDLpJULotAlySVM9AlqSKyCXSH0CWpvkwC3UF0SSqTSaBLkspkE+hOW5Sk+rII9NcPHGT7K3vbXYYkdbQsAv3nrx+gd2pPu8uQpI7WcKBHRHdEPBwR/1TcXhQRyyNiXUTcFhGTx6vIOb1TnOYiSSWO5wj9U8ATQ25/BbghpXQm8DJwTTMLG6o7ggMOoktSXQ0FekQsAN4HfLO4HcA7gTuKVW4BPjAO9QHQ3RUcOGigS1I9jR6h/ynwX4GDxe2TgR0ppf3F7Y3A/OEeGBHXRsRARAwMDg6Orsiu4KBH6JJUV2mgR8SvANtSSitHs4GU0o0ppf6UUn9fX99onqI25OIRuiTV1cjUkYuAX42IK4GpwBuArwMnRURPcZS+ANg0XkV2BZjnklRf6RF6Sun6lNKClNJC4Grg+yml/wg8BHy4WG0ZcPe4FdlV++r/QVNdkkY0lnnonwU+ExHrqI2p39Scko7VXZwQ3ZkukjSy4/q2TkrpB8APiuX1wAXNL+lYh47QDxxMTOpuxRYlKT9ZfFN0589fB2Dv/oMla0rSxJVFoJ86YyqAM10kqY4sAr2nu1bm/oMeoUvSSPII9GIMff8Bj9AlaSR5BPqhI3QDXZJGlEegHzpCd8hFkkaURaDvO1AL8hf37GtzJZLUubII9FNnnAD4TVFJqieLQJ86qVbmoSN1SdKxsgj0ScWHovv8YpEkjSiLQO/prn0o+tyLr7a5EknqXFkE+uzpUwCYMimLciWpLbJIyCk9DrlIUpksAn2ygS5JpfII9OJD0Uc27mhvIZLUwbII9ENf/T8020WSdKxsEnLJvDfw1NZX2l2GJHWsbAJ9z779nDjZyxVJ0kiyCfSz5/by6Kad7S5DkjpWNoH+2usHiHYXIUkdLJtAP/f0mezdf9ATdEnSCLIJ9JRqQf7Czp+3uRJJ6kzZBPpb5r0BgMHde9tciSR1pmwCfcYJkwBYs2V3myuRpM6UTaAvPqUXgE0vO+QiScPJJtB7p9aO0Fc8+1KbK5GkzpRNoE/u6WLxKb28+Ipj6JI0nGwCHeCkaZN4enAPe/bub3cpktRxsgr0S87uA+ClPfvaXIkkdZ6sAv3MvukA3PaTDW2uRJI6T1aB/stvrh2hv+KQiyQdI6tAn9LTTV/vFP7mX5/l1X2GuiQNlVWgA1z0ppMBvzEqSUcrDfSIOC0iHoqIxyNidUR8qmifFREPRMTa4vfM8S8XrjhnHgB/8t2nWrE5ScpGI0fo+4E/SCktAS4EPhkRS4DrgAdTSmcBDxa3x92Fb6wdoT+7fU8rNidJ2SgN9JTS5pTST4vl3cATwHzgKuCWYrVbgA+MU41HmHHC
JN6/9FQe27STex/b3IpNSlIWjmsMPSIWAucCy4G5KaVDiboFmDvCY66NiIGIGBgcHBxLrYf92rnzAfju6i1NeT5JqoKGAz0ipgN3Ap9OKe0ael+qnax82CtPpJRuTCn1p5T6+/r6xlTsIZcunsPiU3r5zqoXWPGM53aRJGgw0CNiErUwvzWl9O2ieWtEzCvunwdsG58Sh/f+pacCcNfDm1q5WUnqWI3McgngJuCJlNLXhtx1D7CsWF4G3N388kb2yUvPZOHJ01i+/kV+8GRL/y+RpI7UyBH6RcBvAu+MiFXFz5XAl4HLI2It8K7idktddOZsNrz8Kv/noadbvWlJ6jg9ZSuklP4fECPcfVlzyzk+X/rgOWzbvZdHNuzgtp88z0f6T6P2hkKSJp7svil6tCXz3sC23Xv57J2P8cLO19pdjiS1TfaB/vuXn82f/YdzAbhz5UbWbNlV8ghJqqbsAx3gjFknAvC1B57iD25/pM3VSFJ7VCLQz1kwg4HPv4srfvEUtu56jR89NcimHV5MWtLEUolAB5g9fQoLZ5/I9lf28dGbV/Cfbhlod0mS1FKVCXSAT112Fnd+/Je4fMlctu56jSe37Gb94CvUvsgqSdVWqUCfOqmb88+Yydlzp/Pinn28509/xDu/+kP+8VFP4iWp+krnoefo2kvexDnzZ/Da6wf59G2reGZwDzte3UcQzJg2qd3lSdK4iFYOR/T396eBgdaNbaeUOPvz9/H6gX/v4/VXLOY///KbWlaDJI1VRKxMKfWXrVfJI/RDIoIbP9p/+GIYNzzwFOsHvTCGpGqqdKADXPrmOfDm2vKty5/nH1Zu4DuramdojIAvvv8XuPqC09tYoSQ1R+UDfajPXfkWfvzMi4dv/92/PccjG3dy9QVtLEqSmmRCBfqli+dw6eI5h28/sHord6/axL+s2364racr+J+/ds7ha5dKUi4mVKAf7ROXnnlEmKeU+M6qFxh49iUDXVJ2Kj3LZTTO/tx9zJ4+mQUzpx3R3tUF/+U9izn/jJltqkzSRNXoLJdKfbGoGX7rooWccfKJdHfFET8/Xv8SP/TKSJI62IQechnOf7vyLcO2n/OF+/mnxzazfvvw0x7n9E7l8+97C11dXmBDUnsY6A1631vnseLZl3h887HnW9/92n4Gd+/lYxct5LRZ04Z5tCSNPwO9QV/+0FtHvO/exzbziVt/yg3fe4qZ0ybXfZ63LpjBVW+b3+zyJMlAb4az5/Yye/oUvrt6a9319u4/QO/USQa6pHFhoDfBmXOmM/D5d5Wu9+X71vDN/7uev/rR+oafu6sr+NWlp9LXO2UsJUqaAAz0FjprznT2H0x86d4njutxr+7dz+9edtY4VSWpKgz0FvrQ+Qu44pxTOHgcU//f/kff4+ENO7i7OP/MaJw+axrnnu78eanqDPQWmzb5+P7k82eewPfXbOP7a0Y/B35KTxdr/sd7iXBKpVRlBnqHu+sTv8S23XtH/fjbBzbwjR+u50drtzOlp3nfI+vuCpYuOInJTXxOSWNjoHe43qmT6J06+qssLT6lF4BlN69oVkmH/fdfWcJvX7yo6c8raXQM9Ip7/1tP5bSZ09h34GBTn3fZzStYu+2VwxcPaYepk7o5ZcbUtm1f6jQGesX1dHfRv3BW05931omT+daK5/nWiueb/tzH486Pv4Pzz2h+/6QcGegalZuWvZ2123a3bfvbdu3lf923hme2v8qSeTPaVsfRpk7q8sNntY2nz1WWtu1+jQu+9GC7yzjGh85bwFc/srTdZahivEi0Km1O71Ru+PWlbN01+hlAzXbHyo08tbV971okA13Z+uC5C9pdwhEef2EX//joC5zzxfvbXcqE9JnLz+ZjF03sWVcGutQk11y8iJOn1z/bpsbHXQ9vYuC5lw30sTw4It4LfB3oBr6ZUvpyU6qSMrT0tJNYetpJ7S5jQlr53Mv8YM02Lv/aD9tdyohuWvZ2Tj95fK+XMOpAj4hu4M+B
y4GNwE8i4p6U0uPNKk6SGnHNxYu4f/WWdpdRVyu+VT2WI/QLgHUppfUAEfH3wFWAgS6ppa5623yvM8DYLhI9H9gw5PbGou0IEXFtRAxExMDg4OAYNidJqmfc3wOklG5MKfWnlPr7+vrGe3OSNGGNJdA3AacNub2gaJMktcFYAv0nwFkRsSgiJgNXA/c0pyxJ0vEa9YeiKaX9EfE7wP3Upi3enFJa3bTKJEnHZUzz0FNK9wL3NqkWSdIYeLkZSaoIA12SKqKlp8+NiEHguVE+fDawvYnl5MA+Twz2ufrG2t8zUkql875bGuhjEREDjZwPuErs88Rgn6uvVf11yEWSKsJAl6SKyCnQb2x3AW1gnycG+1x9LelvNmPokqT6cjpClyTVYaBLUkVkEegR8d6IeDIi1kXEde2u53hFxLMR8VhErIqIgaJtVkQ8EBFri98zi/aIiP9d9PXRiDhvyPMsK9ZfGxHLhrSfXzz/uuKx0YY+3hwR2yLiZ0Paxr2PI22jjX3+YkRsKvb1qoi4csh91xf1PxkR7xnSPuzruzjx3fKi/bbiJHhExJTi9rri/oUt6u9pEfFQRDweEasj4lNFe2X3c50+d+Z+Til19A+1E389DbwRmAw8Aixpd13H2YdngdlHtf0xcF2xfB3wlWL5SuA+IIALgeVF+yxgffF7ZrE8s7hvRbFuFI+9og19vAQ4D/hZK/s40jba2OcvAn84zLpLitfuFGBR8Zrurvf6Bm4Hri6W/xL4eLH8CeAvi+Wrgdta1N95wHnFci/wVNGvyu7nOn3uyP3c0n/0o/yDvgO4f8jt64Hr213XcfbhWY4N9CeBeUNeNE8Wy98AfuPo9YDfAL4xpP0bRds8YM2Q9iPWa3E/F3JkuI17H0faRhv7PNI/9CNet9TOUvqOkV7fRaBtB3qK9sPrHXpssdxTrBdt2N93U7umcOX38zB97sj9nMOQS0OXuutwCfhuRKyMiGuLtrkppc3F8hZgbrE8Un/rtW8cpr0TtKKPI22jnX6nGGK4ecjQwPH2+WRgR0pp/1HtRzxXcf/OYv2WKd7+nwssZ4Ls56P6DB24n3MI9Cq4OKV0HnAF8MmIuGTonan2X3Cl54+2oo8d8nf8C+BNwNuAzcBX21rNOIiI6cCdwKdTSruG3lfV/TxMnztyP+cQ6Nlf6i6ltKn4vQ24C7gA2BoR8wCK39uK1Ufqb732BcO0d4JW9HGkbbRFSmlrSulASukg8FfU9jUcf59fBE6KiJ6j2o94ruL+GcX64y4iJlELtltTSt8umiu9n4frc6fu5xwCPetL3UXEiRHRe2gZeDfwM2p9OPTp/jJqY3MU7R8tZghcCOws3mreD7w7ImYWb+/eTW2sbTOwKyIuLGYEfHTIc7VbK/o40jba4lDoFD5IbV9Drc6ri5kLi4CzqH0AOOzruzgKfQj4cPH4o/9+h/r8YeD7xfrjqvjb3wQ8kVL62pC7KrufR+pzx+7ndnywMIoPIq6k9uny08Dn2l3Pcdb+RmqfaD8CrD5UP7WxsAeBtcD3gFlFewB/XvT1MaB/yHP9NrCu+PnYkPb+4gX1NPBntOcDsm9Re+v5OrVxwGta0ceRttHGPv9t0adHi3+Q84as/7mi/icZMhNppNd38dpZUfwt/gGYUrRPLW6vK+5/Y4v6ezG1oY5HgVXFz5VV3s91+tyR+9mv/ktSReQw5CJJaoCBLkkVYaBLUkUY6JJUEQa6JFWEgS5JFWGgS1JF/H85cMkmMcaqfgAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(sorted(user_click_merge.groupby('user_id')['category_id'].nunique(), reverse=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从上图中可以看出有一小部分用户阅读类型是极其广泛的,大部分人都处在20个新闻类型以下。" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idcategory_id
count250000.000000250000.000000
mean124999.5000004.573188
std72168.9279864.419800
min0.0000001.000000
25%62499.7500002.000000
50%124999.5000003.000000
75%187499.2500006.000000
max249999.00000095.000000
\n", - "
" + "source": [ + "#点击次数在前50的用户\n", + "plt.plot(user_click_item_count[:50])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "点击次数排前50的用户的点击次数都在100次以上。思路:我们可以定义点击次数大于等于100次的用户为活跃用户,这是一种简单的处理思路, 判断用户活跃度,更加全面的是再结合上点击时间,后面我们会基于点击次数和点击时间两个方面来判断用户活跃度。" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXEAAAD4CAYAAAAaT9YAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAARV0lEQVR4nO3dfYxc1X3G8eexd7ExEDAYjEPYrkOQFZekKUxT2lKgJQHHSuWGphJIDaRYWaUBKUitKJQqRWlTNYnaSFWippvaMonASZsUGSVtg4tSXKkYYqd+WQqYlwLxSzAvcYgIBYxP/5i7u6Nl786dmTt7z5n7/UjWzt6Z3fmdnfGjM+ece65DCAIApGlB1QUAALpHiANAwghxAEgYIQ4ACSPEASBhQ/P5ZMuWLQujo6Pz+ZQAkLydO3c+H0I4fbb75jXER0dHtWPHjvl8SgBInu2n8+5jOAUAEkaIA0DCCHEASBghDgAJI8QBIGFtQ9z2RtuHbU/Mct8f2g62l/WnPADAXIr0xDdJWjPzoO2zJV0u6ZmSawIAFNR2nXgIYZvt0Vnu+oKkmyRtKbuome59+Fnt/uGRfj/Nm5yy5Dh99FdHtWCB5/25AaCIrk72sb1O0oEQwm577oCzPSZpTJJGRka6eTrdt+85fW177lr3vpjcZv2SVafrnNNPnNfnBoCiOg5x20sk/YmaQylthRDGJY1LUqPR6OoKFJ9ed54+ve68bn60a9/ec1A33PnfeuMYF80AEK9uVqecI2mlpN22n5L0Nkk/sH1mmYXFggsfAYhZxz3xEMJeSWdMfp8FeSOE8HyJdVXOYhwcQPyKLDHcLOl+Sats77e9vv9lVW9yqD+IrjiAeBVZnXJ1m/tHS6smIpP9cIZTAMSMMzZzTPXECXEAESPE22A4BUDMCPFcTGwCiB8hnoPhFAApIMRz0A8HkAJCvA164gBiRojnaLcnDADEgBDPMbVOnNUpACJGiOdgYhNACgjxHNOn3QNAvAhxAEgYIZ5jchfDwHgKgIgR4nkYTgGQAEI8B7sYAkgBIZ5jep04KQ4gXoQ4ACSMEM/BcAqAFBDiOVgnDiAFhHiO6SWGFRcCAHMgxHOw/xWAFBDibXCyD4CYEeI5WGAIIAWEeB52MQSQAEI8x9TEJn1xABEjxHMwsQkgBYR4O3TEAUSMEM/BxCaAFBDiOSY3wGJiE0DMCPEcjIkDSAEhnoOr3QNIASHeBsMpAGLWNsRtb7R92PZEy7E/t73H9i7b99h+a3/LnH/sYgggBUV64pskrZlx7PMhhHeHEN4j6duSPlVyXRFgUBxA/IbaPSCEsM326IxjL7V8e4IGsMM6tKAZ4us3fV8LIp/lPGHRQm25/iKNnLak6lIAzLO2IZ7H9mckXSPpJ5J+Y47HjUkak6SRkZFun27erX7rW3TTmlX66f8drbqUOR088oq27DqoA0deIcSBGuo6xEMIt0q6
1fYtkm6Q9Gc5jxuXNC5JjUYjmR778MIF+sSl76i6jLa2P/mCtuw6yCoaoKbKWJ1yh6TfKeH3oAtTAz1kOFBLXYW47XNbvl0n6ZFyykG3yHCgntoOp9jeLOlSScts71dz2GSt7VWSjkl6WtLH+1kk8rE9AFBvRVanXD3L4Q19qAVdiHzhDIA+44zNxLE9AFBvhHjizGXkgFojxAcEGQ7UEyGevMmJTWIcqCNCPHFMbAL1RognjsvIAfVGiCfO7JkL1BohPiBYYgjUEyGeuKnhFDIcqCVCPHFMbAL1RognzmLvFKDOCPHEMa8J1BshPiA42QeoJ0J8QBDhQD0R4oljYhOoN0I8cUxsAvVGiCfOXGQTqDVCfEDQEwfqiRBPHEsMgXojxBNnMbMJ1BkhnjguzwbUGyGeOC6UDNQbIT4g6IkD9USIJ46JTaDeCPHkMbEJ1BkhnrjpiU364kAdEeKJox8O1BshPiDoiAP1RIgnbvJq9ywxBOqJEE8cwylAvbUNcdsbbR+2PdFy7PO2H7G9x/Zdtk/pa5XIxRmbQL0V6YlvkrRmxrGtks4LIbxb0j5Jt5RcFwpiP3Gg3obaPSCEsM326Ixj97R8u13Sh0uuCx26b99zOvLK61WX0bMVJy/W2netqLoMIBltQ7yA6yR9I+9O22OSxiRpZGSkhKdDq5OXDOvERUO6e/dB3b37YNXllGLvbZfrpMXDVZcBJKGnELd9q6Sjku7Ie0wIYVzSuCQ1Gg0+9Jfs5OOHteNP36dXjx6rupSe3fnAM/rsvz2io2/wNgGK6jrEbX9U0gclXRY4XbBSi4cXavHwwqrL6Nnxw80pGt5MQHFdhbjtNZJuknRJCOFn5ZaEurJZMAl0qsgSw82S7pe0yvZ+2+slfVHSSZK22t5l+8t9rhM1wD4wQOeKrE65epbDG/pQC2pu+gIXAIrijE1Eh444UBwhjniwDwzQMUIc0WBaE+gcIY5omEFxoGOEOKJDhgPFEeKIBpt5AZ0jxBGNqXXi9MWBwghxRIOJTaBzhDiiwQUugM4R4ogOGQ4UR4gjGtMTm8Q4UBQhjngwnAJ0jBBHNJjYBDpHiANAwghxRGPyohAMpwDFEeKIxvTWKaQ4UBQhjmiwThzoHCEOAAkjxBGN6b1TABRFiCManOwDdI4QRzToiQOdI8QRHTriQHGEOAAkjBBHNMxFNoGOEeKIxlSEk+FAYYQ4osHEJtA5QhzRoScOFEeIIxpmM1qgY4Q4osHV7oHOEeKIBhObQOcIcUSDXQyBzrUNcdsbbR+2PdFy7HdtP2T7mO1Gf0tE3TCcAhRXpCe+SdKaGccmJF0paVvZBaHOmNgEOjXU7gEhhG22R2cce1hqPcMO6N2C7O30e//wgIYWDv5I37lnnKg7P3Zh1WUgcW1DvFe2xySNSdLIyEi/nw4J++WVp+m6X1upV15/o+pS+m7vgSP6rydeqLoMDIC+h3gIYVzSuCQ1Gg0GO5Hr5CXD+tRvra66jHnxha37NHHgparLwAAY/M+sQISmV+LQr0FvCHGgQmQ4elVkieFmSfdLWmV7v+31tj9ke7+kX5H0Hdvf7XehwCCZuhRdxXUgfUVWp1ydc9ddJdcC1AYLu1AWhlOACkxvMUBfHL0hxIEKsHc6ykKIAxWiI45eEeJABSbPdmafGPSKEAeAhBHiQAXYdhdlIcSBCnApOpSFEAcqRE8cvSLEgQpwPVGUhRAHKsBgCspCiAMVYGITZSHEgQqwARbKQogDFWLvFPSKEAcqwN4pKAshDgAJI8SBCkztnUJXHD0ixIEKTC0xJMTRI0IcqBAn+6BXhDhQAdaJoyyEOFABzthEWQhxoALTF4UAekOIAxWYHk4hxtEbQhyoEBGOXhHiQAUmx8TpiKNXhDhQBTO1iXIQ4kAFpnriDKigR4Q4UAFPpzjQE0IcqBAZjl4R4kAFpi4KQYqjR4Q4UAHmNVGWtiFue6Ptw7YnWo6danur7ceyr0v7WyYwWJjY
RFmK9MQ3SVoz49jNku4NIZwr6d7sewAFsQEWyjLU7gEhhG22R2ccXifp0uz27ZL+Q9Ifl1kYUAf/sveQli45ruoyorRoeIHev3q5Fg0trLqUqLUN8RzLQwiHsts/krQ874G2xySNSdLIyEiXTwcMlmUnLpIk/cV3Hq64krj9/Ucu0BU/f2bVZUSt2xCfEkIItnM/FIYQxiWNS1Kj0eDDIyDpsncu1/ZbLtNrR49VXUqUnn7xZX1kw4N6lb9PW92G+LO2V4QQDtleIelwmUUBdXDmyYurLiFar73RDG92eWyv2yWGd0u6Nrt9raQt5ZQDACzB7ESRJYabJd0vaZXt/bbXS/orSe+3/Zik92XfA0ApyPDiiqxOuTrnrstKrgUAJLVc+YjRlLY4YxNAtDgZqj1CHEB0uGhGcYQ4gOgwsVkcIQ4gOuzyWBwhDiA6U3vLVFtGEghxANHiZJ/2CHEA0SLC2yPEAUSHic3iCHEA0TGD4oUR4gCiw5WPiiPEAUSLec32CHEA0WE0pThCHEB0zD6GhRHiAKLDhaSLI8QBRIeJzeIIcQDRoifeHiEOID5MbBZGiAOIDhObxRHiAKJjrgpRGCEOIDrTE5tohxAHEC064u0R4gCiM321e1K8HUIcQHSY1iyOEAcQHfZOKY4QBxAdLpRcHCEOIFpkeHuEOID4TG2ARYy3Q4gDiA7X2CyOEAcQHTK8OEIcQHSm14lXXEgCCHEA0WI/8fZ6CnHbn7Q9Yfsh2zeWVBOAmmP/q+K6DnHb50n6mKT3SvoFSR+0/Y6yCgNQX0xsFjfUw8++U9IDIYSfSZLt+yRdKelzZRQGoL4mT/b5yn8+qW/u3F9xNeX4yyvfpV8aPbX039tLiE9I+ozt0yS9ImmtpB0zH2R7TNKYJI2MjPTwdADqYvHwAn38knP0zIsvV11KaY4fXtiX3+teFtPbXi/pE5JelvSQpFdDCDfmPb7RaIQdO96U8wCAOdjeGUJozHZfTxObIYQNIYQLQggXS/qxpH29/D4AQGd6GU6R7TNCCIdtj6g5Hn5hOWUBAIroKcQlfSsbE39d0vUhhCO9lwQAKKqnEA8h/HpZhQAAOscZmwCQMEIcABJGiANAwghxAEhYTyf7dPxk9nOSnu7yx5dJer7EclJAm+uBNtdDL23+uRDC6bPdMa8h3gvbO/LOWBpUtLkeaHM99KvNDKcAQMIIcQBIWEohPl51ARWgzfVAm+uhL21OZkwcAPBmKfXEAQAzEOIAkLAkQtz2GtuP2n7c9s1V19ML20/Z3mt7l+0d2bFTbW+1/Vj2dWl23Lb/Nmv3Htvnt/yea7PHP2b72qraMxvbG20ftj3Rcqy0Ntq+IPsbPp79bOVXZMxp8222D2Sv9S7ba1vuuyWr/1HbV7Qcn/W9bnul7Qey49+wfdz8tW52ts+2/T3b/5NdLP2T2fGBfa3naHN1r3UIIep/khZKekLS2yUdJ2m3pNVV19VDe56StGzGsc9Jujm7fbOkz2a310r6VzUv/n2hmtc0laRTJT2ZfV2a3V5addta2nOxpPMlTfSjjZIezB7r7Gc/EGmbb5P0R7M8dnX2Pl4kaWX2/l4413td0j9Kuiq7/WVJfxBBm1dIOj+7fZKaF4VZPciv9Rxtruy1TqEn/l5Jj4cQngwhvCbp65LWVVxT2dZJuj27fbuk3245/tXQtF3SKbZXSLpC0tYQwoshhB9L2ippzTzXnCuEsE3SizMOl9LG7L63hBC2h+a7/Kstv6syOW3Os07S10MIr4YQ/lfS42q+z2d9r2e9z9+U9M3s51v/fpUJIRwKIfwgu/1TSQ9LOksD/FrP0eY8fX+tUwjxsyT9sOX7/Zr7jxa7IOke2zvdvIi0JC0PIRzKbv9I0vLsdl7bU/yblNXGs7LbM4/H6oZs6GDj5LCCOm/zaZKOhBCOzjgeDdujkn5R0gOqyWs9o81SRa91CiE+aC4KIZwv6QOSrrd9ceudWY9joNd91qGNmb+T
dI6k90g6JOmvK62mT2yfKOlbkm4MIbzUet+gvtaztLmy1zqFED8g6eyW79+WHUtSCOFA9vWwpLvU/Fj1bPbRUdnXw9nD89qe4t+krDYeyG7PPB6dEMKzIYQ3QgjHJH1Fzdda6rzNL6g59DA043jlbA+rGWZ3hBD+OTs80K/1bG2u8rVOIcS/L+ncbMb2OElXSbq74pq6YvsE2ydN3pZ0uaQJNdszOSN/raQt2e27JV2TzepfKOkn2cfU70q63PbS7GPb5dmxmJXSxuy+l2xfmI0fXtPyu6IyGWSZD6n5WkvNNl9le5HtlZLOVXMCb9b3etab/Z6kD2c/3/r3q0z2998g6eEQwt+03DWwr3Vemyt9rauc6S36T81Z7X1qzubeWnU9PbTj7WrOQu+W9NBkW9QcB7tX0mOS/l3SqdlxS/pS1u69khotv+s6NSdJHpf0+1W3bUY7N6v5kfJ1Ncf01pfZRkmN7D/JE5K+qOzM4wjb/LWsTXuy/8wrWh5/a1b/o2pZcZH3Xs/eOw9mf4t/krQogjZfpOZQyR5Ju7J/awf5tZ6jzZW91px2DwAJS2E4BQCQgxAHgIQR4gCQMEIcABJGiANAwghxAEgYIQ4ACft/AbwTsfQSxAYAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " user_id category_id\n", - "count 250000.000000 250000.000000\n", - "mean 124999.500000 4.573188\n", - "std 72168.927986 4.419800\n", - "min 0.000000 1.000000\n", - "25% 62499.750000 2.000000\n", - "50% 124999.500000 3.000000\n", - "75% 187499.250000 6.000000\n", - "max 249999.000000 95.000000" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "user_click_merge.groupby('user_id')['category_id'].nunique().reset_index().describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户查看文章的长度的分布\n", - "\n", - "通过统计不同用户点击新闻的平均字数,这个可以反映用户是对长文更感兴趣还是对短文更感兴趣。" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从上图中可以发现有一小部分人看的文章平均词数非常高,也有一小部分人看的平均文章次数非常低。\n", - "\n", - "大多数人偏好于阅读字数在200-400字之间的新闻。" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#挑出大多数人的区间仔细看看\n", - "plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True)[1000:45000])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以发现大多数人都是看250字以下的文章" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idwords_count
count250000.000000250000.000000
mean124999.500000205.830189
std72168.92798647.174030
min0.0000008.000000
25%62499.750000187.500000
50%124999.500000202.000000
75%187499.250000217.750000
max249999.0000003434.500000
\n", - "
" + "source": [ + "#点击次数排名在[25000:50000]之间\n", + "plt.plot(user_click_item_count[25000:50000])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以看出点击次数小于等于两次的用户非常的多,这些用户可以认为是非活跃用户" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 新闻点击次数分析" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:42:14.526476Z", + "start_time": "2020-11-13T15:42:14.463642Z" + } + }, + "outputs": [], + "source": [ + "item_click_count = sorted(user_click_merge.groupby('click_article_id')['user_id'].count(), reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:42:16.198000Z", + "start_time": "2020-11-13T15:42:16.044455Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " user_id words_count\n", - "count 250000.000000 250000.000000\n", - "mean 124999.500000 205.830189\n", - "std 72168.927986 47.174030\n", - "min 0.000000 8.000000\n", - "25% 62499.750000 187.500000\n", - "50% 124999.500000 202.000000\n", - "75% 187499.250000 217.750000\n", - "max 249999.000000 3434.500000" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#更加详细的参数\n", - "user_click_merge.groupby('user_id')['words_count'].mean().reset_index().describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 用户点击新闻的时间分析" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "#为了更好的可视化,这里把时间进行归一化操作\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "mm = MinMaxScaler()\n", - "user_click_merge['click_timestamp'] = mm.fit_transform(user_click_merge[['click_timestamp']])\n", - "user_click_merge['created_at_ts'] = mm.fit_transform(user_click_merge[['created_at_ts']])\n", - "\n", - "user_click_merge = user_click_merge.sort_values('click_timestamp')" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
182499901623000.00000043201252552810.989186193
22499981609740.00000241121132552810.989092259
302499851609740.0000034117182882810.989092259
502499791623000.00000441171252222810.989186193
252499881609740.0000044117121217172810.989092259
\n", - "
" + "source": [ + "plt.plot(item_click_count)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " user_id click_article_id click_timestamp click_environment \\\n", - "18 249990 162300 0.000000 4 \n", - "2 249998 160974 0.000002 4 \n", - "30 249985 160974 0.000003 4 \n", - "50 249979 162300 0.000004 4 \n", - "25 249988 160974 0.000004 4 \n", - "\n", - " click_deviceGroup click_os click_country click_region \\\n", - "18 3 20 1 25 \n", - "2 1 12 1 13 \n", - "30 1 17 1 8 \n", - "50 1 17 1 25 \n", - "25 1 17 1 21 \n", - "\n", - " click_referrer_type rank click_cnts category_id created_at_ts \\\n", - "18 2 5 5 281 0.989186 \n", - "2 2 5 5 281 0.989092 \n", - "30 2 8 8 281 0.989092 \n", - "50 2 2 2 281 0.989186 \n", - "25 2 17 17 281 0.989092 \n", - "\n", - " words_count \n", - "18 193 \n", - "2 259 \n", - "30 259 \n", - "50 193 \n", - "25 259 " - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "user_click_merge.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "def mean_diff_time_func(df, col):\n", - " df = pd.DataFrame(df, columns={col})\n", - " df['time_shift1'] = df[col].shift(1).fillna(0)\n", - " df['diff_time'] = abs(df[col] - df['time_shift1'])\n", - " return df['diff_time'].mean()" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ - "# 点击时间差的平均值\n", - "mean_diff_click_time = user_click_merge.groupby('user_id')['click_timestamp', 'created_at_ts'].apply(lambda x: mean_diff_time_func(x, 'click_timestamp'))" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(sorted(mean_diff_click_time.values, reverse=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从上图可以发现不同用户点击文章的时间差是有差异的" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "# 前后点击文章的创建时间差的平均值\n", - "mean_diff_created_time = user_click_merge.groupby('user_id')['click_timestamp', 'created_at_ts'].apply(lambda x: mean_diff_time_func(x, 'created_at_ts'))" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(sorted(mean_diff_created_time.values, reverse=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从图中可以发现用户先后点击文章,文章的创建时间也是有差异的" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Defaulting to user installation because normal site-packages is not writeable\n", - "Looking in indexes: https://mirrors.aliyun.com/pypi/simple\n", - "Collecting gensim\n", - " Downloading https://mirrors.aliyun.com/pypi/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)\n", - "\u001b[K |████████████████████████████████| 24.2 MB 91.0 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: six>=1.5.0 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.15.0)\n", - "Requirement already satisfied: numpy>=1.11.3 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.19.1)\n", - "Requirement already satisfied: scipy>=0.18.1 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.5.4)\n", - "Requirement already satisfied: numpy>=1.11.3 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.19.1)\n", - "Collecting smart-open>=1.8.1\n", - " Downloading https://mirrors.aliyun.com/pypi/packages/e3/cf/6311dfb0aff3e295d63930dea72e3029800242cdfe0790478e33eccee2ab/smart_open-4.0.1.tar.gz (117 kB)\n", - "\u001b[K |████████████████████████████████| 117 kB 96.7 MB/s eta 0:00:01\n", - "\u001b[?25hBuilding wheels for collected packages: smart-open\n", - " Building wheel for smart-open (setup.py) ... 
\u001b[?25ldone\n", - "\u001b[?25h Created wheel for smart-open: filename=smart_open-4.0.1-py3-none-any.whl size=108249 sha256=50eb67320a58790e8b173971aeb6af7b636d48259d7c9de759612e58e334215b\n", - " Stored in directory: /home/admin/.cache/pip/wheels/c3/14/fc/a0e523e5d2f13d083ce0af09d4e2861d8e2ec65fc466fb1dff\n", - "Successfully built smart-open\n", - "Installing collected packages: smart-open, gensim\n", - "Successfully installed gensim-3.8.3 smart-open-4.0.1\n" - ] - } - ], - "source": [ - "# 安装gensim\n", - "!pip install gensim" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "from gensim.models import Word2Vec\n", - "import logging, pickle\n", - "\n", - "# 需要注意这里模型只迭代了一次\n", - "def trian_item_word2vec(click_df, embed_size=16, save_name='item_w2v_emb.pkl', split_char=' '):\n", - " click_df = click_df.sort_values('click_timestamp')\n", - " # 只有转换成字符串才可以进行训练\n", - " click_df['click_article_id'] = click_df['click_article_id'].astype(str)\n", - " # 转换成句子的形式\n", - " docs = click_df.groupby(['user_id'])['click_article_id'].apply(lambda x: list(x)).reset_index()\n", - " docs = docs['click_article_id'].values.tolist()\n", - "\n", - " # 为了方便查看训练的进度,这里设定一个log信息\n", - " logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)\n", - "\n", - " # 这里的参数对训练得到的向量影响也很大,默认负采样为5\n", - " w2v = Word2Vec(docs, size=16, sg=1, window=5, seed=2020, workers=24, min_count=1, iter=10)\n", - " \n", - " # 保存成字典的形式\n", - " item_w2v_emb_dict = {k: w2v[k] for k in click_df['click_article_id']}\n", - " \n", - " return item_w2v_emb_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "item_w2v_emb_dict = trian_item_word2vec(user_click_merge)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_type
25667190841199197150704527612941171202
25668190841285298150704530292041171202
25669190841156624150704663888541171202
25670190841129029150704666888541171202
107739164226214800150713140246441171212
\n", - "
" + "source": [ + "plt.plot(item_click_count[:100])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以看出点击次数最多的前100篇新闻,点击次数大于1000次" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(item_click_count[:20])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "点击次数最多的前20篇新闻,点击次数大于2500。思路:可以定义这些新闻为热门新闻, 这个也是简单的处理方式,后面我们也是根据点击次数和时间进行文章热度的一个划分。" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(item_click_count[3500:])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以发现很多新闻只被点击过一两次。思路:可以定义这些新闻是冷门新闻" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 新闻共现频次:两篇新闻连续出现的次数" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
count433597.000000
mean3.184139
std18.851753
min1.000000
25%1.000000
50%1.000000
75%2.000000
max2202.000000
\n", + "
" + ], + "text/plain": [ + " count\n", + "count 433597.000000\n", + "mean 3.184139\n", + "std 18.851753\n", + "min 1.000000\n", + "25% 1.000000\n", + "50% 1.000000\n", + "75% 2.000000\n", + "max 2202.000000" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = user_click_merge.sort_values('click_timestamp')\n", + "tmp['next_item'] = tmp.groupby(['user_id'])['click_article_id'].transform(lambda x:x.shift(-1))\n", + "union_item = tmp.groupby(['click_article_id','next_item'])['click_timestamp'].agg({'count'}).reset_index().sort_values('count', ascending=False)\n", + "union_item[['count']].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "由统计数据可以看出,平均共现次数3.18,最高为2202。\n", + "\n", + "说明用户看的新闻,相关性是比较强的。" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#画个图直观地看一看\n", + "x = union_item['click_article_id']\n", + "y = union_item['count']\n", + "plt.scatter(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD4CAYAAADvsV2wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAATdElEQVR4nO3df6xkZX3H8fe37Aq2EPmxN7pd9nKhmhgxuOB1hUANISHlV+CPYrqkRUTNNoopVlsrmiCamIhNlSpG3ApF1Cr4syuFWFqwahuW7OKy/BK9KgYQ3AVkkarU1W//mLMwd5hzZ+7MmTt3znm/ksmeOeeZOd89s/dzn32ec85EZiJJqr/fG3cBkqSlYeBLUkMY+JLUEAa+JDWEgS9JDbFiXDtetWpVzszMjGv3kjSRtm3b9mhmTg3y2rEF/szMDFu3bh3X7iVpIkXETwZ9rUM6ktQQBr4kNYSBL0kNYeBLUkMY+JLUEH0HfkTsExHfjYjru2zbNyKujYi5iNgSETOVVilJGtpievgXAveWbHsj8PPMfDHwEeDSYQuTJFWrr/PwI+JQ4HTgA8DbuzQ5C7ikWP4ScHlERI7g3sv3PfIL/m3HT0u3n/CSKdYffnDVu5WkidfvhVeXAe8EDijZvgZ4ACAz90TEbuAQ4NH2RhGxEdgIMD09PUC5MLfzKT52y1zXbZlw648f57q/PG6g95akOusZ+BFxBrAzM7dFxInD7CwzNwGbAGZnZwfq/Z9+1GpOP+r0rtv+/FO38vRvfjd4gZJUY/2M4R8PnBkR9wNfAE6KiM92tHkIWAsQESuAFwCPVVhn3/z+LknqrmfgZ+ZFmXloZs4AG4CbM/MvOpptBs4rls8u2pi9krSMDHzztIh4P7A1MzcDVwKfiYg54HFavxiWXBD4e0aSultU4GfmN4FvFssXt63/NfDaKguTJFWrVlfaRoy7AklavmoV+OCkrSSVqV3gS5K6q13gO2crSd3VLvAlSd3VKvAjwjF8SSpRq8CXJJWrVeB7VqYklatV4APO2kpSiVoFvhdeSVK5WgU+eOGVJJWpXeBLkrqrVeAHDuFLUplaBb4kqVytAj+ctZWkUrUKfIB02laSuqpV4Nu/l6RytQp8cNJWksrULvAlSd3VKvAj7OFLUplaBb4kqVzNAt9pW0kqU7PA9146klSmVoHvdVeSVK5n4EfEfhFxW0TcERF3R8T7urR5fUTsiojtxeNNoym3t3TWVpK6WtFHm6eBkzLzqYhYCXwnIm7MzFs72l2bmW+tvkRJUhV6Bn62usxPFU9XFo9l2Y12REeSyvU1hh8R+0TEdmAncFNmbunS7E8jYkdEfCki1pa8z8aI2BoRW3ft2jV41ZKkResr8DPzt5m5DjgUWB8RL+9o8nVgJjOPAm4CPl3yPpsyczYzZ6empoYouzsnbSWp3KLO0snMJ4BbgFM61j+WmU8XTz8FvLKS6gbgnK0kddfPWTpTEXFgsfx84GTgex1tVrc9PRO4t8Ia+xaO
4ktSqX7O0lkNfDoi9qH1C+K6zLw+It4PbM3MzcBfRcSZwB7gceD1oyq4F++HL0nd9XOWzg7g6C7rL25bvgi4qNrSJElVqt2Vto7hS1J3tQp8SVK5WgW+p2VKUrlaBT4s00uAJWkZqFXge1qmJJWrVeCDd8uUpDK1C3xJUnf1CvxwDF+SytQr8CVJpWoV+E7ZSlK5WgU+4JiOJJWoVeCHV15JUqlaBT7YwZekMrULfElSd7UK/MALrySpTK0CX5JUrlaB75ytJJWrVeCDk7aSVKZWgW8HX5LK1Srwwa84lKQytQt8SVJ3tQr8iCAdxZekrmoV+JKkcrUKfCdtJalcz8CPiP0i4raIuCMi7o6I93Vps29EXBsRcxGxJSJmRlJtH5y0laTu+unhPw2clJmvANYBp0TEsR1t3gj8PDNfDHwEuLTSKvtlF1+SSq3o1SBbN6d5qni6snh09qPPAi4plr8EXB4RkWO4sc3j//t//O0X7xj49RvWr+WVhx1cYUWStDz0DHyAiNgH2Aa8GPh4Zm7paLIGeAAgM/dExG7gEODRjvfZCGwEmJ6eHq7yLtbPHMytP3yM/557tHfjLh558tckGPiSaqmvwM/M3wLrIuJA4KsR8fLMvGuxO8vMTcAmgNnZ2cp7/xvWT7Nh/eC/SI7/4M0VViNJy8uiztLJzCeAW4BTOjY9BKwFiIgVwAuAxyqob8k56Suprvo5S2eq6NkTEc8HTga+19FsM3BesXw2cPM4xu8lSeX6GdJZDXy6GMf/PeC6zLw+It4PbM3MzcCVwGciYg54HNgwsopHzCt1JdVVP2fp7ACO7rL+4rblXwOvrbY0SVKVanWl7bD8AhVJdWbgd3JER1JNGfht7OFLqjMDv4MdfEl1ZeBLUkMY+G2CwMsHJNWVgS9JDWHgt3HSVlKdGfgdHNCRVFcGfhs7+JLqzMDv4JytpLoy8CWpIQz8NhHhGL6k2jLwJakhDPw2TtpKqjMDv4NX2kqqKwO/nV18STVm4Hewfy+prgx8SWoIA79NgF18SbVl4EtSQxj4bcLbZUqqMQO/QzqmI6mmDPw29u8l1VnPwI+ItRFxS0TcExF3R8SFXdqcGBG7I2J78bh4NOWOntddSaqrFX202QO8IzNvj4gDgG0RcVNm3tPR7tuZeUb1JUqSqtCzh5+ZD2fm7cXyL4B7gTWjLmwcIuzhS6qvRY3hR8QMcDSwpcvm4yLijoi4MSKOLHn9xojYGhFbd+3atfhqJUkD6zvwI2J/4MvA2zLzyY7NtwOHZeYrgI8BX+v2Hpm5KTNnM3N2ampqwJJHJ5y2lVRjfQV+RKykFfafy8yvdG7PzCcz86li+QZgZUSsqrTSJeJpmZLqqp+zdAK4Erg3Mz9c0uZFRTsiYn3xvo9VWehS8LorSXXWz1k6xwPnAndGxPZi3buBaYDMvAI4G3hzROwBfgVsyAm9sfxkVi1JvfUM/Mz8Dj2uScrMy4HLqypKklQ9r7TtYAdfUl0Z+JLUEAZ+G++WKanODPwOTtpKqisDv439e0l1ZuA/h118SfVk4EtSQxj4bbxbpqQ6M/AlqSEM/DYRjuBLqi8DX5IawsBv4/3wJdWZgd9hQm/yKUk9GfiS1BAGfhsnbSXVmYEvSQ1h4LcJvPBKUn0Z+JLUEAZ+O++HL6nGDPwOjuhIqisDX5IawsBv05q0tY8vqZ4MfElqCAO/jXO2kuqsZ+BHxNqIuCUi7omIuyPiwi5tIiI+GhFzEbEjIo4ZTbmSpEGt6KPNHuAdmXl7RBwAbIuImzLznrY2pwIvKR6vBj5R/DlR7OBLqrOegZ+ZDwMPF8u/iIh7gTVAe+CfBVyTrRnPWyPiwIhYXbx2otz50G7OvXLLuMt4jvOPn+Gkl75w3GVImmD99PCfEREzwNFAZyKuAR5oe/5gsW5e4EfERmAjwPT09CJLHb0zjvpDvr7jpzz19J5xlzLP3Q89ydQB+xr4kobSd+BHxP7Al4G3ZeaTg+ws
MzcBmwBmZ2eX3fmPbzjhcN5wwuHjLuM5Trj05nGXIKkG+jpLJyJW0gr7z2XmV7o0eQhY2/b80GKdqrLsfj1KmjT9nKUTwJXAvZn54ZJmm4HXFWfrHAvsnsTxe0mqs36GdI4HzgXujIjtxbp3A9MAmXkFcANwGjAH/BI4v/JKG8wvZpFUhX7O0vkOPc5YLM7OuaCqoiRJ1fNK2wkQXiEgqQIG/oTwpm6ShmXgTwDv8SOpCgb+hLB/L2lYBr4kNYSBPwFaX8wy7iokTToDX5IawsCfABHhGL6koRn4ktQQBv4E8KxMSVUw8CeEF15JGpaBL0kNYeBPAu+WKakCBr4kNYSBPwEC7OJLGpqBL0kNYeBPgPB2mZIqYOBPiHRMR9KQDHxJaggDfwJ4t0xJVTDwJakhDPwJEGEPX9LwDHxJaggDfwKE98uUVIGegR8RV0XEzoi4q2T7iRGxOyK2F4+Lqy9TnpYpaVgr+mhzNXA5cM0Cbb6dmWdUUpEkaSR69vAz81vA40tQi0o4aSupClWN4R8XEXdExI0RcWRZo4jYGBFbI2Lrrl27Ktq1JKkfVQT+7cBhmfkK4GPA18oaZuamzJzNzNmpqakKdt0cdvAlDWvowM/MJzPzqWL5BmBlRKwaujJJUqWGDvyIeFEUt3OMiPXFez427PvqWd4tU1IVep6lExGfB04EVkXEg8B7gZUAmXkFcDbw5ojYA/wK2JB+43blPKKShtUz8DPznB7bL6d12qYkaRnzStsJ0BrQsYsvaTgGviQ1hIE/AbzwSlIVDHxJaggDfwJ4VqakKhj4E8IRHUnDMvAngPfDl1QFA39CeC2bpGEZ+JLUEAb+BIhwDF/S8Ax8SWoIA38COGUrqQoG/oRwzlbSsAz8SeCVV5IqYOBPCDv4koZl4EtSQxj4EyDwwitJwzPwJakhDPwJ4JytpCoY+JLUEAb+BLCDL6kKBv6EcM5W0rAMfElqCAN/AkQE6aVXkobUM/Aj4qqI2BkRd5Vsj4j4aETMRcSOiDim+jIlScPqp4d/NXDKAttPBV5SPDYCnxi+LLVz0lZSFVb0apCZ34qImQWanAVck61LQW+NiAMjYnVmPlxVkYLbf/IEJ3/4v8ZdhqQK/Nmr1vKmPz5iyffbM/D7sAZ4oO35g8W65wR+RGyk9b8ApqenK9h1M5x73GF84+5Hxl2GpIqs2n/fsey3isDvW2ZuAjYBzM7OOgvZp7PWreGsdWvGXYakCVfFWToPAWvbnh9arJMkLSNVBP5m4HXF2TrHArsdv5ek5afnkE5EfB44EVgVEQ8C7wVWAmTmFcANwGnAHPBL4PxRFStJGlw/Z+mc02N7AhdUVpEkaSS80laSGsLAl6SGMPAlqSEMfElqiBjXl2NHxC7gJwO+fBXwaIXlVMnaFm+51gXWNojlWhfUo7bDMnNqkB2MLfCHERFbM3N23HV0Y22Lt1zrAmsbxHKtC6zNIR1JaggDX5IaYlIDf9O4C1iAtS3ecq0LrG0Qy7UuaHhtEzmGL0lavEnt4UuSFsnAl6SmyMyJetD6ft37aN2d810j3M/9wJ3AdmBrse5g4CbgB8WfBxXrA/hoUdMO4Ji29zmvaP8D4Ly29a8s3n+ueG0sUMtVwE7grrZ1I6+lbB991HYJre9E2F48TmvbdlGxn/uAP+n1uQKHA1uK9dcCzyvW71s8nyu2z3TUtRa4BbgHuBu4cLkctwVqG+txA/YDbgPuKOp63xDvVUm9fdR2NfDjtmO2bkw/B/sA3wWuXy7HrGuWjCowR/EoDuoPgSOA5xUf/stGtK/7gVUd6z6094AD7wIuLZZPA24s/pEdC2xp+4fyo+LPg4rlvQFzW9E2iteeukAtrwGOYX6ojryWsn30UdslwN90afuy4jPbt/jH+sPiMy39XIHrgA3F8hXAm4vltwBXFMsbgGs79rWa4occOAD4frH/sR+3BWob63Er/h77F8sraYXJsYt9ryrr7aO2q4Gzuxyzpf45
eDvwLzwb+GM/Zl2zZBRhOaoHcBzwjbbnFwEXjWhf9/PcwL8PWN32Q3tfsfxJ4JzOdsA5wCfb1n+yWLca+F7b+nntSuqZYX6ojryWsn30UdsldA+ueZ8X8I3iM+36uRY/eI8CKzo//72vLZZXFO0W+l/SvwInL6fj1qW2ZXPcgN8Hbgdevdj3qrLekuPVXtvVdA/8Jfs8aX3L338CJwHXD3L8R33M9j4mbQy/7AvTRyGBf4+IbcWXrwO8MJ/9Nq9HgBf2qGuh9Q92Wb8YS1FL2T768daI2BERV0XEQQPWdgjwRGbu6VLbM68ptu8u2j9HRMwAR9PqFS6r49ZRG4z5uEXEPhGxndYw3U20epeLfa8q620/VvNqy8y9x+wDxTH7SETs/Xbwpfw8LwPeCfyueD7I8R/JMes0aYG/lE7IzGOAU4ELIuI17Ruz9Ws1x1JZh6WoZZH7+ATwR8A64GHgH0ZUVk8RsT/wZeBtmflk+7ZxH7cutY39uGXmbzNzHa1e63rgpUtdQ5nO2iLi5bR6uy8FXkVrmObvRlzDvM8zIs4AdmbmtlHutyqTFvhL9oXpmflQ8edO4Ku0/vH/LCJWAxR/7uxR10LrD+2yfjGWopayfSwoM39W/HD+DvgnWsdukNoeAw6MiBUd6+e9V7H9BUX7Z0TESlqB+rnM/EqPv9OSHrdutS2X41bU8gStieXjBnivKut9jrbaTsnMh7PlaeCfGfyYDfp5Hg+cGRH3A1+gNazzjwv8fcZyzJ7Ra8xnOT1ojXn9iNakxt4JjCNHsJ8/AA5oW/4fWjPlf8/8yZsPFcunM3+C6LZi/cG0ziA4qHj8GDi42NY5QXRaj5pmmD9OPvJayvbRR22r25b/GvhCsXwk8yemfkRrUqr0cwW+yPyJqbcUyxcwf/Lruo6aArgGuKxj/diP2wK1jfW4AVPAgcXy84FvA2cs9r2qrLeP2la3HdPLgA+O8efgRJ6dtB37MetaY9VhOeoHrdn379MaW3zPiPZxRHFg954C9p5i/SG0Jmd+APxH2z+UAD5e1HQnMNv2Xm+gddrUHHB+2/pZ4K7iNZez8ITj52n9F/83tMbq3rgUtZTto4/aPlPsewewmflB9p5iP/fRdmZS2edafBa3FTV/Edi3WL9f8Xyu2H5ER10n0Pqv9w7aTnNcDsdtgdrGetyAo2idWrij+HtdPMR7VVJvH7XdXByzu4DP8uyZPEv6c1C0O5FnA3/sx6zbw1srSFJDTNoYviRpQAa+JDWEgS9JDWHgS1JDGPiS1BAGviQ1hIEvSQ3x/4tppPoWqYdUAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(union_item['count'].values[40000:])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "大概有75000个pair至少共现一次" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 新闻文章信息" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#不同类型的新闻出现的次数\n", + "plt.plot(user_click_merge['category_id'].value_counts().values)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#出现次数比较少的新闻类型, 有些新闻类型,基本上就出现过几次\n", + "plt.plot(user_click_merge['category_id'].value_counts().values[150:])" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 1.630633e+06\n", + "mean 2.043012e+02\n", + "std 6.382198e+01\n", + "min 0.000000e+00\n", + "25% 1.720000e+02\n", + "50% 1.970000e+02\n", + "75% 2.290000e+02\n", + "max 6.690000e+03\n", + "Name: words_count, dtype: float64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#新闻字数的描述性统计\n", + "user_click_merge['words_count'].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(user_click_merge['words_count'].values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户点击的新闻类型的偏好\n", + "\n", + "此特征可以用于度量用户的兴趣是否广泛。" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUlUlEQVR4nO3dfZBc1Xnn8e8zM3pBaCwkNBJCAiQbsKwEy8CYwoEihTG2wXGwY5dDditWHGrZsp3EjpNdw9q1dtXGu3YqNvFWsomJIaESyoGAMSQFwRhjezeJJY+MAAsEEuJFEnoZAXpBGAlJZ//oK2UkzfRtzfR097nz/VRNze3Tt/s+Z27rp9unT98bKSUkSfnrancBkqTmMNAlqSIMdEmqCANdkirCQJekiuhp5cZmz56dFi5c2MpNSlL2Vq5cuT2l1Fe2XksDfeHChQwMDLRyk5KUvYh4rpH1HHKRpIow0CWpIgx0SaoIA12SKsJAl6SKMNAlqSIMdEmqiCwC/a6HN/J3P25oGqYkTVhZBPo9q17g9oEN7S5DkjpaFoEuSSpnoEtSRWQT6F4pT5LqyyLQI6LdJUhSx8si0CVJ5Qx0SaqIbAI94SC6JNWTRaA7gi5J5bIIdElSOQNdkioim0B3Hrok1ZdFoDsNXZLKZRHokqRy2QS6Qy6SVF8mge6YiySVySTQJUllDHRJqohsAt0hdEmqL4tAd9qiJJXLItAlSeUMdEmqiGwCPTkRXZLqyiLQHUKXpHJZBLokqZyBLkkVYaBLUkVkEejOQ5ekclkEuiSpXEOBHhG/HxGrI+JnEfGtiJgaEYsiYnlErIuI2yJi8ngXK0kaWWmgR8R84PeA/pTSLwLdwNXAV4AbUkpnAi8D14xnoU5Dl6T6Gh1y6QFOiIgeYBqwGXgncEdx/y3AB5peXSGciS5JpUoDPaW0CfgT4HlqQb4TWAnsSCntL1bbCMwf7vERcW1EDETEwODgYHOqliQdo5Ehl5nAVcAi4FTgROC9jW4gpXRjSqk/pdTf19c36kIlSfU1MuTyLuCZlNJgSul14NvARcBJxRAMwAJg0zjVCEDyjOiSVFcjgf48cGFETIuIAC4DHgceAj5crLMMuHt8SnQeuiQ1opEx9OXUPvz8KfBY8Zgbgc8Cn4mIdcDJwE3jWKckqURP+SqQUvoC8IWjmtcDFzS9IknSqGTzTVHnoUtSfVkEumPoklQui0CXJJUz0CWpIrIJdIfQJam+LALdc7lIUrksAl2SVC6bQE/OW5SkuvIIdEdcJKlUHoEuSSploEtSRWQT6I6gS1J9WQS6Q+iSVC6LQJcklTPQJaki8gl0B9Elqa4sAj08f64klcoi0CVJ5Qx0SaqIbALdIXRJqi+LQHcEXZLKZRHokqRyBrokVUQ2ge750CWpviwC3WnoklQui0CXJJUz0CWpIrIJdEfQJam+
LALdIXRJKpdFoEuSyhnoklQR2QS609Alqb4sAt3zoUtSuYYCPSJOiog7ImJNRDwREe+IiFkR8UBErC1+zxzvYiVJI2v0CP3rwD+nlBYDS4EngOuAB1NKZwEPFrclSW1SGugRMQO4BLgJIKW0L6W0A7gKuKVY7RbgA+NTYk1yJrok1dXIEfoiYBD464h4OCK+GREnAnNTSpuLdbYAc4d7cERcGxEDETEwODg4qiIdQZekco0Eeg9wHvAXKaVzgT0cNbySaqdCHPYQOqV0Y0qpP6XU39fXN9Z6JUkjaCTQNwIbU0rLi9t3UAv4rRExD6D4vW18Sqxx2qIk1Vca6CmlLcCGiHhz0XQZ8DhwD7CsaFsG3D0uFYJjLpLUgJ4G1/td4NaImAysBz5G7T+D2yPiGuA54CPjU6IkqRENBXpKaRXQP8xdlzW1GknSqGXxTVFwDF2SymQR6OEguiSVyiLQJUnlDHRJqggDXZIqIotA9+y5klQui0CXJJUz0CWpIrIJ9OREdEmqK4tAdwhdksplEeiSpHIGuiRVRDaB7gi6JNWXRaA7D12SymUR6JKkcga6JFVENoHuNHRJqi+LQPd86JJULotAlySVM9AlqSKyCfTkTHRJqiuLQHceuiSVyyLQJUnlsgl0py1KUn1ZBLpDLpJULotAlySVM9AlqSKyCXSH0CWpvkwC3UF0SSqTSaBLkspkE+hOW5Sk+rII9NcPHGT7K3vbXYYkdbQsAv3nrx+gd2pPu8uQpI7WcKBHRHdEPBwR/1TcXhQRyyNiXUTcFhGTx6vIOb1TnOYiSSWO5wj9U8ATQ25/BbghpXQm8DJwTTMLG6o7ggMOoktSXQ0FekQsAN4HfLO4HcA7gTuKVW4BPjAO9QHQ3RUcOGigS1I9jR6h/ynwX4GDxe2TgR0ppf3F7Y3A/OEeGBHXRsRARAwMDg6Orsiu4KBH6JJUV2mgR8SvANtSSitHs4GU0o0ppf6UUn9fX99onqI25OIRuiTV1cjUkYuAX42IK4GpwBuArwMnRURPcZS+ANg0XkV2BZjnklRf6RF6Sun6lNKClNJC4Grg+yml/wg8BHy4WG0ZcPe4FdlV++r/QVNdkkY0lnnonwU+ExHrqI2p39Scko7VXZwQ3ZkukjSy4/q2TkrpB8APiuX1wAXNL+lYh47QDxxMTOpuxRYlKT9ZfFN0589fB2Dv/oMla0rSxJVFoJ86YyqAM10kqY4sAr2nu1bm/oMeoUvSSPII9GIMff8Bj9AlaSR5BPqhI3QDXZJGlEegHzpCd8hFkkaURaDvO1AL8hf37GtzJZLUubII9FNnnAD4TVFJqieLQJ86qVbmoSN1SdKxsgj0ScWHovv8YpEkjSiLQO/prn0o+tyLr7a5EknqXFkE+uzpUwCYMimLciWpLbJIyCk9DrlIUpksAn2ygS5JpfII9OJD0Uc27mhvIZLUwbII9ENf/T8020WSdKxsEnLJvDfw1NZX2l2GJHWsbAJ9z779nDjZyxVJ0kiyCfSz5/by6Kad7S5DkjpWNoH+2usHiHYXIUkdLJtAP/f0mezdf9ATdEnSCLIJ9JRqQf7Czp+3uRJJ6kzZBPpb5r0BgMHde9tciSR1pmwCfcYJkwBYs2V3myuRpM6UTaAvPqUXgE0vO+QiScPJJtB7p9aO0Fc8+1KbK5GkzpRNoE/u6WLxKb28+Ipj6JI0nGwCHeCkaZN4enAPe/bub3cpktRxsgr0S87uA+ClPfvaXIkkdZ6sAv3MvukA3PaTDW2uRJI6T1aB/stvrh2hv+KQiyQdI6tAn9LTTV/vFP7mX5/l1X2GuiQNlVWgA1z0ppMBvzEqSUcrDfSIOC0iHoqIxyNidUR8qmifFREPRMTa4vfM8S8XrjhnHgB/8t2nWrE5ScpGI0fo+4E/SCktAS4EPhkRS4DrgAdTSmcBDxa3x92Fb6wdoT+7fU8rNidJ2SgN9JTS5pTST4vl3cATwHzgKuCWYrVbgA+MU41HmHHC
JN6/9FQe27STex/b3IpNSlIWjmsMPSIWAucCy4G5KaVDiboFmDvCY66NiIGIGBgcHBxLrYf92rnzAfju6i1NeT5JqoKGAz0ipgN3Ap9OKe0ael+qnax82CtPpJRuTCn1p5T6+/r6xlTsIZcunsPiU3r5zqoXWPGM53aRJGgw0CNiErUwvzWl9O2ieWtEzCvunwdsG58Sh/f+pacCcNfDm1q5WUnqWI3McgngJuCJlNLXhtx1D7CsWF4G3N388kb2yUvPZOHJ01i+/kV+8GRL/y+RpI7UyBH6RcBvAu+MiFXFz5XAl4HLI2It8K7idktddOZsNrz8Kv/noadbvWlJ6jg9ZSuklP4fECPcfVlzyzk+X/rgOWzbvZdHNuzgtp88z0f6T6P2hkKSJp7svil6tCXz3sC23Xv57J2P8cLO19pdjiS1TfaB/vuXn82f/YdzAbhz5UbWbNlV8ghJqqbsAx3gjFknAvC1B57iD25/pM3VSFJ7VCLQz1kwg4HPv4srfvEUtu56jR89NcimHV5MWtLEUolAB5g9fQoLZ5/I9lf28dGbV/Cfbhlod0mS1FKVCXSAT112Fnd+/Je4fMlctu56jSe37Gb94CvUvsgqSdVWqUCfOqmb88+Yydlzp/Pinn28509/xDu/+kP+8VFP4iWp+krnoefo2kvexDnzZ/Da6wf59G2reGZwDzte3UcQzJg2qd3lSdK4iFYOR/T396eBgdaNbaeUOPvz9/H6gX/v4/VXLOY///KbWlaDJI1VRKxMKfWXrVfJI/RDIoIbP9p/+GIYNzzwFOsHvTCGpGqqdKADXPrmOfDm2vKty5/nH1Zu4DuramdojIAvvv8XuPqC09tYoSQ1R+UDfajPXfkWfvzMi4dv/92/PccjG3dy9QVtLEqSmmRCBfqli+dw6eI5h28/sHord6/axL+s2364racr+J+/ds7ha5dKUi4mVKAf7ROXnnlEmKeU+M6qFxh49iUDXVJ2Kj3LZTTO/tx9zJ4+mQUzpx3R3tUF/+U9izn/jJltqkzSRNXoLJdKfbGoGX7rooWccfKJdHfFET8/Xv8SP/TKSJI62IQechnOf7vyLcO2n/OF+/mnxzazfvvw0x7n9E7l8+97C11dXmBDUnsY6A1631vnseLZl3h887HnW9/92n4Gd+/lYxct5LRZ04Z5tCSNPwO9QV/+0FtHvO/exzbziVt/yg3fe4qZ0ybXfZ63LpjBVW+b3+zyJMlAb4az5/Yye/oUvrt6a9319u4/QO/USQa6pHFhoDfBmXOmM/D5d5Wu9+X71vDN/7uev/rR+oafu6sr+NWlp9LXO2UsJUqaAAz0FjprznT2H0x86d4njutxr+7dz+9edtY4VSWpKgz0FvrQ+Qu44pxTOHgcU//f/kff4+ENO7i7OP/MaJw+axrnnu78eanqDPQWmzb5+P7k82eewPfXbOP7a0Y/B35KTxdr/sd7iXBKpVRlBnqHu+sTv8S23XtH/fjbBzbwjR+u50drtzOlp3nfI+vuCpYuOInJTXxOSWNjoHe43qmT6J06+qssLT6lF4BlN69oVkmH/fdfWcJvX7yo6c8raXQM9Ip7/1tP5bSZ09h34GBTn3fZzStYu+2VwxcPaYepk7o5ZcbUtm1f6jQGesX1dHfRv3BW05931omT+daK5/nWiueb/tzH486Pv4Pzz2h+/6QcGegalZuWvZ2123a3bfvbdu3lf923hme2v8qSeTPaVsfRpk7q8sNntY2nz1WWtu1+jQu+9GC7yzjGh85bwFc/srTdZahivEi0Km1O71Ru+PWlbN01+hlAzXbHyo08tbV971okA13Z+uC5C9pdwhEef2EX//joC5zzxfvbXcqE9JnLz+ZjF03sWVcGutQk11y8iJOn1z/bpsbHXQ9vYuC5lw30sTw4It4LfB3oBr6ZUvpyU6qSMrT0tJNYetpJ7S5jQlr53Mv8YM02Lv/aD9tdyohuWvZ2Tj95fK+XMOpAj4hu4M+B
y4GNwE8i4p6U0uPNKk6SGnHNxYu4f/WWdpdRVyu+VT2WI/QLgHUppfUAEfH3wFWAgS6ppa5623yvM8DYLhI9H9gw5PbGou0IEXFtRAxExMDg4OAYNidJqmfc3wOklG5MKfWnlPr7+vrGe3OSNGGNJdA3AacNub2gaJMktcFYAv0nwFkRsSgiJgNXA/c0pyxJ0vEa9YeiKaX9EfE7wP3Upi3enFJa3bTKJEnHZUzz0FNK9wL3NqkWSdIYeLkZSaoIA12SKqKlp8+NiEHguVE+fDawvYnl5MA+Twz2ufrG2t8zUkql875bGuhjEREDjZwPuErs88Rgn6uvVf11yEWSKsJAl6SKyCnQb2x3AW1gnycG+1x9LelvNmPokqT6cjpClyTVYaBLUkVkEegR8d6IeDIi1kXEde2u53hFxLMR8VhErIqIgaJtVkQ8EBFri98zi/aIiP9d9PXRiDhvyPMsK9ZfGxHLhrSfXzz/uuKx0YY+3hwR2yLiZ0Paxr2PI22jjX3+YkRsKvb1qoi4csh91xf1PxkR7xnSPuzruzjx3fKi/bbiJHhExJTi9rri/oUt6u9pEfFQRDweEasj4lNFe2X3c50+d+Z+Til19A+1E389DbwRmAw8Aixpd13H2YdngdlHtf0xcF2xfB3wlWL5SuA+IIALgeVF+yxgffF7ZrE8s7hvRbFuFI+9og19vAQ4D/hZK/s40jba2OcvAn84zLpLitfuFGBR8Zrurvf6Bm4Hri6W/xL4eLH8CeAvi+Wrgdta1N95wHnFci/wVNGvyu7nOn3uyP3c0n/0o/yDvgO4f8jt64Hr213XcfbhWY4N9CeBeUNeNE8Wy98AfuPo9YDfAL4xpP0bRds8YM2Q9iPWa3E/F3JkuI17H0faRhv7PNI/9CNet9TOUvqOkV7fRaBtB3qK9sPrHXpssdxTrBdt2N93U7umcOX38zB97sj9nMOQS0OXuutwCfhuRKyMiGuLtrkppc3F8hZgbrE8Un/rtW8cpr0TtKKPI22jnX6nGGK4ecjQwPH2+WRgR0pp/1HtRzxXcf/OYv2WKd7+nwssZ4Ls56P6DB24n3MI9Cq4OKV0HnAF8MmIuGTonan2X3Cl54+2oo8d8nf8C+BNwNuAzcBX21rNOIiI6cCdwKdTSruG3lfV/TxMnztyP+cQ6Nlf6i6ltKn4vQ24C7gA2BoR8wCK39uK1Ufqb732BcO0d4JW9HGkbbRFSmlrSulASukg8FfU9jUcf59fBE6KiJ6j2o94ruL+GcX64y4iJlELtltTSt8umiu9n4frc6fu5xwCPetL3UXEiRHRe2gZeDfwM2p9OPTp/jJqY3MU7R8tZghcCOws3mreD7w7ImYWb+/eTW2sbTOwKyIuLGYEfHTIc7VbK/o40jba4lDoFD5IbV9Drc6ri5kLi4CzqH0AOOzruzgKfQj4cPH4o/9+h/r8YeD7xfrjqvjb3wQ8kVL62pC7KrufR+pzx+7ndnywMIoPIq6k9uny08Dn2l3Pcdb+RmqfaD8CrD5UP7WxsAeBtcD3gFlFewB/XvT1MaB/yHP9NrCu+PnYkPb+4gX1NPBntOcDsm9Re+v5OrVxwGta0ceRttHGPv9t0adHi3+Q84as/7mi/icZMhNppNd38dpZUfwt/gGYUrRPLW6vK+5/Y4v6ezG1oY5HgVXFz5VV3s91+tyR+9mv/ktSReQw5CJJaoCBLkkVYaBLUkUY6JJUEQa6JFWEgS5JFWGgS1JF/H85cMkmMcaqfgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(sorted(user_click_merge.groupby('user_id')['category_id'].nunique(), reverse=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从上图中可以看出有一小部分用户阅读类型是极其广泛的,大部分人都处在20个新闻类型以下。" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcategory_id
count250000.000000250000.000000
mean124999.5000004.573188
std72168.9279864.419800
min0.0000001.000000
25%62499.7500002.000000
50%124999.5000003.000000
75%187499.2500006.000000
max249999.00000095.000000
\n", + "
" + ], + "text/plain": [ + " user_id category_id\n", + "count 250000.000000 250000.000000\n", + "mean 124999.500000 4.573188\n", + "std 72168.927986 4.419800\n", + "min 0.000000 1.000000\n", + "25% 62499.750000 2.000000\n", + "50% 124999.500000 3.000000\n", + "75% 187499.250000 6.000000\n", + "max 249999.000000 95.000000" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_click_merge.groupby('user_id')['category_id'].nunique().reset_index().describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户查看文章的长度的分布\n", + "\n", + "通过统计不同用户点击新闻的平均字数,这个可以反映用户是对长文更感兴趣还是对短文更感兴趣。" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从上图中可以发现有一小部分人看的文章平均词数非常高,也有一小部分人看的平均文章次数非常低。\n", + "\n", + "大多数人偏好于阅读字数在200-400字之间的新闻。" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD4CAYAAAAXUaZHAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAgU0lEQVR4nO3deZgddb3n8ff37Kf79N6dTqc7OwkkBAVskEW9GkTZRvAZveLMKKOMzFXnjj7eRwWduc/1GX1mXHG7i4zgBfVeYdBR5MooXhYFgRgkgZAQ6OwdOul9306f85s/TiV09k56qT5Vn9fznCdVv6o6/T0F/TnVv/pVlTnnEBGRYIn4XYCIiMw8hbuISAAp3EVEAkjhLiISQAp3EZEAivldAEBtba1btmyZ32WIiBSVZ599ttM5V3e8ZfMi3JctW8bGjRv9LkNEpKiY2Z4TLVO3jIhIACncRUQCSOEuIhJACncRkQBSuIuIBJDCXUQkgBTuIiIBVNThvv3AAF//zXa6h8b9LkVEZF4p6nDf2THIdx5poX1g1O9SRETmlaIO91QiCsDIeM7nSkRE5peiDvd03Av3rMJdRGSyQIT7qMJdROQIxR3uh7tl8j5XIiIyvxR3uKtbRkTkuIo63FMKdxGR45pyuJtZ1MyeM7MHvfl/NLNdZrbJe53vtZuZfdvMWszseTO7cJZqP9wts6tjaLZ+hIhIUTqdh3V8AtgGlE9q+7Rz7v6j1rsaWOW93gj8vffvjCv1wj0VL+o/QEREZtyUUtHMmoBrge9PYfXrgXtcwdNApZk1TKPGk9VFeSrGsMa5i4gcYaqHvN8EPgMcPSzlS17Xy+1mlvTaGoF9k9Zp9dqOYGa3mNlGM9vY0dFxmmW/pjQZY2hs4oy3FxEJolOGu5ldB7Q75549atFtwDnARUA18NnT+cHOuTucc83Ouea6uuM+33VKShJRHbmLiBxlKkfulwPvMrPdwE+A9Wb2I+dcm9f1Mgb8ALjYW38/sHjS9k1e26woScR4pX1gtt5eRKQonTLcnXO3OeeanHPLgBuBR5xz/+FQP7qZGXADsMXb5AHgg96omUuAPudc26xUDwyPT1CaPJ3zwiIiwTedVPyxmdUBBmwC/sJr/xVwDdACDAMfmk6Bp3L2wjK2H9CRu4jIZKcV7s65x4DHvOn1J1jHAR+fbmFTlUnG2KFx7iIiRwjMAPHCd4qIiEAAwn1lXQaAIY2YERE5rOjDvSwVB6B3WI/aExE5pOjD/dCtB/Z0DftciYjI/FH04X72wjIA2vr0HFURkUOKPtyX15YC8FJbv8+ViIjMH0Uf7iWJwmjOsQk9jUlE5JCiD3eA2kyClvZBv8sQEZk3AhHuFek
4w3oak4jIYYEI9/ryFG29I36XISIybwQi3BeUJenROHcRkcMCEe61mSTZnGNUXTMiIkBAwn1hRQqAvd26kElEBAIS7msaCs/sbu1RuIuIQEDCva6s8PjWVw5qOKSICAQk3Fd4V6nu14gZEREgIOEei0YoTUR18zAREU8gwh1gVX2ZrlIVEfEEJtzLUjHa+tQtIyICAQr3pqo0eQcTOd1ATEQkQOFeAsDBgTGfKxER8V9gwn3VgsKzVF85OOBzJSIi/gtMuB+6SnVoTLcgEBEJTLgfupCpY0CP2xMRCUy4V5UkAHh+f5/PlYiI+C8w4Z6KR8kkY+zuHPK7FBER3wUm3AGW1pRwsF+jZUREAhXuC8tTDI5N+F2GiIjvAhXul66soW8kS3u/TqqKSLgFKtxrM4URM/t0X3cRCblAhfvSmsJVqn0jWZ8rERHxV6DC/dCRux7aISJhF6hwb6pKA2jEjIiE3pTD3cyiZvacmT3ozS83s2fMrMXM7jWzhNee9OZbvOXLZqn249XIkuoStryqC5lEJNxO58j9E8C2SfNfBm53zp0F9AA3e+03Az1e++3eenPGDCI2lz9RRGT+mVK4m1kTcC3wfW/egPXA/d4qdwM3eNPXe/N4y6/w1p8T6xZV8OL+/rn6cSIi89JUj9y/CXwGOPQkjBqg1zl36IqhVqDRm24E9gF4y/u89Y9gZreY2UYz29jR0XFm1R9HLu9wM/ZuIiLF6ZThbmbXAe3OuWdn8gc75+5wzjU755rr6upm7H1X12cYGp8gn1fEi0h4xaawzuXAu8zsGiAFlAPfAirNLOYdnTcB+7319wOLgVYziwEVQNeMV34C5ek4zsHA6AQVJfG5+rEiIvPKKY/cnXO3OeeanHPLgBuBR5xz/x54FHiPt9pNwC+86Qe8ebzljzjn5uwwuiJdCHRdyCQiYTadce6fBT5lZi0U+tTv9NrvBGq89k8Bt06vxNOjcBcRmVq3zGHOuceAx7zpncDFx1lnFHjvDNR2RhTuIiIBu0IVCn3uALu79NAOEQmvwIV7TabwuL383HXzi4jMO4EL90PPUn21V/d0F5HwCly4x6MRzOBA34jfpYiI+CZw4Q7QWJlmT7ce2CEi4RXIcE/GIqjLXUTCLJDhvqahnH4NhRSREAtkuFeWxNnZqaGQIhJegQz3Q/cMy+nmYSISUoEM9zULywDoGNDj9kQknAIZ7pXeWPc9ukpVREIqkOH+uqYKAPZqOKSIhFQgw70mkwSge2jc50pERPwRyHAvTURJRCMKdxEJrUCGu5lRXZpga5selC0i4RTIcAeIGPQM68hdRMIpsOG+tKaUg/0aCiki4RTYcD97YRkdA2O6kElEQimw4d5YmQbg1V7d+ldEwiew4X6eN9Zdj9sTkTAKbLgvqykFYHeXLmQSkfAJbLgvKEuSikfYo7tDikgIBTbcIxFjSXWJjtxFJJQCG+4A9eUpNrf2+l2GiMicC3S4l6VijGVzfpchIjLnAh3ur2uqpH90goFRPXJPRMIl0OFeU1q4r/tLBwZ8rkREZG4FOtzXNRbGuh/sH/W5EhGRuRXocF9aUwLAH3d1+1yJiMjcCnS4lyRiJGIR+kbU5y4i4RLocAe4bGWN+txFJHQCH+7rFlXwSvsgYxMaEiki4XHKcDezlJltMLPNZvaimX3Ba/9HM9tlZpu81/leu5nZt82sxcyeN7MLZ/kznNRZCzLk8o69ulJVREIkNoV1xoD1zrlBM4sDT5jZQ96yTzvn7j9q/auBVd7rjcDfe//6YkVd4QZiOzqGWFVf5lcZIiJz6pRH7q5g0JuNe6+TPQHjeuAeb7ungUoza5h+qWdmeW0h3HfpBmIiEiJT6nM3s6iZbQLagYedc894i77kdb3cbmZJr60R2Ddp81avzRdlqTh1ZUl2dgyeemURkYCYUrg753LOufOBJuBiM1sH3AacA1wEVAOfPZ0fbGa3mNlGM9vY0dFxelWfphW1pezUkbuIhMh
pjZZxzvUCjwJXOefavK6XMeAHwMXeavuBxZM2a/Lajn6vO5xzzc655rq6ujMqfqpW1GV05C4ioTKV0TJ1ZlbpTaeBK4GXDvWjm5kBNwBbvE0eAD7ojZq5BOhzzrXNQu1TtrKulJ7hLO0Dug2BiITDVEbLNAB3m1mUwpfBfc65B83sETOrAwzYBPyFt/6vgGuAFmAY+NCMV32aFpSnAGg5OMiCspTP1YiIzL5Thrtz7nngguO0rz/B+g74+PRLmzlrG8oBeKV9kMvOqvW5GhGR2Rf4K1ShMBwyHjWe2tHldykiInMiFOEejRgr6zLs7NRJVREJh1CEO8D6cxbw8kHdY0ZEwiE04d5QmQbglYM6eheR4AtNuF++sgaArW39PlciIjL7QhPuS2tKScUjbFO4i0gIhCbcoxHjrAUZdcuISCiEJtwBzm2oYMurfRSG4ouIBFeown1dUwW9w1n26MEdIhJwoQr3cxYWHtbx3L4enysREZldoQr3CxZXkohF2PqqTqqKSLCFKtxj0Qgr6zI8/vLs3j9eRMRvoQp3gMbKNK09I4xP5P0uRURk1oQu3N/b3MTweI5ndukmYiISXKEL98u8K1X/9tEWnysREZk9oQv3slScxso0G3Z1MzKum4iJSDCFLtwBPn/tGvIONrf2+l2KiMisCGW4X7qi0DXz7B6NdxeRYApluFeVJlhRV8pzexXuIhJMoQx3gAuXVPHY9g7dZ0ZEAim04X5eYwUTeceLulpVRAIotOH+b16/CIAfPLnb30JERGZBaMO9ujTBitpSHt56wO9SRERmXGjDHeCd6xbSPzpBe/+o36WIiMyoUIf7tec1APCYbiQmIgET6nA/d1E59eVJfqdwF5GACXW4mxmXrqjh6Z1dGhIpIoES6nAHePOqOjoHx9nc2ud3KSIiMyb04f72NfXEIsZDW9r8LkVEZMaEPtwrSuJcflYtv3pB4S4iwRH6cAe4Ys0C9nWPsKNj0O9SRERmhMIduGJNPQCPbGv3uRIRkZmhcKfwXNWz68t4eNtBv0sREZkRpwx3M0uZ2QYz22xmL5rZF7z25Wb2jJm1mNm9Zpbw2pPefIu3fNksf4YZceXaejbs6qatb8TvUkREpm0qR+5jwHrn3OuB84GrzOwS4MvA7c65s4Ae4GZv/ZuBHq/9dm+9ee8d5xa6Zp5s0YOzRaT4nTLcXcGhM41x7+WA9cD9XvvdwA3e9PXePN7yK8zMZqrg2XLuogqWVJdw3x/3+V2KiMi0TanP3cyiZrYJaAceBnYAvc65CW+VVqDRm24E9gF4y/uAmuO85y1mttHMNnZ0+H/5fzRi3HjxYjbs7qalfcDvckREpmVK4e6cyznnzgeagIuBc6b7g51zdzjnmp1zzXV1ddN9uxnxvubFJKIRfvzMXr9LERGZltMaLeOc6wUeBS4FKs0s5i1qAvZ70/uBxQDe8gqgKDqyazJJ3nFuPf/3uf2MZnN+lyMicsamMlqmzswqvek0cCWwjULIv8db7SbgF970A9483vJHXBHdlev9Fy+hdzjL1369XTcTE5GiNZUj9wbgUTN7Hvgj8LBz7kHgs8CnzKyFQp/6nd76dwI1XvungFtnvuzZc9nKGhoqUnz/iV38/pVOv8sRETkjNh+OTpubm93GjRv9LuOw3uFx3vKVR1m5IMPPPnoZRTDYR0RCyMyedc41H2+ZrlA9jsqSBLdevYbn9vbyr7olgYgUIYX7Cby3uYnltaV88V+26uSqiBQdhfsJxKMR/vq6tezuGua+jbqwSUSKi8L9JN56dh3NS6v42q+3s/2ALmwSkeKhcD8JM+OL715HNue46a4NdA6O+V2SiMiUKNxP4ZyF5fz4I2+kY3CMz9z/vN/liIhMicJ9Ci5cUsWnrlzNIy+184cdGvsuIvOfwn2Kbn7TchZVpPhvP99CNpf3uxwRkZNSuE9RKh7lC9evY2fHEHc+scvvckRETkrhfhrevmYBV66t56u/3s6GXd1
+lyMickIK99NgZnzxhnVUpuPcdNcG/tCi/ncRmZ8U7qepvjzFD29+IxGDD9y1gef29vhdkojIMRTuZ2DtonL+3yffwsLyFDfdtYGXD+oCJxGZXxTuZ2hxdQl3/ceLyDt413ef4AdP6iSriMwfCvdpOHthGT/72GWUJmJ84ZdbueWejbz4ap/fZYmIKNyna3V9GU9/7go++taV/GbrQa799hP8ZIOewSoi/lK4z4B4NMJnrzqHB//yTSyvLeXWn73AF375Irm8/w9CEZFwUrjPoHWNFTz0iTfzjrX1/ODJ3dx89x8ZGde94EVk7incZ1gqHuV7H3gDf7n+LB7b3sG7/+5Jfv7cfvI6iheROaRwnwVmxl+942y+++8uYG/3MJ+8dxO3/PBZPdFJROaMwn0WXfe6RWz5m3dy40WL+e22g1z3nSfY1z3sd1kiEgIK91kWiRj/69++jv9+3Vr2dA1xxTce5+u/2a6jeBGZVQr3OXLzm5bz0CfewkXLqvjOIy28+SuP8otN+3FOffEiMvMU7nPorAUZfvjhN/LV97wOAz7xk0185J6N6qoRkRmncJ9jkYjx3ubFPPHZ9Xz8bSt55KV21n/9Mb73+A4GRrN+lyciAaFw90kiFuHT7zyHn3/8cs5rrOB/PvQSb/vaY9z3x31+lyYiAWDzoc+3ubnZbdy40e8yfPX4yx18+aGX2NrWTyIW4YpzFvDuCxp5+5p6IhHzuzwRmYfM7FnnXPPxlsXmuhg5vj9bXcflK2u4d+M+HtnWzqPb23loywFW12e45rwGVtZliEaMiBnRiBGLGJGIETUjHjXOX1JJMhb1+2OIyDyhI/d5anwiz70b9/Gjp/awfQr3i88kYzRUpHhvcxMfuGQZ6YSCXiToTnbkrnCf55xz7OseYWwiR845cnlHPo83nSeXh92dQ2xu7WXTvl5efLWfTDLGu85fxJ83L+b1TRWYqVtHJIgU7iHyh5ZO7npyF4+/3EE251hcneaqcxfy7guaWLuo3O/yRGQGKdxD6GD/KL/c/CoPbz3IM7u6AVhRW8p1r1/EO9bWs7q+jERMg6VEipnCPeQO9BWC/qd/amX7wQGcg+rSBG89u46r1zVw0bIqKksSfpcpIqdpWuFuZouBe4B6wAF3OOe+ZWZ/A3wE6PBW/Zxz7lfeNrcBNwM54L865359sp+hcJ87e7uGeWpnJ7/d1s5j29vJ5hzJWIRV9RkuX1lLRUmc5qXVvGFpFVENwRSZ16Yb7g1Ag3PuT2ZWBjwL3AD8OTDonPvaUeuvBf4ZuBhYBPwWWO2cO+GdshTu/ugaHGPTvl5+93IHT+7ooqV98PCykkSUt52zgDefVUvzsipW1mV0YlZknpnWOHfnXBvQ5k0PmNk2oPEkm1wP/MQ5NwbsMrMWCkH/1GlXLrOqJpPkijX1XLGmHoB83tE7kuXhrQd4/OUOnmzp5F+ebwOgvjxJVUmCRCxCPBqhPBXj8rNqWVVfxiUrqjXGXmSeOa2LmMxsGXAB8AxwOfBfzOyDwEbgr5xzPRSC/+lJm7VynC8DM7sFuAVgyZIlZ1K7zLBIxKguTfC+i5bwvouWkM87nt/fxx93dbOptZfsRJ5sLk8259h+YIBHt3cc3nZNQzlvOquGt6yu49xFFVSXqg9fxE9TPqFqZhngceBLzrmfmVk90EmhH/5/UOi6+bCZfRd42jn3I2+7O4GHnHP3n+i91S1TfPJ5x8GBUTbv6+O5fT387uVOWtoHyOYK/z+tayynMp0gGYuQSkS5eFk1y2tLWdeo4BeZKdO+/YCZxYGfAj92zv0MwDl3cNLy/w086M3uBxZP2rzJa5MAiUSMhoo0DRVprlq3kNuuhqGxCX7/Sgcv7O9j075eRrI5ekfGae8fO9y9E4sYNZkE5ak4l66sYdWCDJeurGV5balO4IrMoFOGuxXOot0JbHPOfWNSe4PXHw/wbmCLN/0A8E9m9g0KJ1RXARtmtGqZl0qTMa5a18BV6xq
OaM/lHS3tg3QNjvHUzi7a+8fY3zvCD5/ew6E/HONR4w1Lq6hMJwr30IkYUYOIvXYPnUjEiEagPBXnbecsYGVdRn8FiJzAVI7cLwc+ALxgZpu8ts8B7zez8yl0y+wG/jOAc+5FM7sP2ApMAB8/2UgZCb5oxDh7YRlQxmVn1R5un8jl2ds9zNM7u2lpH+TpnV10D40XbrHg8P515PPOu91C4XYM3cPj/N1jO4gYrKjLUJqMUV0S55IVNTRWpWleWs3CipR/H1hkHtBFTFJ02vtHebGtn6d2dLG3a5iRbI4dHYO09owcXicWMZKxCKXJGOcvrmRRZZo3LK3ivMYKKtJxqnTELwGgK1Ql8JxzdA6Oc6BvlCdaOhkcyzKazdM1OMbm1j7a+kYYzeYPr7+kuoSVdaUsr81w4dJKqkoSrKrPsKBMR/xSPHQ/dwk8M6OuLEldWZLzmiqOWZ7N5Xl6ZxcdA2Mc7B9j875e9vUM8+SOLu56ctfh9WpKE5QmYywoS9K8rJrSRJRkPEIyFmVpTQnnLqqgLBUjGYvooi6Z1xTuEgrxaIQ3r6o7pr1/NEtr9wg9w+Ns2tfL/t4RRsZzvHxwgO/9bgcn+sO2Ih2neWkVNZkETVUlrG0op7YsSWU6zpLqEj09S3yncJdQK0/FWbsoDsDlk072QqGrZyLvGJ/IMzQ+wba2AXZ3DjE4NsGOjkFeaO3jxVf7OdA/esR2VSVx6stTlCZjNFWlWdtQTnk6TioeIZOMc15jBfXlSR35y6xSuIucgHmPMIxHI15XTYo/W33s0X/fSJbtBwYYHMtysH+MZ/f00DeSZXB0gqd2dPGLTa8e570pnPBNxFhZl2H1wgyliRhlqRiNVWkWlKVIJ6I0edMip0vhLjJNFek4Fy+vPjz//otfu52Gc46e4SxjEzlGs3l6h8cPh/9oNkf/yARbXu3jwefbGBnPMTaRP+b9V9dnWFiRpiwVo74sxaLKFMl4lGQsQmNlmtcvriST1K+yHEn/R4jMIjM76kKrUi5YUnXC9UezOfZ2D9M9NM7A6ATb2vp5bm8PPcNZWruHeaSvnZHssZeNVKTjZJIxMskYCytSNFSkSMQixCIR4jGjsTLN6voyFpQlKU3GqM0kdUVwwCncReaRVDzK6vqyw/NXrq0/Ynk+7xgcn2A0m2NorHDid0fHIAf6Rhkez9E/kmVfzwhb2/qZ8G7yNpLNkcsfeWY4EY2wqDLFqvoyMskYqXiEukySRZVpajJJKtJxmqrS1Jen9CVQpBTuIkUkEjHKU3HKU3Eog+W1pafcxjnH7q5h9nYP0zM0zsBoltbeEXZ1DLGrc4jRiRwj43m6hsaOGR1UliycA0jFo2SSMZbWlLC8tpSSROELoSIdZ2FFirPry4hF9djG+UThLhJwZsby2tJTfhFkc3kO9I3SO5ylc7Bw/58t+/voHhpnJJtjYHSCBza/ysDoxHG3T8YipBNRakoTLK/NsLAiSToeJR2PUluWZHFVCY1VaeLRiPdcAKOqJEFcXwqzQuEuIkDhWoDF1SUsrj7xOs4572RwvnDXz+Fx9nQNT/oLIMeBvlF2dw3xp709jGZzjGRzJ7xewAwWlqdYVJkmk4xRmoxSVZJgSXUJTVUlpOIRUvEo1aUJltaUUJJQZE2V9pSITJmZHfUw9ZOfIIbCDeC6h8Z5pX2A7qFxsrk84xOFV8fAGK09IxzoH6V3JEtrzzA9w1m6h8aP+17VpQnS8SgliULgN1amaaouIeENWS0MW43SUJFmYUXq8PqpeDR05w4U7iIyq6KR124NMVWdg2N0DY4zms0xms3RPjDG7s4hDg4UThwPjU3QM5Tl9y2ddAyMTek9y1Mx6stTrKgrJRGLkvC6h8pSMRaWp1hYkaKyJE4yFiERLdx2orIkTmW68HjJYqNwF5F5pzaTpDYztS8D5xy5vCObc4zn8vSPZGnrG6Wtb4S+kSwj4zmGxwtdSK09I+zuHGbc++thbCLPwGj2uNc
XHGIGS6tLiEcjxKKFcwWxiBGLRKgqjVNXlqS6JEFFSeHJY4lYhKT3pVGRTtBUlaY2kyRizOlVyQp3ESlqZkYsasSikCZKRTrO4uqSKW9/6EKztr4R+kcmGM/lyU68dk6hrW+Uvd3DTOQKt6OYyOeZyDmyuTy7OofYsKub3pHsCc8rTFb4YogUvhyiRiYV46ZLl/Gf3rxiGnvg+BTuIhJqhy40m85TvXJ5x+DoBGO5HGPZPOO5PH0jWfpGsuzpHKJvZIJcPk82f+ivjDy5fOHk9Ol0V50OhbuIyDRFI0ZFSRyIH7vw7DkvB4DiO0sgIiKnpHAXEQkghbuISAAp3EVEAkjhLiISQAp3EZEAUriLiASQwl1EJIDMTeWa2dkuwqwD2HOGm9cCnTNYTlBovxxL++RY2ifHKqZ9stQ5d+xT25kn4T4dZrbROdfsdx3zjfbLsbRPjqV9cqyg7BN1y4iIBJDCXUQkgIIQ7nf4XcA8pf1yLO2TY2mfHCsQ+6To+9xFRORYQThyFxGRoyjcRUQCqKjD3cyuMrPtZtZiZrf6Xc9MM7O7zKzdzLZMaqs2s4fN7BXv3yqv3czs296+eN7MLpy0zU3e+q+Y2U2T2t9gZi9423zb5vIBj2fIzBab2aNmttXMXjSzT3jtod0vZpYysw1mttnbJ1/w2peb2TPe57jXzBJee9Kbb/GWL5v0Xrd57dvN7J2T2ovyd83Momb2nJk96M2HZ58454ryBUSBHcAKIAFsBtb6XdcMf8a3ABcCWya1fQW41Zu+FfiyN30N8BBgwCXAM157NbDT+7fKm67ylm3w1jVv26v9/sxT2CcNwIXedBnwMrA2zPvFqzPjTceBZ7z67wNu9Nr/AfioN/0x4B+86RuBe73ptd7vURJY7v1+RYv5dw34FPBPwIPefGj2STEfuV8MtDjndjrnxoGfANf7XNOMcs79Dug+qvl64G5v+m7ghknt97iCp4FKM2sA3gk87Jzrds71AA8DV3nLyp1zT7vC/8X3THqvecs51+ac+5M3PQBsAxoJ8X7xPtugN3voWW8OWA/c77UfvU8O7av7gSu8v06uB37inBtzzu0CWij8nhXl75qZNQHXAt/35o0Q7ZNiDvdGYN+k+VavLejqnXNt3vQBoN6bPtH+OFl763Hai4b3p/MFFI5UQ71fvO6HTUA7hS+qHUCvc27CW2Xy5zj82b3lfUANp7+v5rtvAp8B8t58DSHaJ8Uc7qHnHVmGciyrmWWAnwKfdM71T14Wxv3inMs5584HmigcVZ7jb0X+MrPrgHbn3LN+1+KXYg73/cDiSfNNXlvQHfS6DvD+bffaT7Q/TtbedJz2ec/M4hSC/cfOuZ95zaHfLwDOuV7gUeBSCl1QMW/R5M9x+LN7yyuALk5/X81nlwPvMrPdFLpM1gPfIkz7xO9O/zN9ATEKJ8GW89oJjXP9rmsWPucyjjyh+lWOPHH4FW/6Wo48cbjBa68GdlE4aVjlTVd7y44+cXiN3593CvvDKPSDf/Oo9tDuF6AOqPSm08DvgeuA/8ORJw8/5k1/nCNPHt7nTZ/LkScPd1I4cVjUv2vAW3nthGpo9onvBUzzP9o1FEZL7AA+73c9s/D5/hloA7IU+vRuptAP+K/AK8BvJwWSAX/r7YsXgOZJ7/NhCieCWoAPTWpvBrZ423wX74rl+fwC3kShy+V5YJP3uibM+wV4HfCct0+2AH/tta+g8EXV4oVa0mtPefMt3vIVk97r897n3s6kUULF/Lt2VLiHZp/o9gMiIgFUzH3uIiJyAgp3EZEAUriLiASQwl1EJIAU7iIiAaRwFxEJIIW7iEgA/X/oUOvlK5thTAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#挑出大多数人的区间仔细看看\n", + "plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True)[1000:45000])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以发现大多数人都是看250字以下的文章" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idwords_count
count250000.000000250000.000000
mean124999.500000205.830189
std72168.92798647.174030
min0.0000008.000000
25%62499.750000187.500000
50%124999.500000202.000000
75%187499.250000217.750000
max249999.0000003434.500000
\n", + "
" + ], + "text/plain": [ + " user_id words_count\n", + "count 250000.000000 250000.000000\n", + "mean 124999.500000 205.830189\n", + "std 72168.927986 47.174030\n", + "min 0.000000 8.000000\n", + "25% 62499.750000 187.500000\n", + "50% 124999.500000 202.000000\n", + "75% 187499.250000 217.750000\n", + "max 249999.000000 3434.500000" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#更加详细的参数\n", + "user_click_merge.groupby('user_id')['words_count'].mean().reset_index().describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 用户点击新闻的时间分析" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "#为了更好的可视化,这里把时间进行归一化操作\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "mm = MinMaxScaler()\n", + "user_click_merge['click_timestamp'] = mm.fit_transform(user_click_merge[['click_timestamp']])\n", + "user_click_merge['created_at_ts'] = mm.fit_transform(user_click_merge[['created_at_ts']])\n", + "\n", + "user_click_merge = user_click_merge.sort_values('click_timestamp')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
182499901623000.00000043201252552810.989186193
22499981609740.00000241121132552810.989092259
302499851609740.0000034117182882810.989092259
502499791623000.00000441171252222810.989186193
252499881609740.0000044117121217172810.989092259
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id click_timestamp click_environment \\\n", + "18 249990 162300 0.000000 4 \n", + "2 249998 160974 0.000002 4 \n", + "30 249985 160974 0.000003 4 \n", + "50 249979 162300 0.000004 4 \n", + "25 249988 160974 0.000004 4 \n", + "\n", + " click_deviceGroup click_os click_country click_region \\\n", + "18 3 20 1 25 \n", + "2 1 12 1 13 \n", + "30 1 17 1 8 \n", + "50 1 17 1 25 \n", + "25 1 17 1 21 \n", + "\n", + " click_referrer_type rank click_cnts category_id created_at_ts \\\n", + "18 2 5 5 281 0.989186 \n", + "2 2 5 5 281 0.989092 \n", + "30 2 8 8 281 0.989092 \n", + "50 2 2 2 281 0.989186 \n", + "25 2 17 17 281 0.989092 \n", + "\n", + " words_count \n", + "18 193 \n", + "2 259 \n", + "30 259 \n", + "50 193 \n", + "25 259 " + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_click_merge.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "def mean_diff_time_func(df, col):\n", + " df = pd.DataFrame(df, columns={col})\n", + " df['time_shift1'] = df[col].shift(1).fillna(0)\n", + " df['diff_time'] = abs(df[col] - df['time_shift1'])\n", + " return df['diff_time'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "# 点击时间差的平均值\n", + "mean_diff_click_time = user_click_merge.groupby('user_id')['click_timestamp', 'created_at_ts'].apply(lambda x: mean_diff_time_func(x, 'click_timestamp'))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(sorted(mean_diff_click_time.values, reverse=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从上图可以发现不同用户点击文章的时间差是有差异的" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "# 前后点击文章的创建时间差的平均值\n", + "mean_diff_created_time = user_click_merge.groupby('user_id')['click_timestamp', 'created_at_ts'].apply(lambda x: mean_diff_time_func(x, 'created_at_ts'))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(sorted(mean_diff_created_time.values, reverse=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从图中可以发现用户先后点击文章,文章的创建时间也是有差异的" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Looking in indexes: https://mirrors.aliyun.com/pypi/simple\n", + "Collecting gensim\n", + " Downloading https://mirrors.aliyun.com/pypi/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)\n", + "\u001b[K |████████████████████████████████| 24.2 MB 91.0 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: six>=1.5.0 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.15.0)\n", + "Requirement already satisfied: numpy>=1.11.3 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.19.1)\n", + "Requirement already satisfied: scipy>=0.18.1 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.5.4)\n", + "Requirement already satisfied: numpy>=1.11.3 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.19.1)\n", + "Collecting smart-open>=1.8.1\n", + " Downloading https://mirrors.aliyun.com/pypi/packages/e3/cf/6311dfb0aff3e295d63930dea72e3029800242cdfe0790478e33eccee2ab/smart_open-4.0.1.tar.gz (117 kB)\n", + "\u001b[K |████████████████████████████████| 117 kB 96.7 MB/s eta 0:00:01\n", + "\u001b[?25hBuilding wheels for collected packages: smart-open\n", + " Building wheel for smart-open (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for smart-open: filename=smart_open-4.0.1-py3-none-any.whl size=108249 sha256=50eb67320a58790e8b173971aeb6af7b636d48259d7c9de759612e58e334215b\n", + " Stored in directory: /home/admin/.cache/pip/wheels/c3/14/fc/a0e523e5d2f13d083ce0af09d4e2861d8e2ec65fc466fb1dff\n", + "Successfully built smart-open\n", + "Installing collected packages: smart-open, gensim\n", + "Successfully installed gensim-3.8.3 smart-open-4.0.1\n" + ] + } + ], + "source": [ + "# 安装gensim\n", + "!pip install gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "from gensim.models import Word2Vec\n", + "import logging, pickle\n", + "\n", + "# 需要注意这里模型只迭代了一次\n", + "def trian_item_word2vec(click_df, embed_size=16, save_name='item_w2v_emb.pkl', split_char=' '):\n", + " click_df = click_df.sort_values('click_timestamp')\n", + " # 只有转换成字符串才可以进行训练\n", + " click_df['click_article_id'] = click_df['click_article_id'].astype(str)\n", + " # 转换成句子的形式\n", + " docs = click_df.groupby(['user_id'])['click_article_id'].apply(lambda x: list(x)).reset_index()\n", + " docs = docs['click_article_id'].values.tolist()\n", + "\n", + " # 为了方便查看训练的进度,这里设定一个log信息\n", + " logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)\n", + "\n", + " # 这里的参数对训练得到的向量影响也很大,默认负采样为5\n", + " w2v = Word2Vec(docs, size=16, sg=1, window=5, seed=2020, workers=24, min_count=1, iter=10)\n", + " \n", + " # 保存成字典的形式\n", + " item_w2v_emb_dict = {k: w2v[k] for k in click_df['click_article_id']}\n", + " \n", + " return item_w2v_emb_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "item_w2v_emb_dict = trian_item_word2vec(user_click_merge)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_type
25667190841199197150704527612941171202
25668190841285298150704530292041171202
25669190841156624150704663888541171202
25670190841129029150704666888541171202
107739164226214800150713140246441171212
\n", + "
" + ], + "text/plain": [ + " user_id ... click_referrer_type\n", + "25667 190841 ... 2\n", + "25668 190841 ... 2\n", + "25669 190841 ... 2\n", + "25670 190841 ... 2\n", + "107739 164226 ... 2\n", + "\n", + "[5 rows x 9 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 随机选择5个用户,查看这些用户前后查看文章的相似性\n", + "sub_user_ids = np.random.choice(user_click_merge.user_id.unique(), size=15, replace=False)\n", + "sub_user_info = user_click_merge[user_click_merge['user_id'].isin(sub_user_ids)]\n", + "\n", + "sub_user_info.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# 上一个版本,这个函数使用的是赛题提供的词向量,但是由于给出的embedding并不是所有的数据的embedding,所以运行下面画图函数的时候会报keyerror的错误\n", + "# 为了防止出现这个错误,这里修改为使用word2vec训练得到的词向量进行可视化\n", + "def get_item_sim_list(df):\n", + " sim_list = []\n", + " item_list = df['click_article_id'].values\n", + " for i in range(0, len(item_list)-1):\n", + " emb1 = item_w2v_emb_dict[str(item_list[i])] # 需要注意的是word2vec训练时候使用的是str类型的数据\n", + " emb2 = item_w2v_emb_dict[str(item_list[i+1])]\n", + " sim_list.append(np.dot(emb1,emb2)/(np.linalg.norm(emb1)*(np.linalg.norm(emb2))))\n", + " sim_list.append(0)\n", + " return sim_list" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " user_id ... click_referrer_type\n", - "25667 190841 ... 2\n", - "25668 190841 ... 2\n", - "25669 190841 ... 2\n", - "25670 190841 ... 2\n", - "107739 164226 ... 2\n", - "\n", - "[5 rows x 9 columns]" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" + "source": [ + "for _, user_df in sub_user_info.groupby('user_id'):\n", + " item_sim_list = get_item_sim_list(user_df)\n", + " plt.plot(item_sim_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这里由于对词向量的训练迭代次数不是很多,所以看到的可视化结果不是很准确,可以训练更多次来观察具体的现象。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 总结\n", + "\n", + "通过数据分析的过程, 我们目前可以得到以下几点重要的信息, 这个对于我们进行后面的特征制作和分析非常有帮助:\n", + "1. 训练集和测试集的用户id没有重复,也就是测试集里面的用户模型是没有见过的\n", + "2. 训练集中用户最少的点击文章数是2, 而测试集里面用户最少的点击文章数是1\n", + "3. 用户对于文章存在重复点击的情况, 但这个都存在于训练集里面\n", + "4. 同一用户的点击环境存在不唯一的情况,后面做这部分特征的时候可以采用统计特征\n", + "5. 用户点击文章的次数有很大的区分度,后面可以根据这个制作衡量用户活跃度的特征\n", + "6. 文章被用户点击的次数也有很大的区分度,后面可以根据这个制作衡量文章热度的特征\n", + "7. 用户看的新闻,相关性是比较强的,所以往往我们判断用户是否对某篇文章感兴趣的时候, 在很大程度上会和他历史点击过的文章有关\n", + "8. 用户点击的文章字数有比较大的区别, 这个可以反映用户对于文章字数的区别\n", + "9. 
用户点击过的文章主题也有很大的区别, 这个可以反映用户的主题偏好\n", + "10.不同用户点击文章的时间差也会有所区别, 这个可以反映用户对于文章时效性的偏好\n", + "\n", + "所以根据上面的一些分析,可以更好的帮助我们后面做好特征工程, 充分挖掘数据的隐含信息。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "关于Datawhale: Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", + "\n", + "![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" + ] } - ], - "source": [ - "# 随机选择5个用户,查看这些用户前后查看文章的相似性\n", - "sub_user_ids = np.random.choice(user_click_merge.user_id.unique(), size=15, replace=False)\n", - "sub_user_info = user_click_merge[user_click_merge['user_id'].isin(sub_user_ids)]\n", - "\n", - "sub_user_info.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "# 上一个版本,这个函数使用的是赛题提供的词向量,但是由于给出的embedding并不是所有的数据的embedding,所以运行下面画图函数的时候会报keyerror的错误\n", - "# 为了防止出现这个错误,这里修改为使用word2vec训练得到的词向量进行可视化\n", - "def get_item_sim_list(df):\n", - " sim_list = []\n", - " item_list = df['click_article_id'].values\n", - " for i in range(0, len(item_list)-1):\n", - " emb1 = item_w2v_emb_dict[str(item_list[i])] # 需要注意的是word2vec训练时候使用的是str类型的数据\n", - " emb2 = item_w2v_emb_dict[str(item_list[i+1])]\n", - " sim_list.append(np.dot(emb1,emb2)/(np.linalg.norm(emb1)*(np.linalg.norm(emb2))))\n", - " sim_list.append(0)\n", - " return sim_list" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" + ], + "metadata": { + "kernelspec": { + "display_name": "Keras Code", + "language": "python", + "name": "dswipython" + }, + "language_info": { + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "tianchi_metadata": { + "competitions": [], + "datasets": [], + "description": "", + "notebookId": "130008", + "source": "dsw" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "278px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } - ], - "source": [ - "for _, user_df in sub_user_info.groupby('user_id'):\n", - " item_sim_list = get_item_sim_list(user_df)\n", - " plt.plot(item_sim_list)" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "这里由于对词向量的训练迭代次数不是很多,所以看到的可视化结果不是很准确,可以训练更多次来观察具体的现象。" - ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 总结\n", - "\n", - "通过数据分析的过程, 我们目前可以得到以下几点重要的信息, 这个对于我们进行后面的特征制作和分析非常有帮助:\n", - "1. 训练集和测试集的用户id没有重复,也就是测试集里面的用户模型是没有见过的\n", - "2. 训练集中用户最少的点击文章数是2, 而测试集里面用户最少的点击文章数是1\n", - "3. 用户对于文章存在重复点击的情况, 但这个都存在于训练集里面\n", - "4. 同一用户的点击环境存在不唯一的情况,后面做这部分特征的时候可以采用统计特征\n", - "5. 用户点击文章的次数有很大的区分度,后面可以根据这个制作衡量用户活跃度的特征\n", - "6. 文章被用户点击的次数也有很大的区分度,后面可以根据这个制作衡量文章热度的特征\n", - "7. 用户看的新闻,相关性是比较强的,所以往往我们判断用户是否对某篇文章感兴趣的时候, 在很大程度上会和他历史点击过的文章有关\n", - "8. 用户点击的文章字数有比较大的区别, 这个可以反映用户对于文章字数的区别\n", - "9. 用户点击过的文章主题也有很大的区别, 这个可以反映用户的主题偏好\n", - "10.不同用户点击文章的时间差也会有所区别, 这个可以反映用户对于文章时效性的偏好\n", - "\n", - "所以根据上面的一些分析,可以更好的帮助我们后面做好特征工程, 充分挖掘数据的隐含信息。" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "关于Datawhale: Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", - "\n", - "![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Keras Code", - "language": "python", - "name": "dswipython" - }, - "language_info": { - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python" - }, - "latex_envs": { - "LaTeX_envs_menu_present": true, - "autoclose": false, - "autocomplete": true, - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 1, - "hotkeys": { - "equation": "Ctrl-E", - "itemize": "Ctrl-I" - }, - "labels_anchors": false, - "latex_user_defs": false, - "report_style_numbering": false, - "user_envs_cfg": false - }, - "tianchi_metadata": { - "competitions": [], - "datasets": [], - "description": "", - 
"notebookId": "130008", - "source": "dsw" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "278px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" - }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " - } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git "a/docs/ch03/ch3.1/jupyter/\347\211\271\345\276\201\345\267\245\347\250\213.ipynb" "b/docs/ch03/ch3.1/jupyter/\347\211\271\345\276\201\345\267\245\347\250\213.ipynb" index f4e21cabc..d74eed156 100644 --- "a/docs/ch03/ch3.1/jupyter/\347\211\271\345\276\201\345\267\245\347\250\213.ipynb" +++ "b/docs/ch03/ch3.1/jupyter/\347\211\271\345\276\201\345\267\245\347\250\213.ipynb" @@ -1,1772 +1,1772 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 制作特征和标签, 转成监督学习问题\n", - "我们先捋一下基于原始的给定数据, 有哪些特征可以直接利用:\n", - "1. 文章的自身特征, category_id表示这文章的类型, created_at_ts表示文章建立的时间, 这个关系着文章的时效性, words_count是文章的字数, 一般字数太长我们不太喜欢点击, 也不排除有人就喜欢读长文。\n", - "2. 文章的内容embedding特征, 这个召回的时候用过, 这里可以选择使用, 也可以选择不用, 也可以尝试其他类型的embedding特征, 比如W2V等\n", - "3. 
用户的设备特征信息\n", - "\n", - "上面这些直接可以用的特征, 待做完特征工程之后, 直接就可以根据article_id或者是user_id把这些特征加入进去。 但是我们需要先基于召回的结果, 构造一些特征,然后制作标签,形成一个监督学习的数据集。

\n", - "构造监督数据集的思路, 根据召回结果, 我们会得到一个{user_id: [可能点击的文章列表]}形式的字典。 那么我们就可以对于每个用户, 每篇可能点击的文章构造一个监督测试集, 比如对于用户user1, 假设得到的他的召回列表{user1: [item1, item2, item3]}, 我们就可以得到三行数据(user1, item1), (user1, item2), (user1, item3)的形式, 这就是监督测试集时候的前两列特征。

\n", - "\n", - "构造特征的思路是这样, 我们知道每个用户的点击文章是与其历史点击的文章信息是有很大关联的, 比如同一个主题, 相似等等。 所以特征构造这块很重要的一系列特征**是要结合用户的历史点击文章信息**。我们已经得到了每个用户及点击候选文章的两列的一个数据集, 而我们的目的是要预测最后一次点击的文章, 比较自然的一个思路就是和其最后几次点击的文章产生关系, 这样既考虑了其历史点击文章信息, 又得离最后一次点击较近,因为新闻很大的一个特点就是注重时效性。 往往用户的最后一次点击会和其最后几次点击有很大的关联。 所以我们就可以对于每个候选文章, 做出与最后几次点击相关的特征如下:\n", - "1. 候选item与最后几次点击的相似性特征(embedding内积) --- 这个直接关联用户历史行为\n", - "2. 候选item与最后几次点击的相似性特征的统计特征 --- 统计特征可以减少一些波动和异常\n", - "3. 候选item与最后几次点击文章的字数差的特征 --- 可以通过字数看用户偏好\n", - "4. 候选item与最后几次点击的文章建立的时间差特征 --- 时间差特征可以看出该用户对于文章的实时性的偏好 \n", - "\n", - "\n", - "还需要考虑一下\n", - "**5. 如果使用了youtube召回的话, 我们还可以制作用户与候选item的相似特征**\n", - "\n", - "\n", - "\n", - "当然, 上面只是提供了一种基于用户历史行为做特征工程的思路, 大家也可以思维风暴一下,尝试一些其他的特征。 下面我们就实现上面的这些特征的制作, 下面的逻辑是这样:\n", - "1. 我们首先获得用户的最后一次点击操作和用户的历史点击, 这个基于我们的日志数据集做\n", - "2. 基于用户的历史行为制作特征, 这个会用到用户的历史点击表, 最后的召回列表, 文章的信息表和embedding向量\n", - "3. 制作标签, 形成最后的监督学习数据集" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 导包" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:00.341709Z", - "start_time": "2020-11-17T09:06:58.723900Z" - }, - "cell_style": "center", - "scrolled": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import pickle\n", - "from tqdm import tqdm\n", - "import gc, os\n", - "import logging\n", - "import time\n", - "import lightgbm as lgb\n", - "from gensim.models import Word2Vec\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# df节省内存函数" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:02.411005Z", - "start_time": "2020-11-17T09:07:02.397830Z" - } - }, - "outputs": [], - "source": [ - "# 节省内存的一个函数\n", - "# 减少内存\n", - "def reduce_mem(df):\n", - " starttime = time.time()\n", - " 
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", - " start_mem = df.memory_usage().sum() / 1024**2\n", - " for col in df.columns:\n", - " col_type = df[col].dtypes\n", - " if col_type in numerics:\n", - " c_min = df[col].min()\n", - " c_max = df[col].max()\n", - " if pd.isnull(c_min) or pd.isnull(c_max):\n", - " continue\n", - " if str(col_type)[:3] == 'int':\n", - " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", - " df[col] = df[col].astype(np.int8)\n", - " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", - " df[col] = df[col].astype(np.int16)\n", - " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", - " df[col] = df[col].astype(np.int32)\n", - " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", - " df[col] = df[col].astype(np.int64)\n", - " else:\n", - " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", - " df[col] = df[col].astype(np.float16)\n", - " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", - " df[col] = df[col].astype(np.float32)\n", - " else:\n", - " df[col] = df[col].astype(np.float64)\n", - " end_mem = df.memory_usage().sum() / 1024**2\n", - " print('-- Mem. 
usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,\n", - " 100*(start_mem-end_mem)/start_mem,\n", - " (time.time()-starttime)/60))\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:05.031436Z", - "start_time": "2020-11-17T09:07:05.026822Z" - } - }, - "outputs": [], - "source": [ - "data_path = './data_raw/'\n", - "save_path = './temp_results/'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 数据读取\n", - "\n", - "## 训练和验证集的划分\n", - "\n", - "划分训练和验证集的原因是为了在线下验证模型参数的好坏,为了完全模拟测试集,我们这里就在训练集中抽取部分用户的所有信息来作为验证集。提前做训练验证集划分的好处就是可以分解制作排序特征时的压力,一次性做整个数据集的排序特征可能时间会比较长。" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:07.230308Z", - "start_time": "2020-11-17T09:07:07.221081Z" - } - }, - "outputs": [], - "source": [ - "# all_click_df指的是训练集\n", - "# sample_user_nums 采样作为验证集的用户数量\n", - "def trn_val_split(all_click_df, sample_user_nums):\n", - " all_click = all_click_df\n", - " all_user_ids = all_click.user_id.unique()\n", - " \n", - " # replace=True表示可以重复抽样,反之不可以\n", - " sample_user_ids = np.random.choice(all_user_ids, size=sample_user_nums, replace=False) \n", - " \n", - " click_val = all_click[all_click['user_id'].isin(sample_user_ids)]\n", - " click_trn = all_click[~all_click['user_id'].isin(sample_user_ids)]\n", - " \n", - " # 将验证集中的最后一次点击给抽取出来作为答案\n", - " click_val = click_val.sort_values(['user_id', 'click_timestamp'])\n", - " val_ans = click_val.groupby('user_id').tail(1)\n", - " \n", - " click_val = click_val.groupby('user_id').apply(lambda x: x[:-1]).reset_index(drop=True)\n", - " \n", - " # 去除val_ans中某些用户只有一个点击数据的情况,如果该用户只有一个点击数据,又被分到ans中,\n", - " # 那么训练集中就没有这个用户的点击数据,出现用户冷启动问题,给自己模型验证带来麻烦\n", - " val_ans = val_ans[val_ans.user_id.isin(click_val.user_id.unique())] # 保证答案中出现的用户再验证集中还有\n", - " click_val = 
click_val[click_val.user_id.isin(val_ans.user_id.unique())]\n", - " \n", - " return click_trn, click_val, val_ans" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 获取历史点击和最后一次点击" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:19.202550Z", - "start_time": "2020-11-17T09:07:19.195766Z" - } - }, - "outputs": [], - "source": [ - "# 获取当前数据的历史点击和最后一次点击\n", - "def get_hist_and_last_click(all_click):\n", - " all_click = all_click.sort_values(by=['user_id', 'click_timestamp'])\n", - " click_last_df = all_click.groupby('user_id').tail(1)\n", - "\n", - " # 如果用户只有一个点击,hist为空了,会导致训练的时候这个用户不可见,此时默认泄露一下\n", - " def hist_func(user_df):\n", - " if len(user_df) == 1:\n", - " return user_df\n", - " else:\n", - " return user_df[:-1]\n", - "\n", - " click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)\n", - "\n", - " return click_hist_df, click_last_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取训练、验证及测试集" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:21.181211Z", - "start_time": "2020-11-17T09:07:21.171338Z" - } - }, - "outputs": [], - "source": [ - "def get_trn_val_tst_data(data_path, offline=True):\n", - " if offline:\n", - " click_trn_data = pd.read_csv(data_path+'train_click_log.csv') # 训练集用户点击日志\n", - " click_trn_data = reduce_mem(click_trn_data)\n", - " click_trn, click_val, val_ans = trn_val_split(click_trn_data, sample_user_nums)\n", - " else:\n", - " click_trn = pd.read_csv(data_path+'train_click_log.csv')\n", - " click_trn = reduce_mem(click_trn)\n", - " click_val = None\n", - " val_ans = None\n", - " \n", - " click_tst = pd.read_csv(data_path+'testA_click_log.csv')\n", - " \n", - " return click_trn, click_val, click_tst, val_ans" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取召回列表" - ] - 
}, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:23.210604Z", - "start_time": "2020-11-17T09:07:23.203652Z" - } - }, - "outputs": [], - "source": [ - "# 返回多路召回列表或者单路召回\n", - "def get_recall_list(save_path, single_recall_model=None, multi_recall=False):\n", - " if multi_recall:\n", - " return pickle.load(open(save_path + 'final_recall_items_dict.pkl', 'rb'))\n", - " \n", - " if single_recall_model == 'i2i_itemcf':\n", - " return pickle.load(open(save_path + 'itemcf_recall_dict.pkl', 'rb'))\n", - " elif single_recall_model == 'i2i_emb_itemcf':\n", - " return pickle.load(open(save_path + 'itemcf_emb_dict.pkl', 'rb'))\n", - " elif single_recall_model == 'user_cf':\n", - " return pickle.load(open(save_path + 'youtubednn_usercf_dict.pkl', 'rb'))\n", - " elif single_recall_model == 'youtubednn':\n", - " return pickle.load(open(save_path + 'youtube_u2i_dict.pkl', 'rb'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取各种Embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Word2Vec训练及gensim的使用\n", - "\n", - "Word2Vec主要思想是:一个词的上下文可以很好的表达出词的语义。通过无监督学习产生词向量的方式。word2vec中有两个非常经典的模型:skip-gram和cbow。\n", - "\n", - "- skip-gram:已知中心词预测周围词。\n", - "- cbow:已知周围词预测中心词。\n", - "![image-20201106225233086](http://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20201106225233086.png)\n", - "\n", - "在使用gensim训练word2vec的时候,有几个比较重要的参数\n", - "- size: 表示词向量的维度。\n", - "- window:决定了目标词会与多远距离的上下文产生关系。\n", - "- sg: 如果是0,则是CBOW模型,是1则是Skip-Gram模型。\n", - "- workers: 表示训练时候的线程数量\n", - "- min_count: 设置最小的\n", - "- iter: 训练时遍历整个数据集的次数\n", - "\n", - "**注意**\n", - "1. 训练的时候输入的语料库一定要是字符组成的二维数组,如:[['北', '京', '你', '好'], ['上', '海', '你', '好']]\n", - "2. 
使用模型的时候有一些默认值,可以通过在Jupyter里面通过`Word2Vec??`查看\n", - "\n", - "\n", - "下面是个简单的测试样例:\n", - "```\n", - "from gensim.models import Word2Vec\n", - "doc = [['30760', '157507'],\n", - " ['289197', '63746'],\n", - " ['36162', '168401'],\n", - " ['50644', '36162']]\n", - "w2v = Word2Vec(docs, size=12, sg=1, window=2, seed=2020, workers=2, min_count=1, iter=1)\n", - "\n", - "# 查看'30760'表示的词向量\n", - "w2v['30760']\n", - "```\n", - "\n", - "skip-gram和cbow的详细原理可以参考下面的博客:\n", - "- [word2vec原理(一) CBOW与Skip-Gram模型基础](https://www.cnblogs.com/pinard/p/7160330.html) \n", - "- [word2vec原理(二) 基于Hierarchical Softmax的模型](https://www.cnblogs.com/pinard/p/7160330.html) \n", - "- [word2vec原理(三) 基于Negative Sampling的模型](https://www.cnblogs.com/pinard/p/7249903.html) " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:26.676173Z", - "start_time": "2020-11-17T09:07:26.667926Z" - } - }, - "outputs": [], - "source": [ - "def trian_item_word2vec(click_df, embed_size=64, save_name='item_w2v_emb.pkl', split_char=' '):\n", - " click_df = click_df.sort_values('click_timestamp')\n", - " # 只有转换成字符串才可以进行训练\n", - " click_df['click_article_id'] = click_df['click_article_id'].astype(str)\n", - " # 转换成句子的形式\n", - " docs = click_df.groupby(['user_id'])['click_article_id'].apply(lambda x: list(x)).reset_index()\n", - " docs = docs['click_article_id'].values.tolist()\n", - "\n", - " # 为了方便查看训练的进度,这里设定一个log信息\n", - " logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)\n", - "\n", - " # 这里的参数对训练得到的向量影响也很大,默认负采样为5\n", - " w2v = Word2Vec(docs, size=16, sg=1, window=5, seed=2020, workers=24, min_count=1, iter=1)\n", - " \n", - " # 保存成字典的形式\n", - " item_w2v_emb_dict = {k: w2v[k] for k in click_df['click_article_id']}\n", - " pickle.dump(item_w2v_emb_dict, open(save_path + 'item_w2v_emb.pkl', 'wb'))\n", - " \n", - " return item_w2v_emb_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - 
"metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:27.285690Z", - "start_time": "2020-11-17T09:07:27.276646Z" - } - }, - "outputs": [], - "source": [ - "# 可以通过字典查询对应的item的Embedding\n", - "def get_embedding(save_path, all_click_df):\n", - " if os.path.exists(save_path + 'item_content_emb.pkl'):\n", - " item_content_emb_dict = pickle.load(open(save_path + 'item_content_emb.pkl', 'rb'))\n", - " else:\n", - " print('item_content_emb.pkl 文件不存在...')\n", - " \n", - " # w2v Embedding是需要提前训练好的\n", - " if os.path.exists(save_path + 'item_w2v_emb.pkl'):\n", - " item_w2v_emb_dict = pickle.load(open(save_path + 'item_w2v_emb.pkl', 'rb'))\n", - " else:\n", - " item_w2v_emb_dict = trian_item_word2vec(all_click_df)\n", - " \n", - " if os.path.exists(save_path + 'item_youtube_emb.pkl'):\n", - " item_youtube_emb_dict = pickle.load(open(save_path + 'item_youtube_emb.pkl', 'rb'))\n", - " else:\n", - " print('item_youtube_emb.pkl 文件不存在...')\n", - " \n", - " if os.path.exists(save_path + 'user_youtube_emb.pkl'):\n", - " user_youtube_emb_dict = pickle.load(open(save_path + 'user_youtube_emb.pkl', 'rb'))\n", - " else:\n", - " print('user_youtube_emb.pkl 文件不存在...')\n", - " \n", - " return item_content_emb_dict, item_w2v_emb_dict, item_youtube_emb_dict, user_youtube_emb_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取文章信息" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:28.391797Z", - "start_time": "2020-11-17T09:07:28.386650Z" - } - }, - "outputs": [], - "source": [ - "def get_article_info_df():\n", - " article_info_df = pd.read_csv(data_path + 'articles.csv')\n", - " article_info_df = reduce_mem(article_info_df)\n", - " \n", - " return article_info_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取数据" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-11-17T09:07:32.362045Z", - "start_time": "2020-11-17T09:07:29.490413Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-- Mem. usage decreased to 23.34 Mb (69.4% reduction),time spend:0.00 min\n" - ] - } - ], - "source": [ - "# 这里offline的online的区别就是验证集是否为空\n", - "click_trn, click_val, click_tst, val_ans = get_trn_val_tst_data(data_path, offline=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:11:10.378966Z", - "start_time": "2020-11-17T09:07:32.468580Z" - } - }, - "outputs": [], - "source": [ - "click_trn_hist, click_trn_last = get_hist_and_last_click(click_trn)\n", - "\n", - "if click_val is not None:\n", - " click_val_hist, click_val_last = click_val, val_ans\n", - "else:\n", - " click_val_hist, click_val_last = None, None\n", - " \n", - "click_tst_hist = click_tst" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 对训练数据做负采样\n", - "\n", - "通过召回我们将数据转换成三元组的形式(user1, item1, label)的形式,观察发现正负样本差距极度不平衡,我们可以先对负样本进行下采样,下采样的目的一方面缓解了正负样本比例的问题,另一方面也减小了我们做排序特征的压力,我们在做负采样的时候又有哪些东西是需要注意的呢?\n", - "\n", - "1. 只对负样本进行下采样(如果有比较好的正样本扩充的方法其实也是可以考虑的)\n", - "2. 负采样之后,保证所有的用户和文章仍然出现在采样之后的数据中\n", - "3. 下采样的比例可以根据实际情况人为的控制\n", - "4. 
做完负采样之后,更新此时新的用户召回文章列表,因为后续做特征的时候可能用到相对位置的信息。\n", - "\n", - "其实负采样也可以留在后面做完特征在进行,这里由于做排序特征太慢了,所以把负采样的环节提到前面了。" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:11:36.096678Z", - "start_time": "2020-11-17T09:11:36.090911Z" - } - }, - "outputs": [], - "source": [ - "# 将召回列表转换成df的形式\n", - "def recall_dict_2_df(recall_list_dict):\n", - " df_row_list = [] # [user, item, score]\n", - " for user, recall_list in tqdm(recall_list_dict.items()):\n", - " for item, score in recall_list:\n", - " df_row_list.append([user, item, score])\n", - " \n", - " col_names = ['user_id', 'sim_item', 'score']\n", - " recall_list_df = pd.DataFrame(df_row_list, columns=col_names)\n", - " \n", - " return recall_list_df" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:11:37.668844Z", - "start_time": "2020-11-17T09:11:37.659774Z" - } - }, - "outputs": [], - "source": [ - "# 负采样函数,这里可以控制负采样时的比例, 这里给了一个默认的值\n", - "def neg_sample_recall_data(recall_items_df, sample_rate=0.001):\n", - " pos_data = recall_items_df[recall_items_df['label'] == 1]\n", - " neg_data = recall_items_df[recall_items_df['label'] == 0]\n", - " \n", - " print('pos_data_num:', len(pos_data), 'neg_data_num:', len(neg_data), 'pos/neg:', len(pos_data)/len(neg_data))\n", - " \n", - " # 分组采样函数\n", - " def neg_sample_func(group_df):\n", - " neg_num = len(group_df)\n", - " sample_num = max(int(neg_num * sample_rate), 1) # 保证最少有一个\n", - " sample_num = min(sample_num, 5) # 保证最多不超过5个,这里可以根据实际情况进行选择\n", - " return group_df.sample(n=sample_num, replace=True)\n", - " \n", - " # 对用户进行负采样,保证所有用户都在采样后的数据中\n", - " neg_data_user_sample = neg_data.groupby('user_id', group_keys=False).apply(neg_sample_func)\n", - " # 对文章进行负采样,保证所有文章都在采样后的数据中\n", - " neg_data_item_sample = neg_data.groupby('sim_item', group_keys=False).apply(neg_sample_func)\n", - " \n", - " # 将上述两种情况下的采样数据合并\n", - " neg_data_new = 
neg_data_user_sample.append(neg_data_item_sample)\n", - " # 由于上述两个操作是分开的,可能将两个相同的数据给重复选择了,所以需要对合并后的数据进行去重\n", - " neg_data_new = neg_data_new.sort_values(['user_id', 'score']).drop_duplicates(['user_id', 'sim_item'], keep='last')\n", - " \n", - " # 将正样本数据合并\n", - " data_new = pd.concat([pos_data, neg_data_new], ignore_index=True)\n", - " \n", - " return data_new" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:11:39.481715Z", - "start_time": "2020-11-17T09:11:39.475144Z" - } - }, - "outputs": [], - "source": [ - "# 召回数据打标签\n", - "def get_rank_label_df(recall_list_df, label_df, is_test=False):\n", - " # 测试集是没有标签了,为了后面代码同一一些,这里直接给一个负数替代\n", - " if is_test:\n", - " recall_list_df['label'] = -1\n", - " return recall_list_df\n", - " \n", - " label_df = label_df.rename(columns={'click_article_id': 'sim_item'})\n", - " recall_list_df_ = recall_list_df.merge(label_df[['user_id', 'sim_item', 'click_timestamp']], \\\n", - " how='left', on=['user_id', 'sim_item'])\n", - " recall_list_df_['label'] = recall_list_df_['click_timestamp'].apply(lambda x: 0.0 if np.isnan(x) else 1.0)\n", - " del recall_list_df_['click_timestamp']\n", - " \n", - " return recall_list_df_" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:11:41.555566Z", - "start_time": "2020-11-17T09:11:41.546766Z" - } - }, - "outputs": [], - "source": [ - "def get_user_recall_item_label_df(click_trn_hist, click_val_hist, click_tst_hist,click_trn_last, click_val_last, recall_list_df):\n", - " # 获取训练数据的召回列表\n", - " trn_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_trn_hist['user_id'].unique())]\n", - " # 训练数据打标签\n", - " trn_user_item_label_df = get_rank_label_df(trn_user_items_df, click_trn_last, is_test=False)\n", - " # 训练数据负采样\n", - " trn_user_item_label_df = neg_sample_recall_data(trn_user_item_label_df)\n", - " \n", - " if click_val is not 
None:\n", - " val_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_val_hist['user_id'].unique())]\n", - " val_user_item_label_df = get_rank_label_df(val_user_items_df, click_val_last, is_test=False)\n", - " val_user_item_label_df = neg_sample_recall_data(val_user_item_label_df)\n", - " else:\n", - " val_user_item_label_df = None\n", - " \n", - " # 测试数据不需要进行负采样,直接对所有的召回商品进行打-1标签\n", - " tst_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_tst_hist['user_id'].unique())]\n", - " tst_user_item_label_df = get_rank_label_df(tst_user_items_df, None, is_test=True)\n", - " \n", - " return trn_user_item_label_df, val_user_item_label_df, tst_user_item_label_df" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T17:23:35.357045Z", - "start_time": "2020-11-17T17:23:12.378284Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [00:12<00:00, 20689.39it/s]\n" - ] - } - ], - "source": [ - "# 读取召回列表\n", - "recall_list_dict = get_recall_list(save_path, single_recall_model='i2i_itemcf') # 这里只选择了单路召回的结果,也可以选择多路召回结果\n", - "# 将召回数据转换成df\n", - "recall_list_df = recall_dict_2_df(recall_list_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T17:29:04.598214Z", - "start_time": "2020-11-17T17:23:40.001052Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "pos_data_num: 64190 neg_data_num: 1935810 pos/neg: 0.03315924600038227\n" - ] - } - ], - "source": [ - "# 给训练验证数据打标签,并负采样(这一部分时间比较久)\n", - "trn_user_item_label_df, val_user_item_label_df, tst_user_item_label_df = get_user_recall_item_label_df(click_trn_hist, \n", - " click_val_hist, \n", - " click_tst_hist,\n", - " click_trn_last, \n", - " click_val_last, \n", - " recall_list_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T17:23:11.642944Z", - "start_time": "2020-11-17T17:23:08.475Z" - }, - "scrolled": true - }, - "outputs": [], - "source": [ - "trn_user_item_label_df.label" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 将召回数据转换成字典" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T17:36:22.800449Z", - "start_time": "2020-11-17T17:36:22.794670Z" - } - }, - "outputs": [], - "source": [ - "# 将最终的召回的df数据转换成字典的形式做排序特征\n", - "def make_tuple_func(group_df):\n", - " row_data = []\n", - " for name, row_df in group_df.iterrows():\n", - " row_data.append((row_df['sim_item'], row_df['score'], row_df['label']))\n", - " \n", - " return row_data" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T17:40:05.991819Z", - "start_time": "2020-11-17T17:36:26.536429Z" - } - }, - "outputs": [], - "source": [ - "trn_user_item_label_tuples = trn_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", - "trn_user_item_label_tuples_dict = dict(zip(trn_user_item_label_tuples['user_id'], trn_user_item_label_tuples[0]))\n", - "\n", - "if val_user_item_label_df is not None:\n", - " val_user_item_label_tuples = val_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", - " val_user_item_label_tuples_dict = dict(zip(val_user_item_label_tuples['user_id'], val_user_item_label_tuples[0]))\n", - "else:\n", - " val_user_item_label_tuples_dict = None\n", - " \n", - "tst_user_item_label_tuples = tst_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", - "tst_user_item_label_tuples_dict = dict(zip(tst_user_item_label_tuples['user_id'], tst_user_item_label_tuples[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T07:59:53.141560Z", - "start_time": 
"2020-11-17T07:59:53.133599Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 特征工程" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 制作与用户历史行为相关特征\n", - "对于每个用户召回的每个商品, 做特征。 具体步骤如下:\n", - "* 对于每个用户, 获取最后点击的N个商品的item_id, \n", - " * 对于该用户的每个召回商品, 计算与上面最后N次点击商品的相似度的和(最大, 最小,均值), 时间差特征,相似性特征,字数差特征,与该用户的相似性特征" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T01:07:47.268035Z", - "start_time": "2020-11-18T01:07:47.250449Z" - } - }, - "outputs": [], - "source": [ - "# 下面基于data做历史相关的特征\n", - "def create_feature(users_id, recall_list, click_hist_df, articles_info, articles_emb, user_emb=None, N=1):\n", - " \"\"\"\n", - " 基于用户的历史行为做相关特征\n", - " :param users_id: 用户id\n", - " :param recall_list: 对于每个用户召回的候选文章列表\n", - " :param click_hist_df: 用户的历史点击信息\n", - " :param articles_info: 文章信息\n", - " :param articles_emb: 文章的embedding向量, 这个可以用item_content_emb, item_w2v_emb, item_youtube_emb\n", - " :param user_emb: 用户的embedding向量, 这个是user_youtube_emb, 如果没有也可以不用, 但要注意如果要用的话, articles_emb就要用item_youtube_emb的形式, 这样维度才一样\n", - " :param N: 最近的N次点击 由于testA日志里面很多用户只存在一次历史点击, 所以为了不产生空值,默认是1\n", - " \"\"\"\n", - " \n", - " # 建立一个二维列表保存结果, 后面要转成DataFrame\n", - " all_user_feas = []\n", - " i = 0\n", - " for user_id in tqdm(users_id):\n", - " # 该用户的最后N次点击\n", - " hist_user_items = click_hist_df[click_hist_df['user_id']==user_id]['click_article_id'][-N:]\n", - " \n", - " # 遍历该用户的召回列表\n", - " for rank, (article_id, score, label) in enumerate(recall_list[user_id]):\n", - " # 该文章建立时间, 字数\n", - " a_create_time = articles_info[articles_info['article_id']==article_id]['created_at_ts'].values[0]\n", - " a_words_count = articles_info[articles_info['article_id']==article_id]['words_count'].values[0]\n", - " single_user_fea = [user_id, article_id]\n", - " # 计算与最后点击的商品的相似度的和, 最大值和最小值, 均值\n", - " sim_fea = []\n", - " time_fea = []\n", - " 
word_fea = []\n", - " # 遍历用户的最后N次点击文章\n", - " for hist_item in hist_user_items:\n", - " b_create_time = articles_info[articles_info['article_id']==hist_item]['created_at_ts'].values[0]\n", - " b_words_count = articles_info[articles_info['article_id']==hist_item]['words_count'].values[0]\n", - " \n", - " sim_fea.append(np.dot(articles_emb[hist_item], articles_emb[article_id]))\n", - " time_fea.append(abs(a_create_time-b_create_time))\n", - " word_fea.append(abs(a_words_count-b_words_count))\n", - " \n", - " single_user_fea.extend(sim_fea) # 相似性特征\n", - " single_user_fea.extend(time_fea) # 时间差特征\n", - " single_user_fea.extend(word_fea) # 字数差特征\n", - " single_user_fea.extend([max(sim_fea), min(sim_fea), sum(sim_fea), sum(sim_fea) / len(sim_fea)]) # 相似性的统计特征\n", - " \n", - " if user_emb: # 如果用户向量有的话, 这里计算该召回文章与用户的相似性特征 \n", - " single_user_fea.append(np.dot(user_emb[user_id], articles_emb[article_id]))\n", - " \n", - " single_user_fea.extend([score, rank, label]) \n", - " # 加入到总的表中\n", - " all_user_feas.append(single_user_fea)\n", - " \n", - " # 定义列名\n", - " id_cols = ['user_id', 'click_article_id']\n", - " sim_cols = ['sim' + str(i) for i in range(N)]\n", - " time_cols = ['time_diff' + str(i) for i in range(N)]\n", - " word_cols = ['word_diff' + str(i) for i in range(N)]\n", - " sat_cols = ['sim_max', 'sim_min', 'sim_sum', 'sim_mean']\n", - " user_item_sim_cols = ['user_item_sim'] if user_emb else []\n", - " user_score_rank_label = ['score', 'rank', 'label']\n", - " cols = id_cols + sim_cols + time_cols + word_cols + sat_cols + user_item_sim_cols + user_score_rank_label\n", - " \n", - " # 转成DataFrame\n", - " df = pd.DataFrame( all_user_feas, columns=cols)\n", - " \n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T01:08:17.531694Z", - "start_time": "2020-11-18T01:08:10.754702Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-- Mem. 
usage decreased to 5.56 Mb (50.0% reduction),time spend:0.00 min\n" - ] - } - ], - "source": [ - "article_info_df = get_article_info_df()\n", - "all_click = click_trn.append(click_tst)\n", - "item_content_emb_dict, item_w2v_emb_dict, item_youtube_emb_dict, user_youtube_emb_dict = get_embedding(save_path, all_click)" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:06:22.709350Z", - "start_time": "2020-11-18T01:08:39.923811Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 200000/200000 [50:16<00:00, 66.31it/s] \n", - "100%|██████████| 50000/50000 [1:07:21<00:00, 12.37it/s]\n" - ] - } - ], - "source": [ - "# 获取训练验证及测试数据中召回列文章相关特征\n", - "trn_user_item_feats_df = create_feature(trn_user_item_label_tuples_dict.keys(), trn_user_item_label_tuples_dict, \\\n", - " click_trn_hist, article_info_df, item_content_emb_dict)\n", - "\n", - "if val_user_item_label_tuples_dict is not None:\n", - " val_user_item_feats_df = create_feature(val_user_item_label_tuples_dict.keys(), val_user_item_label_tuples_dict, \\\n", - " click_val_hist, article_info_df, item_content_emb_dict)\n", - "else:\n", - " val_user_item_feats_df = None\n", - " \n", - "tst_user_item_feats_df = create_feature(tst_user_item_label_tuples_dict.keys(), tst_user_item_label_tuples_dict, \\\n", - " click_tst_hist, article_info_df, item_content_emb_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:13:58.573422Z", - "start_time": "2020-11-18T03:13:40.157228Z" - } - }, - "outputs": [], - "source": [ - "# 保存一份省的每次都要重新跑,每次跑的时间都比较长\n", - "trn_user_item_feats_df.to_csv(save_path + 'trn_user_item_feats_df.csv', index=False)\n", - "\n", - "if val_user_item_feats_df is not None:\n", - " val_user_item_feats_df.to_csv(save_path + 'val_user_item_feats_df.csv', index=False)\n", - "\n", - 
"tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=False) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:14:22.838154Z", - "start_time": "2020-11-18T03:14:22.828212Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 用户和文章特征\n", - "### 用户相关特征\n", - "这一块,正式进行特征工程,既要拼接上已有的特征, 也会做更多的特征出来,我们来梳理一下已有的特征和可构造特征:\n", - "1. 文章自身的特征, 文章字数,文章创建时间, 文章的embedding (articles表中)\n", - "2. 用户点击环境特征, 那些设备的特征(这个在df中)\n", - "3. 对于用户和商品还可以构造的特征:\n", - " * 基于用户的点击文章次数和点击时间构造可以表现用户活跃度的特征\n", - " * 基于文章被点击次数和时间构造可以反映文章热度的特征\n", - " * 用户的时间统计特征: 根据其点击的历史文章列表的点击时间和文章的创建时间做统计特征,比如求均值, 这个可以反映用户对于文章时效的偏好\n", - " * 用户的主题爱好特征, 对于用户点击的历史文章主题进行一个统计, 然后对于当前文章看看是否属于用户已经点击过的主题\n", - " * 用户的字数爱好特征, 对于用户点击的历史文章的字数统计, 求一个均值" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-14T03:16:37.637495Z", - "start_time": "2020-11-14T03:16:37.618229Z" - } - }, - "outputs": [], - "source": [ - "click_tst.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:09:11.675550Z", - "start_time": "2020-11-17T02:09:10.265134Z" - } - }, - "outputs": [], - "source": [ - "# 读取文章特征\n", - "articles = pd.read_csv(data_path+'articles.csv')\n", - "articles = reduce_mem(articles)\n", - "\n", - "# 日志数据,就是前面的所有数据\n", - "if click_val is not None:\n", - " all_data = click_trn.append(click_val)\n", - "all_data = click_trn.append(click_tst)\n", - "all_data = reduce_mem(all_data)\n", - "\n", - "# 拼上文章信息\n", - "all_data = all_data.merge(articles, left_on='click_article_id', right_on='article_id')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-14T03:17:12.256244Z", - "start_time": "2020-11-14T03:17:12.250452Z" - } - }, - "outputs": [], - "source": [ 
- "all_data.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 分析一下点击时间和点击文章的次数,区分用户活跃度\n", - "如果某个用户点击文章之间的时间间隔比较小, 同时点击的文章次数很多的话, 那么我们认为这种用户一般就是活跃用户, 当然衡量用户活跃度的方式可能多种多样, 这里我们只提供其中一种,我们写一个函数, 得到可以衡量用户活跃度的特征,逻辑如下:\n", - "1. 首先根据用户user_id分组, 对于每个用户,计算点击文章的次数, 两两点击文章时间间隔的均值\n", - "2. 把点击次数取倒数和时间间隔的均值统一归一化,然后两者相加合并,该值越小, 说明用户越活跃\n", - "3. 注意, 上面两两点击文章的时间间隔均值, 会出现如果用户只点击了一次的情况,这时候时间间隔均值那里会出现空值, 对于这种情况最后特征那里给个大数进行区分\n", - "\n", - "这个的衡量标准就是先把点击的次数取到数然后归一化, 然后点击的时间差归一化, 然后两者相加进行合并, 该值越小, 说明被点击的次数越多, 且间隔时间短。 " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:28:55.336058Z", - "start_time": "2020-11-17T02:28:55.324332Z" - } - }, - "outputs": [], - "source": [ - " def active_level(all_data, cols):\n", - " \"\"\"\n", - " 制作区分用户活跃度的特征\n", - " :param all_data: 数据集\n", - " :param cols: 用到的特征列\n", - " \"\"\"\n", - " data = all_data[cols]\n", - " data.sort_values(['user_id', 'click_timestamp'], inplace=True)\n", - " user_act = pd.DataFrame(data.groupby('user_id', as_index=False)[['click_article_id', 'click_timestamp']].\\\n", - " agg({'click_article_id':np.size, 'click_timestamp': {list}}).values, columns=['user_id', 'click_size', 'click_timestamp'])\n", - " \n", - " # 计算时间间隔的均值\n", - " def time_diff_mean(l):\n", - " if len(l) == 1:\n", - " return 1\n", - " else:\n", - " return np.mean([j-i for i, j in list(zip(l[:-1], l[1:]))])\n", - " \n", - " user_act['time_diff_mean'] = user_act['click_timestamp'].apply(lambda x: time_diff_mean(x))\n", - " \n", - " # 点击次数取倒数\n", - " user_act['click_size'] = 1 / user_act['click_size']\n", - " \n", - " # 两者归一化\n", - " user_act['click_size'] = (user_act['click_size'] - user_act['click_size'].min()) / (user_act['click_size'].max() - user_act['click_size'].min())\n", - " user_act['time_diff_mean'] = (user_act['time_diff_mean'] - user_act['time_diff_mean'].min()) / (user_act['time_diff_mean'].max() - user_act['time_diff_mean'].min()) 
\n", - " user_act['active_level'] = user_act['click_size'] + user_act['time_diff_mean']\n", - " \n", - " user_act['user_id'] = user_act['user_id'].astype('int')\n", - " del user_act['click_timestamp']\n", - " \n", - " return user_act" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:30:12.696060Z", - "start_time": "2020-11-17T02:29:01.523837Z" - } - }, - "outputs": [], - "source": [ - "user_act_fea = active_level(all_data, ['user_id', 'click_article_id', 'click_timestamp'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:28:53.996742Z", - "start_time": "2020-11-17T02:09:18.374Z" - } - }, - "outputs": [], - "source": [ - "user_act_fea.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 分析一下点击时间和被点击文章的次数, 衡量文章热度特征\n", - "和上面同样的思路, 如果一篇文章在很短的时间间隔之内被点击了很多次, 说明文章比较热门,实现的逻辑和上面的基本一致, 只不过这里是按照点击的文章进行分组:\n", - "1. 根据文章进行分组, 对于每篇文章的用户, 计算点击的时间间隔\n", - "2. 
将用户的数量取倒数, 然后用户的数量和时间间隔归一化, 然后相加得到热度特征, 该值越小, 说明被点击的次数越大且时间间隔越短, 文章比较热\n", - "\n", - "当然, 这只是给出一种判断文章热度的一种方法, 这里大家也可以头脑风暴一下" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:41:26.398567Z", - "start_time": "2020-11-17T02:41:26.386668Z" - } - }, - "outputs": [], - "source": [ - " def hot_level(all_data, cols):\n", - " \"\"\"\n", - " 制作衡量文章热度的特征\n", - " :param all_data: 数据集\n", - " :param cols: 用到的特征列\n", - " \"\"\"\n", - " data = all_data[cols]\n", - " data.sort_values(['click_article_id', 'click_timestamp'], inplace=True)\n", - " article_hot = pd.DataFrame(data.groupby('click_article_id', as_index=False)[['user_id', 'click_timestamp']].\\\n", - " agg({'user_id':np.size, 'click_timestamp': {list}}).values, columns=['click_article_id', 'user_num', 'click_timestamp'])\n", - " \n", - " # 计算被点击时间间隔的均值\n", - " def time_diff_mean(l):\n", - " if len(l) == 1:\n", - " return 1\n", - " else:\n", - " return np.mean([j-i for i, j in list(zip(l[:-1], l[1:]))])\n", - " \n", - " article_hot['time_diff_mean'] = article_hot['click_timestamp'].apply(lambda x: time_diff_mean(x))\n", - " \n", - " # 点击次数取倒数\n", - " article_hot['user_num'] = 1 / article_hot['user_num']\n", - " \n", - " # 两者归一化\n", - " article_hot['user_num'] = (article_hot['user_num'] - article_hot['user_num'].min()) / (article_hot['user_num'].max() - article_hot['user_num'].min())\n", - " article_hot['time_diff_mean'] = (article_hot['time_diff_mean'] - article_hot['time_diff_mean'].min()) / (article_hot['time_diff_mean'].max() - article_hot['time_diff_mean'].min()) \n", - " article_hot['hot_level'] = article_hot['user_num'] + article_hot['time_diff_mean']\n", - " \n", - " article_hot['click_article_id'] = article_hot['click_article_id'].astype('int')\n", - " \n", - " del article_hot['click_timestamp']\n", - " \n", - " return article_hot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - 
"end_time": "2020-11-17T02:41:44.635900Z", - "start_time": "2020-11-17T02:41:31.473032Z" - } - }, - "outputs": [], - "source": [ - "article_hot_fea = hot_level(all_data, ['user_id', 'click_article_id', 'click_timestamp']) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-14T03:19:54.775290Z", - "start_time": "2020-11-14T03:19:54.763699Z" - } - }, - "outputs": [], - "source": [ - "article_hot_fea.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的系列习惯\n", - "这个基于原来的日志表做一个类似于article的那种DataFrame, 存放用户特有的信息, 主要包括点击习惯, 爱好特征之类的\n", - "* 用户的设备习惯, 这里取最常用的设备(众数)\n", - "* 用户的时间习惯: 根据其点击过得历史文章的时间来做一个统计(这个感觉最好是把时间戳里的时间特征的h特征提出来,看看用户习惯一天的啥时候点击文章), 但这里先用转换的时间吧, 求个均值\n", - "* 用户的爱好特征, 对于用户点击的历史文章主题进行用户的爱好判别, 更偏向于哪几个主题, 这个最好是multi-hot进行编码, 先试试行不\n", - "* 用户文章的字数差特征, 用户的爱好文章的字数习惯\n", - "\n", - "这些就是对用户进行分组, 然后统计即可" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的设备习惯" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T04:22:48.877978Z", - "start_time": "2020-11-17T04:22:48.872049Z" - } - }, - "outputs": [], - "source": [ - "def device_fea(all_data, cols):\n", - " \"\"\"\n", - " 制作用户的设备特征\n", - " :param all_data: 数据集\n", - " :param cols: 用到的特征列\n", - " \"\"\"\n", - " user_device_info = all_data[cols]\n", - " \n", - " # 用众数来表示每个用户的设备信息\n", - " user_device_info = user_device_info.groupby('user_id').agg(lambda x: x.value_counts().index[0]).reset_index()\n", - " \n", - " return user_device_info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T05:27:10.897473Z", - "start_time": "2020-11-17T04:49:33.214865Z" - } - }, - "outputs": [], - "source": [ - "# 设备特征(这里时间会比较长)\n", - "device_cols = ['user_id', 'click_environment', 'click_deviceGroup', 'click_os', 'click_country', 'click_region', 
'click_referrer_type']\n", - "user_device_info = device_fea(all_data, device_cols)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-14T04:20:39.765842Z", - "start_time": "2020-11-14T04:20:39.747087Z" - } - }, - "outputs": [], - "source": [ - "user_device_info.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的时间习惯" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:11:50.889905Z", - "start_time": "2020-11-17T06:11:50.882653Z" - } - }, - "outputs": [], - "source": [ - "def user_time_hob_fea(all_data, cols):\n", - " \"\"\"\n", - " 制作用户的时间习惯特征\n", - " :param all_data: 数据集\n", - " :param cols: 用到的特征列\n", - " \"\"\"\n", - " user_time_hob_info = all_data[cols]\n", - " \n", - " # 先把时间戳进行归一化\n", - " mm = MinMaxScaler()\n", - " user_time_hob_info['click_timestamp'] = mm.fit_transform(user_time_hob_info[['click_timestamp']])\n", - " user_time_hob_info['created_at_ts'] = mm.fit_transform(user_time_hob_info[['created_at_ts']])\n", - "\n", - " user_time_hob_info = user_time_hob_info.groupby('user_id').agg('mean').reset_index()\n", - " \n", - " user_time_hob_info.rename(columns={'click_timestamp': 'user_time_hob1', 'created_at_ts': 'user_time_hob2'}, inplace=True)\n", - " return user_time_hob_info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:31:51.646110Z", - "start_time": "2020-11-17T06:31:51.171431Z" - } - }, - "outputs": [], - "source": [ - "user_time_hob_cols = ['user_id', 'click_timestamp', 'created_at_ts']\n", - "user_time_hob_info = user_time_hob_fea(all_data, user_time_hob_cols)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的主题爱好\n", - "这里先把用户点击的文章属于的主题转成一个列表, 后面再总的汇总的时候单独制作一个特征, 就是文章的主题如果属于这里面, 就是1, 否则就是0。" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:31:56.571088Z", - "start_time": "2020-11-17T06:31:56.565304Z" - } - }, - "outputs": [], - "source": [ - "def user_cat_hob_fea(all_data, cols):\n", - " \"\"\"\n", - " 用户的主题爱好\n", - " :param all_data: 数据集\n", - " :param cols: 用到的特征列\n", - " \"\"\"\n", - " user_category_hob_info = all_data[cols]\n", - " user_category_hob_info = user_category_hob_info.groupby('user_id').agg({list}).reset_index()\n", - " \n", - " user_cat_hob_info = pd.DataFrame()\n", - " user_cat_hob_info['user_id'] = user_category_hob_info['user_id']\n", - " user_cat_hob_info['cate_list'] = user_category_hob_info['category_id']\n", - " \n", - " return user_cat_hob_info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:32:55.150800Z", - "start_time": "2020-11-17T06:32:00.740046Z" - } - }, - "outputs": [], - "source": [ - "user_category_hob_cols = ['user_id', 'category_id']\n", - "user_cat_hob_info = user_cat_hob_fea(all_data, user_category_hob_cols)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的字数偏好特征" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:48:12.988460Z", - "start_time": "2020-11-17T06:48:12.547000Z" - } - }, - "outputs": [], - "source": [ - "user_wcou_info = all_data.groupby('user_id')['words_count'].agg('mean').reset_index()\n", - "user_wcou_info.rename(columns={'words_count': 'words_hbo'}, inplace=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的信息特征合并保存" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:48:18.289591Z", - "start_time": "2020-11-17T06:48:17.084408Z" - } - }, - "outputs": [], - "source": [ - "# 所有表进行合并\n", - "user_info = pd.merge(user_act_fea, user_device_info, on='user_id')\n", - "user_info = 
user_info.merge(user_time_hob_info, on='user_id')\n", - "user_info = user_info.merge(user_cat_hob_info, on='user_id')\n", - "user_info = user_info.merge(user_wcou_info, on='user_id')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:48:26.907785Z", - "start_time": "2020-11-17T06:48:21.457597Z" - } - }, - "outputs": [], - "source": [ - "# 这样用户特征以后就可以直接读取了\n", - "user_info.to_csv(save_path + 'user_info.csv', index=False) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户特征直接读入\n", - "如果前面关于用户的特征工程已经给做完了,后面可以直接读取" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:15:49.502826Z", - "start_time": "2020-11-18T03:15:48.062243Z" - } - }, - "outputs": [], - "source": [ - "# 把用户信息直接读入进来\n", - "user_info = pd.read_csv(save_path + 'user_info.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:15:56.899635Z", - "start_time": "2020-11-18T03:15:53.701818Z" - } - }, - "outputs": [], - "source": [ - "if os.path.exists(save_path + 'trn_user_item_feats_df.csv'):\n", - " trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')\n", - " \n", - "if os.path.exists(save_path + 'tst_user_item_feats_df.csv'):\n", - " tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')\n", - "\n", - "if os.path.exists(save_path + 'val_user_item_feats_df.csv'):\n", - " val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')\n", - "else:\n", - " val_user_item_feats_df = None" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:16:02.739197Z", - "start_time": "2020-11-18T03:16:01.725028Z" - } - }, - 
"outputs": [], - "source": [ - "# 拼上用户特征\n", - "# 下面是线下验证的\n", - "trn_user_item_feats_df = trn_user_item_feats_df.merge(user_info, on='user_id', how='left')\n", - "\n", - "if val_user_item_feats_df is not None:\n", - " val_user_item_feats_df = val_user_item_feats_df.merge(user_info, on='user_id', how='left')\n", - "else:\n", - " val_user_item_feats_df = None\n", - " \n", - "tst_user_item_feats_df = tst_user_item_feats_df.merge(user_info, on='user_id',how='left')" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:16:06.989877Z", - "start_time": "2020-11-18T03:16:06.983327Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['user_id', 'click_article_id', 'sim0', 'time_diff0', 'word_diff0',\n", - " 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score', 'rank', 'label',\n", - " 'click_size', 'time_diff_mean', 'active_level', 'click_environment',\n", - " 'click_deviceGroup', 'click_os', 'click_country', 'click_region',\n", - " 'click_referrer_type', 'user_time_hob1', 'user_time_hob2', 'cate_list',\n", - " 'words_hbo'],\n", - " dtype='object')" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_user_item_feats_df.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-14T03:13:36.071236Z", - "start_time": "2020-11-14T03:13:36.050188Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 文章的特征直接读入" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:16:12.793070Z", - "start_time": "2020-11-18T03:16:12.425380Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-- Mem. 
usage decreased to 5.56 Mb (50.0% reduction),time spend:0.00 min\n" - ] - } - ], - "source": [ - "articles = pd.read_csv(data_path+'articles.csv')\n", - "articles = reduce_mem(articles)" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:16:18.118507Z", - "start_time": "2020-11-18T03:16:16.344338Z" - } - }, - "outputs": [], - "source": [ - "# 拼上文章特征\n", - "trn_user_item_feats_df = trn_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')\n", - "\n", - "if val_user_item_feats_df is not None:\n", - " val_user_item_feats_df = val_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')\n", - "else:\n", - " val_user_item_feats_df = None\n", - "\n", - "tst_user_item_feats_df = tst_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 召回文章的主题是否在用户的爱好里面" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:17:40.251797Z", - "start_time": "2020-11-18T03:16:28.130012Z" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 制作特征和标签, 转成监督学习问题\n", + "我们先捋一下基于原始的给定数据, 有哪些特征可以直接利用:\n", + "1. 文章的自身特征, category_id表示这文章的类型, created_at_ts表示文章建立的时间, 这个关系着文章的时效性, words_count是文章的字数, 一般字数太长我们不太喜欢点击, 也不排除有人就喜欢读长文。\n", + "2. 文章的内容embedding特征, 这个召回的时候用过, 这里可以选择使用, 也可以选择不用, 也可以尝试其他类型的embedding特征, 比如W2V等\n", + "3. 用户的设备特征信息\n", + "\n", + "上面这些直接可以用的特征, 待做完特征工程之后, 直接就可以根据article_id或者是user_id把这些特征加入进去。 但是我们需要先基于召回的结果, 构造一些特征,然后制作标签,形成一个监督学习的数据集。

\n", + "构造监督数据集的思路, 根据召回结果, 我们会得到一个{user_id: [可能点击的文章列表]}形式的字典。 那么我们就可以对于每个用户, 每篇可能点击的文章构造一条监督样本, 比如对于用户user1, 假设得到的他的召回列表{user1: [item1, item2, item3]}, 我们就可以得到三行数据(user1, item1), (user1, item2), (user1, item3)的形式, 这就是监督数据集的前两列特征。

\n", + "\n", + "构造特征的思路是这样, 我们知道每个用户的点击文章是与其历史点击的文章信息是有很大关联的, 比如同一个主题, 相似等等。 所以特征构造这块很重要的一系列特征**是要结合用户的历史点击文章信息**。我们已经得到了每个用户及点击候选文章的两列的一个数据集, 而我们的目的是要预测最后一次点击的文章, 比较自然的一个思路就是和其最后几次点击的文章产生关系, 这样既考虑了其历史点击文章信息, 又得离最后一次点击较近,因为新闻很大的一个特点就是注重时效性。 往往用户的最后一次点击会和其最后几次点击有很大的关联。 所以我们就可以对于每个候选文章, 做出与最后几次点击相关的特征如下:\n", + "1. 候选item与最后几次点击的相似性特征(embedding内积) --- 这个直接关联用户历史行为\n", + "2. 候选item与最后几次点击的相似性特征的统计特征 --- 统计特征可以减少一些波动和异常\n", + "3. 候选item与最后几次点击文章的字数差的特征 --- 可以通过字数看用户偏好\n", + "4. 候选item与最后几次点击的文章建立的时间差特征 --- 时间差特征可以看出该用户对于文章的实时性的偏好 \n", + "\n", + "\n", + "还需要考虑一下\n", + "**5. 如果使用了youtube召回的话, 我们还可以制作用户与候选item的相似特征**\n", + "\n", + "\n", + "\n", + "当然, 上面只是提供了一种基于用户历史行为做特征工程的思路, 大家也可以思维风暴一下,尝试一些其他的特征。 下面我们就实现上面的这些特征的制作, 下面的逻辑是这样:\n", + "1. 我们首先获得用户的最后一次点击操作和用户的历史点击, 这个基于我们的日志数据集做\n", + "2. 基于用户的历史行为制作特征, 这个会用到用户的历史点击表, 最后的召回列表, 文章的信息表和embedding向量\n", + "3. 制作标签, 形成最后的监督学习数据集" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 导包" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:00.341709Z", + "start_time": "2020-11-17T09:06:58.723900Z" + }, + "cell_style": "center", + "scrolled": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pickle\n", + "from tqdm import tqdm\n", + "import gc, os\n", + "import logging\n", + "import time\n", + "import lightgbm as lgb\n", + "from gensim.models import Word2Vec\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# df节省内存函数" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:02.411005Z", + "start_time": "2020-11-17T09:07:02.397830Z" + } + }, + "outputs": [], + "source": [ + "# 节省内存的一个函数\n", + "# 减少内存\n", + "def reduce_mem(df):\n", + " starttime = time.time()\n", + " 
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", + " start_mem = df.memory_usage().sum() / 1024**2\n", + " for col in df.columns:\n", + " col_type = df[col].dtypes\n", + " if col_type in numerics:\n", + " c_min = df[col].min()\n", + " c_max = df[col].max()\n", + " if pd.isnull(c_min) or pd.isnull(c_max):\n", + " continue\n", + " if str(col_type)[:3] == 'int':\n", + " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", + " df[col] = df[col].astype(np.int8)\n", + " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", + " df[col] = df[col].astype(np.int16)\n", + " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", + " df[col] = df[col].astype(np.int32)\n", + " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", + " df[col] = df[col].astype(np.int64)\n", + " else:\n", + " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", + " df[col] = df[col].astype(np.float16)\n", + " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", + " df[col] = df[col].astype(np.float32)\n", + " else:\n", + " df[col] = df[col].astype(np.float64)\n", + " end_mem = df.memory_usage().sum() / 1024**2\n", + " print('-- Mem. 
usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,\n", + " 100*(start_mem-end_mem)/start_mem,\n", + " (time.time()-starttime)/60))\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:05.031436Z", + "start_time": "2020-11-17T09:07:05.026822Z" + } + }, + "outputs": [], + "source": [ + "data_path = './data_raw/'\n", + "save_path = './temp_results/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 数据读取\n", + "\n", + "## 训练和验证集的划分\n", + "\n", + "划分训练和验证集的原因是为了在线下验证模型参数的好坏,为了完全模拟测试集,我们这里就在训练集中抽取部分用户的所有信息来作为验证集。提前做训练验证集划分的好处就是可以分解制作排序特征时的压力,一次性做整个数据集的排序特征可能时间会比较长。" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:07.230308Z", + "start_time": "2020-11-17T09:07:07.221081Z" + } + }, + "outputs": [], + "source": [ + "# all_click_df指的是训练集\n", + "# sample_user_nums 采样作为验证集的用户数量\n", + "def trn_val_split(all_click_df, sample_user_nums):\n", + " all_click = all_click_df\n", + " all_user_ids = all_click.user_id.unique()\n", + " \n", + " # replace=True表示可以重复抽样,反之不可以\n", + " sample_user_ids = np.random.choice(all_user_ids, size=sample_user_nums, replace=False) \n", + " \n", + " click_val = all_click[all_click['user_id'].isin(sample_user_ids)]\n", + " click_trn = all_click[~all_click['user_id'].isin(sample_user_ids)]\n", + " \n", + " # 将验证集中的最后一次点击给抽取出来作为答案\n", + " click_val = click_val.sort_values(['user_id', 'click_timestamp'])\n", + " val_ans = click_val.groupby('user_id').tail(1)\n", + " \n", + " click_val = click_val.groupby('user_id').apply(lambda x: x[:-1]).reset_index(drop=True)\n", + " \n", + " # 去除val_ans中某些用户只有一个点击数据的情况,如果该用户只有一个点击数据,又被分到ans中,\n", + " # 那么训练集中就没有这个用户的点击数据,出现用户冷启动问题,给自己模型验证带来麻烦\n", + " val_ans = val_ans[val_ans.user_id.isin(click_val.user_id.unique())] # 保证答案中出现的用户再验证集中还有\n", + " click_val = 
click_val[click_val.user_id.isin(val_ans.user_id.unique())]\n", + " \n", + " return click_trn, click_val, val_ans" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 获取历史点击和最后一次点击" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:19.202550Z", + "start_time": "2020-11-17T09:07:19.195766Z" + } + }, + "outputs": [], + "source": [ + "# 获取当前数据的历史点击和最后一次点击\n", + "def get_hist_and_last_click(all_click):\n", + " all_click = all_click.sort_values(by=['user_id', 'click_timestamp'])\n", + " click_last_df = all_click.groupby('user_id').tail(1)\n", + "\n", + " # 如果用户只有一个点击,hist为空了,会导致训练的时候这个用户不可见,此时默认泄露一下\n", + " def hist_func(user_df):\n", + " if len(user_df) == 1:\n", + " return user_df\n", + " else:\n", + " return user_df[:-1]\n", + "\n", + " click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)\n", + "\n", + " return click_hist_df, click_last_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取训练、验证及测试集" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:21.181211Z", + "start_time": "2020-11-17T09:07:21.171338Z" + } + }, + "outputs": [], + "source": [ + "def get_trn_val_tst_data(data_path, offline=True):\n", + " if offline:\n", + " click_trn_data = pd.read_csv(data_path+'train_click_log.csv') # 训练集用户点击日志\n", + " click_trn_data = reduce_mem(click_trn_data)\n", + " click_trn, click_val, val_ans = trn_val_split(click_trn_data, sample_user_nums)\n", + " else:\n", + " click_trn = pd.read_csv(data_path+'train_click_log.csv')\n", + " click_trn = reduce_mem(click_trn)\n", + " click_val = None\n", + " val_ans = None\n", + " \n", + " click_tst = pd.read_csv(data_path+'testA_click_log.csv')\n", + " \n", + " return click_trn, click_val, click_tst, val_ans" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取召回列表" + ] + 
}, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:23.210604Z", + "start_time": "2020-11-17T09:07:23.203652Z" + } + }, + "outputs": [], + "source": [ + "# 返回多路召回列表或者单路召回\n", + "def get_recall_list(save_path, single_recall_model=None, multi_recall=False):\n", + " if multi_recall:\n", + " return pickle.load(open(save_path + 'final_recall_items_dict.pkl', 'rb'))\n", + " \n", + " if single_recall_model == 'i2i_itemcf':\n", + " return pickle.load(open(save_path + 'itemcf_recall_dict.pkl', 'rb'))\n", + " elif single_recall_model == 'i2i_emb_itemcf':\n", + " return pickle.load(open(save_path + 'itemcf_emb_dict.pkl', 'rb'))\n", + " elif single_recall_model == 'user_cf':\n", + " return pickle.load(open(save_path + 'youtubednn_usercf_dict.pkl', 'rb'))\n", + " elif single_recall_model == 'youtubednn':\n", + " return pickle.load(open(save_path + 'youtube_u2i_dict.pkl', 'rb'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取各种Embedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Word2Vec训练及gensim的使用\n", + "\n", + "Word2Vec主要思想是:一个词的上下文可以很好的表达出词的语义。通过无监督学习产生词向量的方式。word2vec中有两个非常经典的模型:skip-gram和cbow。\n", + "\n", + "- skip-gram:已知中心词预测周围词。\n", + "- cbow:已知周围词预测中心词。\n", + "![image-20201106225233086](https://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20201106225233086.png)\n", + "\n", + "在使用gensim训练word2vec的时候,有几个比较重要的参数\n", + "- size: 表示词向量的维度。\n", + "- window:决定了目标词会与多远距离的上下文产生关系。\n", + "- sg: 如果是0,则是CBOW模型,是1则是Skip-Gram模型。\n", + "- workers: 表示训练时候的线程数量\n", + "- min_count: 设置最小词频,出现次数低于该值的词会被忽略\n", + "- iter: 训练时遍历整个数据集的次数\n", + "\n", + "**注意**\n", + "1. 训练的时候输入的语料库一定要是字符组成的二维数组,如:[['北', '京', '你', '好'], ['上', '海', '你', '好']]\n", + "2. 
使用模型的时候有一些默认值,可以在Jupyter里面通过`Word2Vec??`查看\n", + "\n", + "\n", + "下面是个简单的测试样例:\n", + "```\n", + "from gensim.models import Word2Vec\n", + "docs = [['30760', '157507'],\n", + " ['289197', '63746'],\n", + " ['36162', '168401'],\n", + " ['50644', '36162']]\n", + "w2v = Word2Vec(docs, size=12, sg=1, window=2, seed=2020, workers=2, min_count=1, iter=1)\n", + "\n", + "# 查看'30760'表示的词向量\n", + "w2v['30760']\n", + "```\n", + "\n", + "skip-gram和cbow的详细原理可以参考下面的博客:\n", + "- [word2vec原理(一) CBOW与Skip-Gram模型基础](https://www.cnblogs.com/pinard/p/7160330.html) \n", + "- [word2vec原理(二) 基于Hierarchical Softmax的模型](https://www.cnblogs.com/pinard/p/7243513.html) \n", + "- [word2vec原理(三) 基于Negative Sampling的模型](https://www.cnblogs.com/pinard/p/7249903.html) " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:26.676173Z", + "start_time": "2020-11-17T09:07:26.667926Z" + } + }, + "outputs": [], + "source": [ + "def trian_item_word2vec(click_df, embed_size=64, save_name='item_w2v_emb.pkl', split_char=' '):\n", + " click_df = click_df.sort_values('click_timestamp')\n", + " # 只有转换成字符串才可以进行训练\n", + " click_df['click_article_id'] = click_df['click_article_id'].astype(str)\n", + " # 转换成句子的形式\n", + " docs = click_df.groupby(['user_id'])['click_article_id'].apply(lambda x: list(x)).reset_index()\n", + " docs = docs['click_article_id'].values.tolist()\n", + "\n", + " # 为了方便查看训练的进度,这里设定一个log信息\n", + " logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)\n", + "\n", + " # 这里的参数对训练得到的向量影响也很大,默认负采样为5\n", + " w2v = Word2Vec(docs, size=16, sg=1, window=5, seed=2020, workers=24, min_count=1, iter=1)\n", + " \n", + " # 保存成字典的形式\n", + " item_w2v_emb_dict = {k: w2v[k] for k in click_df['click_article_id']}\n", + " pickle.dump(item_w2v_emb_dict, open(save_path + 'item_w2v_emb.pkl', 'wb'))\n", + " \n", + " return item_w2v_emb_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + 
"metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:27.285690Z", + "start_time": "2020-11-17T09:07:27.276646Z" + } + }, + "outputs": [], + "source": [ + "# 可以通过字典查询对应的item的Embedding\n", + "def get_embedding(save_path, all_click_df):\n", + " if os.path.exists(save_path + 'item_content_emb.pkl'):\n", + " item_content_emb_dict = pickle.load(open(save_path + 'item_content_emb.pkl', 'rb'))\n", + " else:\n", + " print('item_content_emb.pkl 文件不存在...')\n", + " \n", + " # w2v Embedding是需要提前训练好的\n", + " if os.path.exists(save_path + 'item_w2v_emb.pkl'):\n", + " item_w2v_emb_dict = pickle.load(open(save_path + 'item_w2v_emb.pkl', 'rb'))\n", + " else:\n", + " item_w2v_emb_dict = trian_item_word2vec(all_click_df)\n", + " \n", + " if os.path.exists(save_path + 'item_youtube_emb.pkl'):\n", + " item_youtube_emb_dict = pickle.load(open(save_path + 'item_youtube_emb.pkl', 'rb'))\n", + " else:\n", + " print('item_youtube_emb.pkl 文件不存在...')\n", + " \n", + " if os.path.exists(save_path + 'user_youtube_emb.pkl'):\n", + " user_youtube_emb_dict = pickle.load(open(save_path + 'user_youtube_emb.pkl', 'rb'))\n", + " else:\n", + " print('user_youtube_emb.pkl 文件不存在...')\n", + " \n", + " return item_content_emb_dict, item_w2v_emb_dict, item_youtube_emb_dict, user_youtube_emb_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取文章信息" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:28.391797Z", + "start_time": "2020-11-17T09:07:28.386650Z" + } + }, + "outputs": [], + "source": [ + "def get_article_info_df():\n", + " article_info_df = pd.read_csv(data_path + 'articles.csv')\n", + " article_info_df = reduce_mem(article_info_df)\n", + " \n", + " return article_info_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取数据" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": 
"2020-11-17T09:07:32.362045Z", + "start_time": "2020-11-17T09:07:29.490413Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- Mem. usage decreased to 23.34 Mb (69.4% reduction),time spend:0.00 min\n" + ] + } + ], + "source": [ + "# 这里offline的online的区别就是验证集是否为空\n", + "click_trn, click_val, click_tst, val_ans = get_trn_val_tst_data(data_path, offline=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:11:10.378966Z", + "start_time": "2020-11-17T09:07:32.468580Z" + } + }, + "outputs": [], + "source": [ + "click_trn_hist, click_trn_last = get_hist_and_last_click(click_trn)\n", + "\n", + "if click_val is not None:\n", + " click_val_hist, click_val_last = click_val, val_ans\n", + "else:\n", + " click_val_hist, click_val_last = None, None\n", + " \n", + "click_tst_hist = click_tst" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 对训练数据做负采样\n", + "\n", + "通过召回我们将数据转换成三元组的形式(user1, item1, label)的形式,观察发现正负样本差距极度不平衡,我们可以先对负样本进行下采样,下采样的目的一方面缓解了正负样本比例的问题,另一方面也减小了我们做排序特征的压力,我们在做负采样的时候又有哪些东西是需要注意的呢?\n", + "\n", + "1. 只对负样本进行下采样(如果有比较好的正样本扩充的方法其实也是可以考虑的)\n", + "2. 负采样之后,保证所有的用户和文章仍然出现在采样之后的数据中\n", + "3. 下采样的比例可以根据实际情况人为的控制\n", + "4. 
做完负采样之后,更新此时新的用户召回文章列表,因为后续做特征的时候可能用到相对位置的信息。\n", + "\n", + "其实负采样也可以留在后面做完特征再进行,这里由于做排序特征太慢了,所以把负采样的环节提到前面了。" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:11:36.096678Z", + "start_time": "2020-11-17T09:11:36.090911Z" + } + }, + "outputs": [], + "source": [ + "# 将召回列表转换成df的形式\n", + "def recall_dict_2_df(recall_list_dict):\n", + " df_row_list = [] # [user, item, score]\n", + " for user, recall_list in tqdm(recall_list_dict.items()):\n", + " for item, score in recall_list:\n", + " df_row_list.append([user, item, score])\n", + " \n", + " col_names = ['user_id', 'sim_item', 'score']\n", + " recall_list_df = pd.DataFrame(df_row_list, columns=col_names)\n", + " \n", + " return recall_list_df" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:11:37.668844Z", + "start_time": "2020-11-17T09:11:37.659774Z" + } + }, + "outputs": [], + "source": [ + "# 负采样函数,这里可以控制负采样时的比例, 这里给了一个默认的值\n", + "def neg_sample_recall_data(recall_items_df, sample_rate=0.001):\n", + " pos_data = recall_items_df[recall_items_df['label'] == 1]\n", + " neg_data = recall_items_df[recall_items_df['label'] == 0]\n", + " \n", + " print('pos_data_num:', len(pos_data), 'neg_data_num:', len(neg_data), 'pos/neg:', len(pos_data)/len(neg_data))\n", + " \n", + " # 分组采样函数\n", + " def neg_sample_func(group_df):\n", + " neg_num = len(group_df)\n", + " sample_num = max(int(neg_num * sample_rate), 1) # 保证最少有一个\n", + " sample_num = min(sample_num, 5) # 保证最多不超过5个,这里可以根据实际情况进行选择\n", + " return group_df.sample(n=sample_num, replace=True)\n", + " \n", + " # 对用户进行负采样,保证所有用户都在采样后的数据中\n", + " neg_data_user_sample = neg_data.groupby('user_id', group_keys=False).apply(neg_sample_func)\n", + " # 对文章进行负采样,保证所有文章都在采样后的数据中\n", + " neg_data_item_sample = neg_data.groupby('sim_item', group_keys=False).apply(neg_sample_func)\n", + " \n", + " # 将上述两种情况下的采样数据合并\n", + " neg_data_new = 
neg_data_user_sample.append(neg_data_item_sample)\n", + " # 由于上述两个操作是分开的,可能将两个相同的数据给重复选择了,所以需要对合并后的数据进行去重\n", + " neg_data_new = neg_data_new.sort_values(['user_id', 'score']).drop_duplicates(['user_id', 'sim_item'], keep='last')\n", + " \n", + " # 将正样本数据合并\n", + " data_new = pd.concat([pos_data, neg_data_new], ignore_index=True)\n", + " \n", + " return data_new" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:11:39.481715Z", + "start_time": "2020-11-17T09:11:39.475144Z" + } + }, + "outputs": [], + "source": [ + "# 召回数据打标签\n", + "def get_rank_label_df(recall_list_df, label_df, is_test=False):\n", + " # 测试集是没有标签了,为了后面代码同一一些,这里直接给一个负数替代\n", + " if is_test:\n", + " recall_list_df['label'] = -1\n", + " return recall_list_df\n", + " \n", + " label_df = label_df.rename(columns={'click_article_id': 'sim_item'})\n", + " recall_list_df_ = recall_list_df.merge(label_df[['user_id', 'sim_item', 'click_timestamp']], \\\n", + " how='left', on=['user_id', 'sim_item'])\n", + " recall_list_df_['label'] = recall_list_df_['click_timestamp'].apply(lambda x: 0.0 if np.isnan(x) else 1.0)\n", + " del recall_list_df_['click_timestamp']\n", + " \n", + " return recall_list_df_" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:11:41.555566Z", + "start_time": "2020-11-17T09:11:41.546766Z" + } + }, + "outputs": [], + "source": [ + "def get_user_recall_item_label_df(click_trn_hist, click_val_hist, click_tst_hist,click_trn_last, click_val_last, recall_list_df):\n", + " # 获取训练数据的召回列表\n", + " trn_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_trn_hist['user_id'].unique())]\n", + " # 训练数据打标签\n", + " trn_user_item_label_df = get_rank_label_df(trn_user_items_df, click_trn_last, is_test=False)\n", + " # 训练数据负采样\n", + " trn_user_item_label_df = neg_sample_recall_data(trn_user_item_label_df)\n", + " \n", + " if click_val is not 
None:\n", + " val_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_val_hist['user_id'].unique())]\n", + " val_user_item_label_df = get_rank_label_df(val_user_items_df, click_val_last, is_test=False)\n", + " val_user_item_label_df = neg_sample_recall_data(val_user_item_label_df)\n", + " else:\n", + " val_user_item_label_df = None\n", + " \n", + " # 测试数据不需要进行负采样,直接对所有的召回商品进行打-1标签\n", + " tst_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_tst_hist['user_id'].unique())]\n", + " tst_user_item_label_df = get_rank_label_df(tst_user_items_df, None, is_test=True)\n", + " \n", + " return trn_user_item_label_df, val_user_item_label_df, tst_user_item_label_df" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T17:23:35.357045Z", + "start_time": "2020-11-17T17:23:12.378284Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [00:12<00:00, 20689.39it/s]\n" + ] + } + ], + "source": [ + "# 读取召回列表\n", + "recall_list_dict = get_recall_list(save_path, single_recall_model='i2i_itemcf') # 这里只选择了单路召回的结果,也可以选择多路召回结果\n", + "# 将召回数据转换成df\n", + "recall_list_df = recall_dict_2_df(recall_list_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T17:29:04.598214Z", + "start_time": "2020-11-17T17:23:40.001052Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pos_data_num: 64190 neg_data_num: 1935810 pos/neg: 0.03315924600038227\n" + ] + } + ], + "source": [ + "# 给训练验证数据打标签,并负采样(这一部分时间比较久)\n", + "trn_user_item_label_df, val_user_item_label_df, tst_user_item_label_df = get_user_recall_item_label_df(click_trn_hist, \n", + " click_val_hist, \n", + " click_tst_hist,\n", + " click_trn_last, \n", + " click_val_last, \n", + " recall_list_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T17:23:11.642944Z", + "start_time": "2020-11-17T17:23:08.475Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "trn_user_item_label_df.label" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 将召回数据转换成字典" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T17:36:22.800449Z", + "start_time": "2020-11-17T17:36:22.794670Z" + } + }, + "outputs": [], + "source": [ + "# 将最终的召回的df数据转换成字典的形式做排序特征\n", + "def make_tuple_func(group_df):\n", + " row_data = []\n", + " for name, row_df in group_df.iterrows():\n", + " row_data.append((row_df['sim_item'], row_df['score'], row_df['label']))\n", + " \n", + " return row_data" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T17:40:05.991819Z", + "start_time": "2020-11-17T17:36:26.536429Z" + } + }, + "outputs": [], + "source": [ + "trn_user_item_label_tuples = trn_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", + "trn_user_item_label_tuples_dict = dict(zip(trn_user_item_label_tuples['user_id'], trn_user_item_label_tuples[0]))\n", + "\n", + "if val_user_item_label_df is not None:\n", + " val_user_item_label_tuples = val_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", + " val_user_item_label_tuples_dict = dict(zip(val_user_item_label_tuples['user_id'], val_user_item_label_tuples[0]))\n", + "else:\n", + " val_user_item_label_tuples_dict = None\n", + " \n", + "tst_user_item_label_tuples = tst_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", + "tst_user_item_label_tuples_dict = dict(zip(tst_user_item_label_tuples['user_id'], tst_user_item_label_tuples[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T07:59:53.141560Z", + "start_time": 
"2020-11-17T07:59:53.133599Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 特征工程" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 制作与用户历史行为相关特征\n", + "对于每个用户召回的每个商品, 做特征。 具体步骤如下:\n", + "* 对于每个用户, 获取最后点击的N个商品的item_id, \n", + " * 对于该用户的每个召回商品, 计算与上面最后N次点击商品的相似度的和(最大, 最小,均值), 时间差特征,相似性特征,字数差特征,与该用户的相似性特征" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T01:07:47.268035Z", + "start_time": "2020-11-18T01:07:47.250449Z" + } + }, + "outputs": [], + "source": [ + "# 下面基于data做历史相关的特征\n", + "def create_feature(users_id, recall_list, click_hist_df, articles_info, articles_emb, user_emb=None, N=1):\n", + " \"\"\"\n", + " 基于用户的历史行为做相关特征\n", + " :param users_id: 用户id\n", + " :param recall_list: 对于每个用户召回的候选文章列表\n", + " :param click_hist_df: 用户的历史点击信息\n", + " :param articles_info: 文章信息\n", + " :param articles_emb: 文章的embedding向量, 这个可以用item_content_emb, item_w2v_emb, item_youtube_emb\n", + " :param user_emb: 用户的embedding向量, 这个是user_youtube_emb, 如果没有也可以不用, 但要注意如果要用的话, articles_emb就要用item_youtube_emb的形式, 这样维度才一样\n", + " :param N: 最近的N次点击 由于testA日志里面很多用户只存在一次历史点击, 所以为了不产生空值,默认是1\n", + " \"\"\"\n", + " \n", + " # 建立一个二维列表保存结果, 后面要转成DataFrame\n", + " all_user_feas = []\n", + " i = 0\n", + " for user_id in tqdm(users_id):\n", + " # 该用户的最后N次点击\n", + " hist_user_items = click_hist_df[click_hist_df['user_id']==user_id]['click_article_id'][-N:]\n", + " \n", + " # 遍历该用户的召回列表\n", + " for rank, (article_id, score, label) in enumerate(recall_list[user_id]):\n", + " # 该文章建立时间, 字数\n", + " a_create_time = articles_info[articles_info['article_id']==article_id]['created_at_ts'].values[0]\n", + " a_words_count = articles_info[articles_info['article_id']==article_id]['words_count'].values[0]\n", + " single_user_fea = [user_id, article_id]\n", + " # 计算与最后点击的商品的相似度的和, 最大值和最小值, 均值\n", + " sim_fea = []\n", + " time_fea = []\n", + " 
word_fea = []\n", + " # 遍历用户的最后N次点击文章\n", + " for hist_item in hist_user_items:\n", + " b_create_time = articles_info[articles_info['article_id']==hist_item]['created_at_ts'].values[0]\n", + " b_words_count = articles_info[articles_info['article_id']==hist_item]['words_count'].values[0]\n", + " \n", + " sim_fea.append(np.dot(articles_emb[hist_item], articles_emb[article_id]))\n", + " time_fea.append(abs(a_create_time-b_create_time))\n", + " word_fea.append(abs(a_words_count-b_words_count))\n", + " \n", + " single_user_fea.extend(sim_fea) # 相似性特征\n", + " single_user_fea.extend(time_fea) # 时间差特征\n", + " single_user_fea.extend(word_fea) # 字数差特征\n", + " single_user_fea.extend([max(sim_fea), min(sim_fea), sum(sim_fea), sum(sim_fea) / len(sim_fea)]) # 相似性的统计特征\n", + " \n", + " if user_emb: # 如果用户向量有的话, 这里计算该召回文章与用户的相似性特征 \n", + " single_user_fea.append(np.dot(user_emb[user_id], articles_emb[article_id]))\n", + " \n", + " single_user_fea.extend([score, rank, label]) \n", + " # 加入到总的表中\n", + " all_user_feas.append(single_user_fea)\n", + " \n", + " # 定义列名\n", + " id_cols = ['user_id', 'click_article_id']\n", + " sim_cols = ['sim' + str(i) for i in range(N)]\n", + " time_cols = ['time_diff' + str(i) for i in range(N)]\n", + " word_cols = ['word_diff' + str(i) for i in range(N)]\n", + " sat_cols = ['sim_max', 'sim_min', 'sim_sum', 'sim_mean']\n", + " user_item_sim_cols = ['user_item_sim'] if user_emb else []\n", + " user_score_rank_label = ['score', 'rank', 'label']\n", + " cols = id_cols + sim_cols + time_cols + word_cols + sat_cols + user_item_sim_cols + user_score_rank_label\n", + " \n", + " # 转成DataFrame\n", + " df = pd.DataFrame( all_user_feas, columns=cols)\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T01:08:17.531694Z", + "start_time": "2020-11-18T01:08:10.754702Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- Mem. 
usage decreased to 5.56 Mb (50.0% reduction),time spend:0.00 min\n" + ] + } + ], + "source": [ + "article_info_df = get_article_info_df()\n", + "all_click = click_trn.append(click_tst)\n", + "item_content_emb_dict, item_w2v_emb_dict, item_youtube_emb_dict, user_youtube_emb_dict = get_embedding(save_path, all_click)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:06:22.709350Z", + "start_time": "2020-11-18T01:08:39.923811Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 200000/200000 [50:16<00:00, 66.31it/s] \n", + "100%|██████████| 50000/50000 [1:07:21<00:00, 12.37it/s]\n" + ] + } + ], + "source": [ + "# 获取训练验证及测试数据中召回列文章相关特征\n", + "trn_user_item_feats_df = create_feature(trn_user_item_label_tuples_dict.keys(), trn_user_item_label_tuples_dict, \\\n", + " click_trn_hist, article_info_df, item_content_emb_dict)\n", + "\n", + "if val_user_item_label_tuples_dict is not None:\n", + " val_user_item_feats_df = create_feature(val_user_item_label_tuples_dict.keys(), val_user_item_label_tuples_dict, \\\n", + " click_val_hist, article_info_df, item_content_emb_dict)\n", + "else:\n", + " val_user_item_feats_df = None\n", + " \n", + "tst_user_item_feats_df = create_feature(tst_user_item_label_tuples_dict.keys(), tst_user_item_label_tuples_dict, \\\n", + " click_tst_hist, article_info_df, item_content_emb_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:13:58.573422Z", + "start_time": "2020-11-18T03:13:40.157228Z" + } + }, + "outputs": [], + "source": [ + "# 保存一份省的每次都要重新跑,每次跑的时间都比较长\n", + "trn_user_item_feats_df.to_csv(save_path + 'trn_user_item_feats_df.csv', index=False)\n", + "\n", + "if val_user_item_feats_df is not None:\n", + " val_user_item_feats_df.to_csv(save_path + 'val_user_item_feats_df.csv', index=False)\n", + "\n", + 
"tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=False) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:14:22.838154Z", + "start_time": "2020-11-18T03:14:22.828212Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 用户和文章特征\n", + "### 用户相关特征\n", + "这一块,正式进行特征工程,既要拼接上已有的特征, 也会做更多的特征出来,我们来梳理一下已有的特征和可构造特征:\n", + "1. 文章自身的特征, 文章字数,文章创建时间, 文章的embedding (articles表中)\n", + "2. 用户点击环境特征, 那些设备的特征(这个在df中)\n", + "3. 对于用户和商品还可以构造的特征:\n", + " * 基于用户的点击文章次数和点击时间构造可以表现用户活跃度的特征\n", + " * 基于文章被点击次数和时间构造可以反映文章热度的特征\n", + " * 用户的时间统计特征: 根据其点击的历史文章列表的点击时间和文章的创建时间做统计特征,比如求均值, 这个可以反映用户对于文章时效的偏好\n", + " * 用户的主题爱好特征, 对于用户点击的历史文章主题进行一个统计, 然后对于当前文章看看是否属于用户已经点击过的主题\n", + " * 用户的字数爱好特征, 对于用户点击的历史文章的字数统计, 求一个均值" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-14T03:16:37.637495Z", + "start_time": "2020-11-14T03:16:37.618229Z" + } + }, + "outputs": [], + "source": [ + "click_tst.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:09:11.675550Z", + "start_time": "2020-11-17T02:09:10.265134Z" + } + }, + "outputs": [], + "source": [ + "# 读取文章特征\n", + "articles = pd.read_csv(data_path+'articles.csv')\n", + "articles = reduce_mem(articles)\n", + "\n", + "# 日志数据,就是前面的所有数据\n", + "if click_val is not None:\n", + " all_data = click_trn.append(click_val)\n", + "all_data = click_trn.append(click_tst)\n", + "all_data = reduce_mem(all_data)\n", + "\n", + "# 拼上文章信息\n", + "all_data = all_data.merge(articles, left_on='click_article_id', right_on='article_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-14T03:17:12.256244Z", + "start_time": "2020-11-14T03:17:12.250452Z" + } + }, + "outputs": [], + "source": [ 
+ "all_data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 分析一下点击时间和点击文章的次数,区分用户活跃度\n", + "如果某个用户点击文章之间的时间间隔比较小, 同时点击的文章次数很多的话, 那么我们认为这种用户一般就是活跃用户, 当然衡量用户活跃度的方式可能多种多样, 这里我们只提供其中一种,我们写一个函数, 得到可以衡量用户活跃度的特征,逻辑如下:\n", + "1. 首先根据用户user_id分组, 对于每个用户,计算点击文章的次数, 两两点击文章时间间隔的均值\n", + "2. 把点击次数取倒数和时间间隔的均值统一归一化,然后两者相加合并,该值越小, 说明用户越活跃\n", + "3. 注意, 上面两两点击文章的时间间隔均值, 会出现如果用户只点击了一次的情况,这时候时间间隔均值那里会出现空值, 对于这种情况最后特征那里给个大数进行区分\n", + "\n", + "这个的衡量标准就是先把点击的次数取到数然后归一化, 然后点击的时间差归一化, 然后两者相加进行合并, 该值越小, 说明被点击的次数越多, 且间隔时间短。 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:28:55.336058Z", + "start_time": "2020-11-17T02:28:55.324332Z" + } + }, + "outputs": [], + "source": [ + " def active_level(all_data, cols):\n", + " \"\"\"\n", + " 制作区分用户活跃度的特征\n", + " :param all_data: 数据集\n", + " :param cols: 用到的特征列\n", + " \"\"\"\n", + " data = all_data[cols]\n", + " data.sort_values(['user_id', 'click_timestamp'], inplace=True)\n", + " user_act = pd.DataFrame(data.groupby('user_id', as_index=False)[['click_article_id', 'click_timestamp']].\\\n", + " agg({'click_article_id':np.size, 'click_timestamp': {list}}).values, columns=['user_id', 'click_size', 'click_timestamp'])\n", + " \n", + " # 计算时间间隔的均值\n", + " def time_diff_mean(l):\n", + " if len(l) == 1:\n", + " return 1\n", + " else:\n", + " return np.mean([j-i for i, j in list(zip(l[:-1], l[1:]))])\n", + " \n", + " user_act['time_diff_mean'] = user_act['click_timestamp'].apply(lambda x: time_diff_mean(x))\n", + " \n", + " # 点击次数取倒数\n", + " user_act['click_size'] = 1 / user_act['click_size']\n", + " \n", + " # 两者归一化\n", + " user_act['click_size'] = (user_act['click_size'] - user_act['click_size'].min()) / (user_act['click_size'].max() - user_act['click_size'].min())\n", + " user_act['time_diff_mean'] = (user_act['time_diff_mean'] - user_act['time_diff_mean'].min()) / (user_act['time_diff_mean'].max() - user_act['time_diff_mean'].min()) 
\n", + " user_act['active_level'] = user_act['click_size'] + user_act['time_diff_mean']\n", + " \n", + " user_act['user_id'] = user_act['user_id'].astype('int')\n", + " del user_act['click_timestamp']\n", + " \n", + " return user_act" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:30:12.696060Z", + "start_time": "2020-11-17T02:29:01.523837Z" + } + }, + "outputs": [], + "source": [ + "user_act_fea = active_level(all_data, ['user_id', 'click_article_id', 'click_timestamp'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:28:53.996742Z", + "start_time": "2020-11-17T02:09:18.374Z" + } + }, + "outputs": [], + "source": [ + "user_act_fea.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 分析一下点击时间和被点击文章的次数, 衡量文章热度特征\n", + "和上面同样的思路, 如果一篇文章在很短的时间间隔之内被点击了很多次, 说明文章比较热门,实现的逻辑和上面的基本一致, 只不过这里是按照点击的文章进行分组:\n", + "1. 根据文章进行分组, 对于每篇文章的用户, 计算点击的时间间隔\n", + "2. 
将用户的数量取倒数, 然后用户的数量和时间间隔归一化, 然后相加得到热度特征, 该值越小, 说明被点击的次数越大且时间间隔越短, 文章比较热\n", + "\n", + "当然, 这只是给出一种判断文章热度的一种方法, 这里大家也可以头脑风暴一下" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:41:26.398567Z", + "start_time": "2020-11-17T02:41:26.386668Z" + } + }, + "outputs": [], + "source": [ + " def hot_level(all_data, cols):\n", + " \"\"\"\n", + " 制作衡量文章热度的特征\n", + " :param all_data: 数据集\n", + " :param cols: 用到的特征列\n", + " \"\"\"\n", + " data = all_data[cols]\n", + " data.sort_values(['click_article_id', 'click_timestamp'], inplace=True)\n", + " article_hot = pd.DataFrame(data.groupby('click_article_id', as_index=False)[['user_id', 'click_timestamp']].\\\n", + " agg({'user_id':np.size, 'click_timestamp': {list}}).values, columns=['click_article_id', 'user_num', 'click_timestamp'])\n", + " \n", + " # 计算被点击时间间隔的均值\n", + " def time_diff_mean(l):\n", + " if len(l) == 1:\n", + " return 1\n", + " else:\n", + " return np.mean([j-i for i, j in list(zip(l[:-1], l[1:]))])\n", + " \n", + " article_hot['time_diff_mean'] = article_hot['click_timestamp'].apply(lambda x: time_diff_mean(x))\n", + " \n", + " # 点击次数取倒数\n", + " article_hot['user_num'] = 1 / article_hot['user_num']\n", + " \n", + " # 两者归一化\n", + " article_hot['user_num'] = (article_hot['user_num'] - article_hot['user_num'].min()) / (article_hot['user_num'].max() - article_hot['user_num'].min())\n", + " article_hot['time_diff_mean'] = (article_hot['time_diff_mean'] - article_hot['time_diff_mean'].min()) / (article_hot['time_diff_mean'].max() - article_hot['time_diff_mean'].min()) \n", + " article_hot['hot_level'] = article_hot['user_num'] + article_hot['time_diff_mean']\n", + " \n", + " article_hot['click_article_id'] = article_hot['click_article_id'].astype('int')\n", + " \n", + " del article_hot['click_timestamp']\n", + " \n", + " return article_hot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + 
"end_time": "2020-11-17T02:41:44.635900Z", + "start_time": "2020-11-17T02:41:31.473032Z" + } + }, + "outputs": [], + "source": [ + "article_hot_fea = hot_level(all_data, ['user_id', 'click_article_id', 'click_timestamp']) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-14T03:19:54.775290Z", + "start_time": "2020-11-14T03:19:54.763699Z" + } + }, + "outputs": [], + "source": [ + "article_hot_fea.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的系列习惯\n", + "这个基于原来的日志表做一个类似于article的那种DataFrame, 存放用户特有的信息, 主要包括点击习惯, 爱好特征之类的\n", + "* 用户的设备习惯, 这里取最常用的设备(众数)\n", + "* 用户的时间习惯: 根据其点击过得历史文章的时间来做一个统计(这个感觉最好是把时间戳里的时间特征的h特征提出来,看看用户习惯一天的啥时候点击文章), 但这里先用转换的时间吧, 求个均值\n", + "* 用户的爱好特征, 对于用户点击的历史文章主题进行用户的爱好判别, 更偏向于哪几个主题, 这个最好是multi-hot进行编码, 先试试行不\n", + "* 用户文章的字数差特征, 用户的爱好文章的字数习惯\n", + "\n", + "这些就是对用户进行分组, 然后统计即可" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的设备习惯" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T04:22:48.877978Z", + "start_time": "2020-11-17T04:22:48.872049Z" + } + }, + "outputs": [], + "source": [ + "def device_fea(all_data, cols):\n", + " \"\"\"\n", + " 制作用户的设备特征\n", + " :param all_data: 数据集\n", + " :param cols: 用到的特征列\n", + " \"\"\"\n", + " user_device_info = all_data[cols]\n", + " \n", + " # 用众数来表示每个用户的设备信息\n", + " user_device_info = user_device_info.groupby('user_id').agg(lambda x: x.value_counts().index[0]).reset_index()\n", + " \n", + " return user_device_info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T05:27:10.897473Z", + "start_time": "2020-11-17T04:49:33.214865Z" + } + }, + "outputs": [], + "source": [ + "# 设备特征(这里时间会比较长)\n", + "device_cols = ['user_id', 'click_environment', 'click_deviceGroup', 'click_os', 'click_country', 'click_region', 
'click_referrer_type']\n", + "user_device_info = device_fea(all_data, device_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-14T04:20:39.765842Z", + "start_time": "2020-11-14T04:20:39.747087Z" + } + }, + "outputs": [], + "source": [ + "user_device_info.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的时间习惯" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:11:50.889905Z", + "start_time": "2020-11-17T06:11:50.882653Z" + } + }, + "outputs": [], + "source": [ + "def user_time_hob_fea(all_data, cols):\n", + " \"\"\"\n", + " 制作用户的时间习惯特征\n", + " :param all_data: 数据集\n", + " :param cols: 用到的特征列\n", + " \"\"\"\n", + " user_time_hob_info = all_data[cols]\n", + " \n", + " # 先把时间戳进行归一化\n", + " mm = MinMaxScaler()\n", + " user_time_hob_info['click_timestamp'] = mm.fit_transform(user_time_hob_info[['click_timestamp']])\n", + " user_time_hob_info['created_at_ts'] = mm.fit_transform(user_time_hob_info[['created_at_ts']])\n", + "\n", + " user_time_hob_info = user_time_hob_info.groupby('user_id').agg('mean').reset_index()\n", + " \n", + " user_time_hob_info.rename(columns={'click_timestamp': 'user_time_hob1', 'created_at_ts': 'user_time_hob2'}, inplace=True)\n", + " return user_time_hob_info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:31:51.646110Z", + "start_time": "2020-11-17T06:31:51.171431Z" + } + }, + "outputs": [], + "source": [ + "user_time_hob_cols = ['user_id', 'click_timestamp', 'created_at_ts']\n", + "user_time_hob_info = user_time_hob_fea(all_data, user_time_hob_cols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的主题爱好\n", + "这里先把用户点击的文章属于的主题转成一个列表, 后面再总的汇总的时候单独制作一个特征, 就是文章的主题如果属于这里面, 就是1, 否则就是0。" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:31:56.571088Z", + "start_time": "2020-11-17T06:31:56.565304Z" + } + }, + "outputs": [], + "source": [ + "def user_cat_hob_fea(all_data, cols):\n", + " \"\"\"\n", + " 用户的主题爱好\n", + " :param all_data: 数据集\n", + " :param cols: 用到的特征列\n", + " \"\"\"\n", + " user_category_hob_info = all_data[cols]\n", + " user_category_hob_info = user_category_hob_info.groupby('user_id').agg({list}).reset_index()\n", + " \n", + " user_cat_hob_info = pd.DataFrame()\n", + " user_cat_hob_info['user_id'] = user_category_hob_info['user_id']\n", + " user_cat_hob_info['cate_list'] = user_category_hob_info['category_id']\n", + " \n", + " return user_cat_hob_info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:32:55.150800Z", + "start_time": "2020-11-17T06:32:00.740046Z" + } + }, + "outputs": [], + "source": [ + "user_category_hob_cols = ['user_id', 'category_id']\n", + "user_cat_hob_info = user_cat_hob_fea(all_data, user_category_hob_cols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的字数偏好特征" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:48:12.988460Z", + "start_time": "2020-11-17T06:48:12.547000Z" + } + }, + "outputs": [], + "source": [ + "user_wcou_info = all_data.groupby('user_id')['words_count'].agg('mean').reset_index()\n", + "user_wcou_info.rename(columns={'words_count': 'words_hbo'}, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的信息特征合并保存" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:48:18.289591Z", + "start_time": "2020-11-17T06:48:17.084408Z" + } + }, + "outputs": [], + "source": [ + "# 所有表进行合并\n", + "user_info = pd.merge(user_act_fea, user_device_info, on='user_id')\n", + "user_info = 
user_info.merge(user_time_hob_info, on='user_id')\n", + "user_info = user_info.merge(user_cat_hob_info, on='user_id')\n", + "user_info = user_info.merge(user_wcou_info, on='user_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:48:26.907785Z", + "start_time": "2020-11-17T06:48:21.457597Z" + } + }, + "outputs": [], + "source": [ + "# 这样用户特征以后就可以直接读取了\n", + "user_info.to_csv(save_path + 'user_info.csv', index=False) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户特征直接读入\n", + "如果前面关于用户的特征工程已经给做完了,后面可以直接读取" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:15:49.502826Z", + "start_time": "2020-11-18T03:15:48.062243Z" + } + }, + "outputs": [], + "source": [ + "# 把用户信息直接读入进来\n", + "user_info = pd.read_csv(save_path + 'user_info.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:15:56.899635Z", + "start_time": "2020-11-18T03:15:53.701818Z" + } + }, + "outputs": [], + "source": [ + "if os.path.exists(save_path + 'trn_user_item_feats_df.csv'):\n", + " trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')\n", + " \n", + "if os.path.exists(save_path + 'tst_user_item_feats_df.csv'):\n", + " tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')\n", + "\n", + "if os.path.exists(save_path + 'val_user_item_feats_df.csv'):\n", + " val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')\n", + "else:\n", + " val_user_item_feats_df = None" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:16:02.739197Z", + "start_time": "2020-11-18T03:16:01.725028Z" + } + }, + 
"outputs": [], + "source": [ + "# 拼上用户特征\n", + "# 下面是线下验证的\n", + "trn_user_item_feats_df = trn_user_item_feats_df.merge(user_info, on='user_id', how='left')\n", + "\n", + "if val_user_item_feats_df is not None:\n", + " val_user_item_feats_df = val_user_item_feats_df.merge(user_info, on='user_id', how='left')\n", + "else:\n", + " val_user_item_feats_df = None\n", + " \n", + "tst_user_item_feats_df = tst_user_item_feats_df.merge(user_info, on='user_id',how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:16:06.989877Z", + "start_time": "2020-11-18T03:16:06.983327Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['user_id', 'click_article_id', 'sim0', 'time_diff0', 'word_diff0',\n", + " 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score', 'rank', 'label',\n", + " 'click_size', 'time_diff_mean', 'active_level', 'click_environment',\n", + " 'click_deviceGroup', 'click_os', 'click_country', 'click_region',\n", + " 'click_referrer_type', 'user_time_hob1', 'user_time_hob2', 'cate_list',\n", + " 'words_hbo'],\n", + " dtype='object')" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trn_user_item_feats_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-14T03:13:36.071236Z", + "start_time": "2020-11-14T03:13:36.050188Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 文章的特征直接读入" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:16:12.793070Z", + "start_time": "2020-11-18T03:16:12.425380Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- Mem. 
usage decreased to 5.56 Mb (50.0% reduction),time spend:0.00 min\n" + ] + } + ], + "source": [ + "articles = pd.read_csv(data_path+'articles.csv')\n", + "articles = reduce_mem(articles)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:16:18.118507Z", + "start_time": "2020-11-18T03:16:16.344338Z" + } + }, + "outputs": [], + "source": [ + "# 拼上文章特征\n", + "trn_user_item_feats_df = trn_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')\n", + "\n", + "if val_user_item_feats_df is not None:\n", + " val_user_item_feats_df = val_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')\n", + "else:\n", + " val_user_item_feats_df = None\n", + "\n", + "tst_user_item_feats_df = tst_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 召回文章的主题是否在用户的爱好里面" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:17:40.251797Z", + "start_time": "2020-11-18T03:16:28.130012Z" + } + }, + "outputs": [], + "source": [ + "trn_user_item_feats_df['is_cat_hab'] = trn_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)\n", + "if val_user_item_feats_df is not None:\n", + " val_user_item_feats_df['is_cat_hab'] = val_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)\n", + "else:\n", + " val_user_item_feats_df = None\n", + "tst_user_item_feats_df['is_cat_hab'] = tst_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:19:30.451200Z", + "start_time": "2020-11-18T03:19:30.411225Z" + } + }, + "outputs": [], + "source": [ + "# 线下验证\n", + "del 
trn_user_item_feats_df['cate_list']\n", + "\n", + "if val_user_item_feats_df is not None:\n", + " del val_user_item_feats_df['cate_list']\n", + "else:\n", + " val_user_item_feats_df = None\n", + " \n", + "del tst_user_item_feats_df['cate_list']\n", + "\n", + "del trn_user_item_feats_df['article_id']\n", + "\n", + "if val_user_item_feats_df is not None:\n", + " del val_user_item_feats_df['article_id']\n", + "else:\n", + " val_user_item_feats_df = None\n", + " \n", + "del tst_user_item_feats_df['article_id']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 保存特征" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:20:08.560942Z", + "start_time": "2020-11-18T03:19:35.601095Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "# 训练验证特征\n", + "trn_user_item_feats_df.to_csv(save_path + 'trn_user_item_feats_df.csv', index=False)\n", + "if val_user_item_feats_df is not None:\n", + " val_user_item_feats_df.to_csv(save_path + 'val_user_item_feats_df.csv', index=False)\n", + "tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 总结\n", + "特征工程和数据清洗转换是比赛中至关重要的一块, 因为**数据和特征决定了机器学习的上限,而算法和模型只是逼近这个上限而已**,所以特征工程的好坏往往决定着最后的结果,**特征工程**可以一步增强数据的表达能力,通过构造新特征,我们可以挖掘出数据的更多信息,使得数据的表达能力进一步放大。 在本节内容中,我们主要是先通过制作特征和标签把预测问题转成了监督学习问题,然后围绕着用户画像和文章画像进行一系列特征的制作, 此外,为了保证正负样本的数据均衡,我们还学习了负采样就技术等。当然本节内容只是对构造特征提供了一些思路,也请学习者们在学习过程中开启头脑风暴,尝试更多的构造特征的方法,也欢迎我们一块探讨和交流。\n", + "\n", + "**关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", + "\n", + "![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" + ] } - }, - 
"outputs": [], - "source": [ - "trn_user_item_feats_df['is_cat_hab'] = trn_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)\n", - "if val_user_item_feats_df is not None:\n", - " val_user_item_feats_df['is_cat_hab'] = val_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)\n", - "else:\n", - " val_user_item_feats_df = None\n", - "tst_user_item_feats_df['is_cat_hab'] = tst_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:19:30.451200Z", - "start_time": "2020-11-18T03:19:30.411225Z" + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + }, + "tianchi_metadata": { + "competitions": [], + "datasets": [], + "description": "", + "notebookId": "130010", + "source": "dsw" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "218px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + 
"varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } - }, - "outputs": [], - "source": [ - "# 线下验证\n", - "del trn_user_item_feats_df['cate_list']\n", - "\n", - "if val_user_item_feats_df is not None:\n", - " del val_user_item_feats_df['cate_list']\n", - "else:\n", - " val_user_item_feats_df = None\n", - " \n", - "del tst_user_item_feats_df['cate_list']\n", - "\n", - "del trn_user_item_feats_df['article_id']\n", - "\n", - "if val_user_item_feats_df is not None:\n", - " del val_user_item_feats_df['article_id']\n", - "else:\n", - " val_user_item_feats_df = None\n", - " \n", - "del tst_user_item_feats_df['article_id']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 保存特征" - ] }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:20:08.560942Z", - "start_time": "2020-11-18T03:19:35.601095Z" - }, - "scrolled": true - }, - "outputs": [], - "source": [ - "# 训练验证特征\n", - "trn_user_item_feats_df.to_csv(save_path + 'trn_user_item_feats_df.csv', index=False)\n", - "if val_user_item_feats_df is not None:\n", - " val_user_item_feats_df.to_csv(save_path + 'val_user_item_feats_df.csv', index=False)\n", - "tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 总结\n", - "特征工程和数据清洗转换是比赛中至关重要的一块, 因为**数据和特征决定了机器学习的上限,而算法和模型只是逼近这个上限而已**,所以特征工程的好坏往往决定着最后的结果,**特征工程**可以一步增强数据的表达能力,通过构造新特征,我们可以挖掘出数据的更多信息,使得数据的表达能力进一步放大。 在本节内容中,我们主要是先通过制作特征和标签把预测问题转成了监督学习问题,然后围绕着用户画像和文章画像进行一系列特征的制作, 此外,为了保证正负样本的数据均衡,我们还学习了负采样就技术等。当然本节内容只是对构造特征提供了一些思路,也请学习者们在学习过程中开启头脑风暴,尝试更多的构造特征的方法,也欢迎我们一块探讨和交流。\n", - "\n", - "**关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 
Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", - "\n", - "![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - }, - "tianchi_metadata": { - "competitions": [], - "datasets": [], - "description": "", - "notebookId": "130010", - "source": "dsw" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "218px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" - }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " - } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git "a/docs/ch03/ch3.1/jupyter/\350\265\233\351\242\230\347\220\206\350\247\243+Baseline.ipynb" "b/docs/ch03/ch3.1/jupyter/\350\265\233\351\242\230\347\220\206\350\247\243+Baseline.ipynb" 
index 1dae6308d..1567babe2 100644 --- "a/docs/ch03/ch3.1/jupyter/\350\265\233\351\242\230\347\220\206\350\247\243+Baseline.ipynb" +++ "b/docs/ch03/ch3.1/jupyter/\350\265\233\351\242\230\347\220\206\350\247\243+Baseline.ipynb" @@ -1,664 +1,664 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 赛题理解\n", - "赛题理解是切入一道赛题的基础,会影响后续特征工程和模型构建等各种工作,也影响着后续发展工作的方向,正确了解赛题背后的思想以及赛题业务逻辑的清晰,有利于花费更少时间构建更为有效的特征模型, 在各种比赛中, 赛题理解都是极其重要且必须走好的第一步, 今天我们就从赛题的理解出发, 首先了解一下这次赛题的概况和数据,从中分析赛题以及大致的处理方式, 其次我们了解模型评测的指标,最后对赛题的理解整理一些经验。\n", - "\n", - "## 赛题简介\n", - "此次比赛是新闻推荐场景下的用户行为预测挑战赛, 该赛题是以新闻APP中的新闻推荐为背景, 目的是**要求我们根据用户历史浏览点击新闻文章的数据信息预测用户未来的点击行为, 即用户的最后一次点击的新闻文章**, 这道赛题的设计初衷是引导大家了解推荐系统中的一些业务背景, 解决实际问题。 \n", - "\n", - "## 数据概况\n", - "该数据来自某新闻APP平台的用户交互数据,包括30万用户,近300万次点击,共36万多篇不同的新闻文章,同时每篇新闻文章有对应的embedding向量表示。为了保证比赛的公平性,从中抽取20万用户的点击日志数据作为训练集,5万用户的点击日志数据作为测试集A,5万用户的点击日志数据作为测试集B。具体数据表和参数, 大家可以参考赛题说明。下面说一下拿到这样的数据如何进行理解, 来有效的开展下一步的工作。

\n", - "## 评价方式理解\n", - "理解评价方式, 我们需要结合着最后的提交文件来看, 根据sample.submit.csv, 我们最后提交的格式是针对每个用户, 我们都会给出五篇文章的推荐结果,按照点击概率从前往后排序。 而真实的每个用户最后一次点击的文章只会有一篇的真实答案, 所以我们就看我们推荐的这五篇里面是否有命中真实答案的。比如对于user1来说, 我们的提交会是:\n", - ">user1, article1, article2, article3, article4, article5.\n", - "\n", - "评价指标的公式如下:\n", - "$$\n", - "score(user) = \\sum_{k=1}^5 \\frac{s(user, k)}{k}\n", - "$$\n", - "\n", - "假如article1就是真实的用户点击文章,也就是article1命中, 则s(user1,1)=1, s(user1,2-4)都是0, 如果article2是用户点击的文章, 则s(user,2)=1/2,s(user,1,3,4,5)都是0。也就是score(user)=命中第几条的倒数。如果都没中, 则score(user1)=0。 这个是合理的, 因为我们希望的就是命中的结果尽量靠前, 而此时分数正好比较高。\n", - "\n", - "## 赛题理解\n", - "根据赛题简介,我们首先要明确我们此次比赛的目标: 根据用户历史浏览点击新闻的数据信息预测用户最后一次点击的新闻文章。从这个目标上看, 会发现此次比赛和我们之前遇到的普通的结构化比赛不太一样, 主要有两点:\n", - " \n", - "- 首先是目标上, 要预测最后一次点击的新闻文章,也就是我们给用户推荐的是新闻文章, 并不是像之前那种预测一个数或者预测数据哪一类那样的问题\n", - "- 数据上, 通过给出的数据我们会发现, 这种数据也不是我们之前遇到的那种特征+标签的数据,而是基于了真实的业务场景, 拿到的用户的点击日志\n", - "\n", - "所以拿到这个题目,我们的思考方向就是结合我们的目标,**把该预测问题转成一个监督学习的问题(特征+标签),然后我们才能进行ML,DL等建模预测**。那么我们自然而然的就应该在心里会有这么几个问题:如何转成一个监督学习问题呢? 转成一个什么样的监督学习问题呢? 我们能利用的特征又有哪些呢? 又有哪些模型可以尝试呢? 此次面对数万级别的文章推荐,我们又有哪些策略呢? \n", - "\n", - "当然这些问题不会在我们刚看到赛题之后就一下出来答案, 但是只要有了问题之后, 我们就能想办法解决问题了, 比如上面的第二个问题,转成一个什么样的监督学习问题? 由于我们是预测用户最后一次点击的新闻文章,从36万篇文章中预测某一篇的话我们首先可能会想到这可能是一个多分类的问题(36万类里面选1), 但是如此庞大的分类问题, 我们做起来可能比较困难, 那么能不能转化一下? 既然是要预测最后一次点击的文章, 那么如果我们能预测出某个用户最后一次对于某一篇文章会进行点击的概率, 是不是就间接性的解决了这个问题呢?概率最大的那篇文章不就是用户最后一次可能点击的新闻文章吗? 这样就把原问题变成了一个点击率预测的问题(用户, 文章) --> 点击的概率(软分类), 而这个问题, 就是我们所熟悉的监督学习领域分类问题了, 这样我们后面建模的时候, 对于模型的选择就基本上有大致方向了,比如最简单的逻辑回归模型。

\n", - "这样, 我们对于该赛题的解决方案应该有了一个大致的解决思路,要先转成一个分类问题来做, 而分类的标签就是用户是否会点击某篇文章,分类问题的特征中会有用户和文章,我们要训练一个分类模型, 对某用户最后一次点击某篇文章的概率进行预测。 那么又会有几个问题:如何转成监督学习问题? 训练集和测试集怎么制作? 我们又能利用哪些特征? 我们又可以尝试哪些模型? 面对36万篇文章, 20多万用户的推荐, 我们又有哪些策略来缩减问题的规模?如何进行最后的预测? " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Baseline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 导包" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:46:49.678700Z", - "start_time": "2020-11-16T07:46:49.673336Z" - } - }, - "outputs": [], - "source": [ - "# import packages\n", - "import time, math, os\n", - "from tqdm import tqdm\n", - "import gc\n", - "import pickle\n", - "import random\n", - "from datetime import datetime\n", - "from operator import itemgetter\n", - "import numpy as np\n", - "import pandas as pd\n", - "import warnings\n", - "from collections import defaultdict\n", - "import collections\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:48:34.240098Z", - "start_time": "2020-11-16T07:48:34.236370Z" - } - }, - "outputs": [], - "source": [ - "# data_path = './data_raw/'\n", - "data_path = '/home/admin/jupyter/data/' # 天池平台路径\n", - "save_path = '/home/admin/jupyter/temp_results/' # 天池平台路径" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## df节省内存函数" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# 节约内存的一个标配函数\n", - "def reduce_mem(df):\n", - " starttime = time.time()\n", - " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", - " start_mem = df.memory_usage().sum() / 1024**2\n", - " for col in df.columns:\n", - " col_type = df[col].dtypes\n", - " if col_type in numerics:\n", - " c_min = df[col].min()\n", - " c_max = df[col].max()\n", - " if 
pd.isnull(c_min) or pd.isnull(c_max):\n", - " continue\n", - " if str(col_type)[:3] == 'int':\n", - " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", - " df[col] = df[col].astype(np.int8)\n", - " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", - " df[col] = df[col].astype(np.int16)\n", - " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", - " df[col] = df[col].astype(np.int32)\n", - " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", - " df[col] = df[col].astype(np.int64)\n", - " else:\n", - " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", - " df[col] = df[col].astype(np.float16)\n", - " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", - " df[col] = df[col].astype(np.float32)\n", - " else:\n", - " df[col] = df[col].astype(np.float64)\n", - " end_mem = df.memory_usage().sum() / 1024**2\n", - " print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,\n", - " 100*(start_mem-end_mem)/start_mem,\n", - " (time.time()-starttime)/60))\n", - " return df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取采样或全量数据" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:48:50.619963Z", - "start_time": "2020-11-16T07:48:50.611667Z" - } - }, - "outputs": [], - "source": [ - "# debug模式:从训练集中划出一部分数据来调试代码\n", - "def get_all_click_sample(data_path, sample_nums=10000):\n", - " \"\"\"\n", - " 训练集中采样一部分数据调试\n", - " data_path: 原数据的存储路径\n", - " sample_nums: 采样数目(这里由于机器的内存限制,可以采样用户做)\n", - " \"\"\"\n", - " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " all_user_ids = all_click.user_id.unique()\n", - "\n", - " sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False) \n", - " all_click = 
all_click[all_click['user_id'].isin(sample_user_ids)]\n", - " \n", - " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", - " return all_click\n", - "\n", - "# 读取点击数据,这里分成线上和线下,如果是为了获取线上提交结果应该讲测试集中的点击数据合并到总的数据中\n", - "# 如果是为了线下验证模型的有效性或者特征的有效性,可以只使用训练集\n", - "def get_all_click_df(data_path='./data_raw/', offline=True):\n", - " if offline:\n", - " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " else:\n", - " trn_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", - "\n", - " all_click = trn_click.append(tst_click)\n", - " \n", - " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", - " return all_click" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# 全量训练集\n", - "all_click_df = get_all_click_df(data_path, offline=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 获取 用户 - 文章 - 点击时间字典" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:56:39.800240Z", - "start_time": "2020-11-16T07:56:39.793541Z" - } - }, - "outputs": [], - "source": [ - "# 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", - "def get_user_item_time(click_df):\n", - " \n", - " click_df = click_df.sort_values('click_timestamp')\n", - " \n", - " def make_item_time_pair(df):\n", - " return list(zip(df['click_article_id'], df['click_timestamp']))\n", - " \n", - " user_item_time_df = click_df.groupby('user_id')['click_article_id', 'click_timestamp'].apply(lambda x: make_item_time_pair(x))\\\n", - " .reset_index().rename(columns={0: 'item_time_list'})\n", - " user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))\n", - " \n", - " return user_item_time_dict" - ] - }, - { - "cell_type": "markdown", 
- "metadata": {}, - "source": [ - "## 获取点击最多的topk个文章" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# 获取近期点击最多的文章\n", - "def get_item_topk_click(click_df, k):\n", - " topk_click = click_df['click_article_id'].value_counts().index[:k]\n", - " return topk_click" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## itemcf的物品相似度计算" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:51:07.577037Z", - "start_time": "2020-11-16T07:51:07.568098Z" - } - }, - "outputs": [], - "source": [ - "def itemcf_sim(df):\n", - " \"\"\"\n", - " 文章与文章之间的相似性矩阵计算\n", - " :param df: 数据表\n", - " :item_created_time_dict: 文章创建时间的字典\n", - " return : 文章与文章的相似性矩阵\n", - " 思路: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习), 在多路召回部分会加上关联规则的召回策略\n", - " \"\"\"\n", - " \n", - " user_item_time_dict = get_user_item_time(df)\n", - " \n", - " # 计算物品相似度\n", - " i2i_sim = {}\n", - " item_cnt = defaultdict(int)\n", - " for user, item_time_list in tqdm(user_item_time_dict.items()):\n", - " # 在基于商品的协同过滤优化的时候可以考虑时间因素\n", - " for i, i_click_time in item_time_list:\n", - " item_cnt[i] += 1\n", - " i2i_sim.setdefault(i, {})\n", - " for j, j_click_time in item_time_list:\n", - " if(i == j):\n", - " continue\n", - " i2i_sim[i].setdefault(j, 0)\n", - " \n", - " i2i_sim[i][j] += 1 / math.log(len(item_time_list) + 1)\n", - " \n", - " i2i_sim_ = i2i_sim.copy()\n", - " for i, related_items in i2i_sim.items():\n", - " for j, wij in related_items.items():\n", - " i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])\n", - " \n", - " # 将得到的相似性矩阵保存到本地\n", - " pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))\n", - " \n", - " return i2i_sim_" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:53:10.038470Z", - "start_time": "2020-11-16T07:51:11.281176Z" - } - }, - "outputs": [ + 
"cells": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [00:23<00:00, 10802.38it/s]\n" - ] - } - ], - "source": [ - "i2i_sim = itemcf_sim(all_click_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## itemcf 的文章推荐" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T08:03:18.383215Z", - "start_time": "2020-11-16T08:03:18.373432Z" - } - }, - "outputs": [], - "source": [ - "# 基于商品的召回i2i\n", - "def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click):\n", - " \"\"\"\n", - " 基于文章协同过滤的召回\n", - " :param user_id: 用户id\n", - " :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", - " :param i2i_sim: 字典,文章相似性矩阵\n", - " :param sim_item_topk: 整数, 选择与当前文章最相似的前k篇文章\n", - " :param recall_item_num: 整数, 最后的召回文章数量\n", - " :param item_topk_click: 列表,点击次数最多的文章列表,用户召回补全 \n", - " return: 召回的文章列表 {item1:score1, item2: score2...}\n", - " 注意: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习), 在多路召回部分会加上关联规则的召回策略\n", - " \"\"\"\n", - " \n", - " # 获取用户历史交互的文章\n", - " user_hist_items = user_item_time_dict[user_id]\n", - " user_hist_items_ = {user_id for user_id, _ in user_hist_items}\n", - " \n", - " item_rank = {}\n", - " for loc, (i, click_time) in enumerate(user_hist_items):\n", - " for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:\n", - " if j in user_hist_items_:\n", - " continue\n", - " \n", - " item_rank.setdefault(j, 0)\n", - " item_rank[j] += wij\n", - " \n", - " # 不足10个,用热门商品补全\n", - " if len(item_rank) < recall_item_num:\n", - " for i, item in enumerate(item_topk_click):\n", - " if item in item_rank.items(): # 填充的item应该不在原来的列表中\n", - " continue\n", - " item_rank[item] = - i - 100 # 随便给个负数就行\n", - " if len(item_rank) == recall_item_num:\n", - " break\n", - " \n", - " item_rank = sorted(item_rank.items(), 
key=lambda x: x[1], reverse=True)[:recall_item_num]\n", - " \n", - " return item_rank" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 给每个用户根据物品的协同过滤推荐文章" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:15:01.109798Z", - "start_time": "2020-11-16T08:11:07.233787Z" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 赛题理解\n", + "赛题理解是切入一道赛题的基础,会影响后续特征工程和模型构建等各种工作,也影响着后续发展工作的方向,正确了解赛题背后的思想以及赛题业务逻辑的清晰,有利于花费更少时间构建更为有效的特征模型, 在各种比赛中, 赛题理解都是极其重要且必须走好的第一步, 今天我们就从赛题的理解出发, 首先了解一下这次赛题的概况和数据,从中分析赛题以及大致的处理方式, 其次我们了解模型评测的指标,最后对赛题的理解整理一些经验。\n", + "\n", + "## 赛题简介\n", + "此次比赛是新闻推荐场景下的用户行为预测挑战赛, 该赛题是以新闻APP中的新闻推荐为背景, 目的是**要求我们根据用户历史浏览点击新闻文章的数据信息预测用户未来的点击行为, 即用户的最后一次点击的新闻文章**, 这道赛题的设计初衷是引导大家了解推荐系统中的一些业务背景, 解决实际问题。 \n", + "\n", + "## 数据概况\n", + "该数据来自某新闻APP平台的用户交互数据,包括30万用户,近300万次点击,共36万多篇不同的新闻文章,同时每篇新闻文章有对应的embedding向量表示。为了保证比赛的公平性,从中抽取20万用户的点击日志数据作为训练集,5万用户的点击日志数据作为测试集A,5万用户的点击日志数据作为测试集B。具体数据表和参数, 大家可以参考赛题说明。下面说一下拿到这样的数据如何进行理解, 来有效的开展下一步的工作。

\n", + "## 评价方式理解\n", + "理解评价方式, 我们需要结合着最后的提交文件来看, 根据sample.submit.csv, 我们最后提交的格式是针对每个用户, 我们都会给出五篇文章的推荐结果,按照点击概率从前往后排序。 而真实的每个用户最后一次点击的文章只会有一篇的真实答案, 所以我们就看我们推荐的这五篇里面是否有命中真实答案的。比如对于user1来说, 我们的提交会是:\n", + ">user1, article1, article2, article3, article4, article5.\n", + "\n", + "评价指标的公式如下:\n", + "$$\n", + "score(user) = \\sum_{k=1}^5 \\frac{s(user, k)}{k}\n", + "$$\n", + "\n", + "假如article1就是真实的用户点击文章,也就是article1命中, 则s(user1,1)=1, s(user1,2-4)都是0, 如果article2是用户点击的文章, 则s(user,2)=1/2,s(user,1,3,4,5)都是0。也就是score(user)=命中第几条的倒数。如果都没中, 则score(user1)=0。 这个是合理的, 因为我们希望的就是命中的结果尽量靠前, 而此时分数正好比较高。\n", + "\n", + "## 赛题理解\n", + "根据赛题简介,我们首先要明确我们此次比赛的目标: 根据用户历史浏览点击新闻的数据信息预测用户最后一次点击的新闻文章。从这个目标上看, 会发现此次比赛和我们之前遇到的普通的结构化比赛不太一样, 主要有两点:\n", + " \n", + "- 首先是目标上, 要预测最后一次点击的新闻文章,也就是我们给用户推荐的是新闻文章, 并不是像之前那种预测一个数或者预测数据哪一类那样的问题\n", + "- 数据上, 通过给出的数据我们会发现, 这种数据也不是我们之前遇到的那种特征+标签的数据,而是基于了真实的业务场景, 拿到的用户的点击日志\n", + "\n", + "所以拿到这个题目,我们的思考方向就是结合我们的目标,**把该预测问题转成一个监督学习的问题(特征+标签),然后我们才能进行ML,DL等建模预测**。那么我们自然而然的就应该在心里会有这么几个问题:如何转成一个监督学习问题呢? 转成一个什么样的监督学习问题呢? 我们能利用的特征又有哪些呢? 又有哪些模型可以尝试呢? 此次面对数万级别的文章推荐,我们又有哪些策略呢? \n", + "\n", + "当然这些问题不会在我们刚看到赛题之后就一下出来答案, 但是只要有了问题之后, 我们就能想办法解决问题了, 比如上面的第二个问题,转成一个什么样的监督学习问题? 由于我们是预测用户最后一次点击的新闻文章,从36万篇文章中预测某一篇的话我们首先可能会想到这可能是一个多分类的问题(36万类里面选1), 但是如此庞大的分类问题, 我们做起来可能比较困难, 那么能不能转化一下? 既然是要预测最后一次点击的文章, 那么如果我们能预测出某个用户最后一次对于某一篇文章会进行点击的概率, 是不是就间接性的解决了这个问题呢?概率最大的那篇文章不就是用户最后一次可能点击的新闻文章吗? 这样就把原问题变成了一个点击率预测的问题(用户, 文章) --> 点击的概率(软分类), 而这个问题, 就是我们所熟悉的监督学习领域分类问题了, 这样我们后面建模的时候, 对于模型的选择就基本上有大致方向了,比如最简单的逻辑回归模型。

\n", + "这样, 我们对于该赛题的解决方案应该有了一个大致的解决思路,要先转成一个分类问题来做, 而分类的标签就是用户是否会点击某篇文章,分类问题的特征中会有用户和文章,我们要训练一个分类模型, 对某用户最后一次点击某篇文章的概率进行预测。 那么又会有几个问题:如何转成监督学习问题? 训练集和测试集怎么制作? 我们又能利用哪些特征? 我们又可以尝试哪些模型? 面对36万篇文章, 20多万用户的推荐, 我们又有哪些策略来缩减问题的规模?如何进行最后的预测? " + ] }, - "scrolled": true - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [43:19<00:00, 96.18it/s] \n" - ] - } - ], - "source": [ - "# 定义\n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "\n", - "# 获取 用户 - 文章 - 点击时间的字典\n", - "user_item_time_dict = get_user_item_time(all_click_df)\n", - "\n", - "# 去取文章相似度\n", - "i2i_sim = pickle.load(open(save_path + 'itemcf_i2i_sim.pkl', 'rb'))\n", - "\n", - "# 相似文章的数量\n", - "sim_item_topk = 10\n", - "\n", - "# 召回文章数量\n", - "recall_item_num = 10\n", - "\n", - "# 用户热度补全\n", - "item_topk_click = get_item_topk_click(all_click_df, k=50)\n", - "\n", - "for user in tqdm(all_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, \n", - " sim_item_topk, recall_item_num, item_topk_click)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 召回字典转换成df" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:16:36.647466Z", - "start_time": "2020-11-16T10:16:24.791219Z" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Baseline" + ] }, - "scrolled": true - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [00:04<00:00, 53319.08it/s]\n" - ] - } - ], - "source": [ - "# 将字典的形式转换成df\n", - "user_item_score_list = []\n", - "\n", - "for user, items in tqdm(user_recall_items_dict.items()):\n", - " for item, score in items:\n", - " user_item_score_list.append([user, item, score])\n", - "\n", - "recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])" - ] 
- }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 生成提交文件" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:16:46.268341Z", - "start_time": "2020-11-16T10:16:46.259293Z" - } - }, - "outputs": [], - "source": [ - "# 生成提交文件\n", - "def submit(recall_df, topk=5, model_name=None):\n", - " recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])\n", - " recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - " \n", - " # 判断是不是每个用户都有5篇文章及以上\n", - " tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())\n", - " assert tmp.min() >= topk\n", - " \n", - " del recall_df['pred_score']\n", - " submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()\n", - " \n", - " submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]\n", - " # 按照提交格式定义列名\n", - " submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', \n", - " 3: 'article_3', 4: 'article_4', 5: 'article_5'})\n", - " \n", - " save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'\n", - " submit.to_csv(save_name, index=False, header=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:17:42.254328Z", - "start_time": "2020-11-16T10:17:32.211862Z" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 导包" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:46:49.678700Z", + "start_time": "2020-11-16T07:46:49.673336Z" + } + }, + "outputs": [], + "source": [ + "# import packages\n", + "import time, math, os\n", + "from tqdm import tqdm\n", + "import gc\n", + "import pickle\n", + "import random\n", + "from datetime import datetime\n", + "from operator import 
itemgetter\n", + "import numpy as np\n", + "import pandas as pd\n", + "import warnings\n", + "from collections import defaultdict\n", + "import collections\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:48:34.240098Z", + "start_time": "2020-11-16T07:48:34.236370Z" + } + }, + "outputs": [], + "source": [ + "# data_path = './data_raw/'\n", + "data_path = '/home/admin/jupyter/data/' # 天池平台路径\n", + "save_path = '/home/admin/jupyter/temp_results/' # 天池平台路径" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## df节省内存函数" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# 节约内存的一个标配函数\n", + "def reduce_mem(df):\n", + " starttime = time.time()\n", + " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", + " start_mem = df.memory_usage().sum() / 1024**2\n", + " for col in df.columns:\n", + " col_type = df[col].dtypes\n", + " if col_type in numerics:\n", + " c_min = df[col].min()\n", + " c_max = df[col].max()\n", + " if pd.isnull(c_min) or pd.isnull(c_max):\n", + " continue\n", + " if str(col_type)[:3] == 'int':\n", + " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", + " df[col] = df[col].astype(np.int8)\n", + " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", + " df[col] = df[col].astype(np.int16)\n", + " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", + " df[col] = df[col].astype(np.int32)\n", + " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", + " df[col] = df[col].astype(np.int64)\n", + " else:\n", + " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", + " df[col] = df[col].astype(np.float16)\n", + " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", + " df[col] = 
df[col].astype(np.float32)\n", + " else:\n", + " df[col] = df[col].astype(np.float64)\n", + " end_mem = df.memory_usage().sum() / 1024**2\n", + " print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,\n", + " 100*(start_mem-end_mem)/start_mem,\n", + " (time.time()-starttime)/60))\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取采样或全量数据" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:48:50.619963Z", + "start_time": "2020-11-16T07:48:50.611667Z" + } + }, + "outputs": [], + "source": [ + "# debug模式:从训练集中划出一部分数据来调试代码\n", + "def get_all_click_sample(data_path, sample_nums=10000):\n", + " \"\"\"\n", + " 训练集中采样一部分数据调试\n", + " data_path: 原数据的存储路径\n", + " sample_nums: 采样数目(这里由于机器的内存限制,可以采样用户做)\n", + " \"\"\"\n", + " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " all_user_ids = all_click.user_id.unique()\n", + "\n", + " sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False) \n", + " all_click = all_click[all_click['user_id'].isin(sample_user_ids)]\n", + " \n", + " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", + " return all_click\n", + "\n", + "# 读取点击数据,这里分成线上和线下,如果是为了获取线上提交结果应该讲测试集中的点击数据合并到总的数据中\n", + "# 如果是为了线下验证模型的有效性或者特征的有效性,可以只使用训练集\n", + "def get_all_click_df(data_path='./data_raw/', offline=True):\n", + " if offline:\n", + " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " else:\n", + " trn_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", + "\n", + " all_click = trn_click.append(tst_click)\n", + " \n", + " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", + " return all_click" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + 
"outputs": [], + "source": [ + "# 全量训练集\n", + "all_click_df = get_all_click_df(data_path, offline=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 获取 用户 - 文章 - 点击时间字典" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:56:39.800240Z", + "start_time": "2020-11-16T07:56:39.793541Z" + } + }, + "outputs": [], + "source": [ + "# 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", + "def get_user_item_time(click_df):\n", + " \n", + " click_df = click_df.sort_values('click_timestamp')\n", + " \n", + " def make_item_time_pair(df):\n", + " return list(zip(df['click_article_id'], df['click_timestamp']))\n", + " \n", + " user_item_time_df = click_df.groupby('user_id')['click_article_id', 'click_timestamp'].apply(lambda x: make_item_time_pair(x))\\\n", + " .reset_index().rename(columns={0: 'item_time_list'})\n", + " user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))\n", + " \n", + " return user_item_time_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 获取点击最多的topk个文章" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# 获取近期点击最多的文章\n", + "def get_item_topk_click(click_df, k):\n", + " topk_click = click_df['click_article_id'].value_counts().index[:k]\n", + " return topk_click" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## itemcf的物品相似度计算" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:51:07.577037Z", + "start_time": "2020-11-16T07:51:07.568098Z" + } + }, + "outputs": [], + "source": [ + "def itemcf_sim(df):\n", + " \"\"\"\n", + " 文章与文章之间的相似性矩阵计算\n", + " :param df: 数据表\n", + " :item_created_time_dict: 文章创建时间的字典\n", + " return : 文章与文章的相似性矩阵\n", + " 思路: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习), 在多路召回部分会加上关联规则的召回策略\n", 
+ " \"\"\"\n", + " \n", + " user_item_time_dict = get_user_item_time(df)\n", + " \n", + " # 计算物品相似度\n", + " i2i_sim = {}\n", + " item_cnt = defaultdict(int)\n", + " for user, item_time_list in tqdm(user_item_time_dict.items()):\n", + " # 在基于商品的协同过滤优化的时候可以考虑时间因素\n", + " for i, i_click_time in item_time_list:\n", + " item_cnt[i] += 1\n", + " i2i_sim.setdefault(i, {})\n", + " for j, j_click_time in item_time_list:\n", + " if(i == j):\n", + " continue\n", + " i2i_sim[i].setdefault(j, 0)\n", + " \n", + " i2i_sim[i][j] += 1 / math.log(len(item_time_list) + 1)\n", + " \n", + " i2i_sim_ = i2i_sim.copy()\n", + " for i, related_items in i2i_sim.items():\n", + " for j, wij in related_items.items():\n", + " i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])\n", + " \n", + " # 将得到的相似性矩阵保存到本地\n", + " pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))\n", + " \n", + " return i2i_sim_" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:53:10.038470Z", + "start_time": "2020-11-16T07:51:11.281176Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [00:23<00:00, 10802.38it/s]\n" + ] + } + ], + "source": [ + "i2i_sim = itemcf_sim(all_click_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## itemcf 的文章推荐" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T08:03:18.383215Z", + "start_time": "2020-11-16T08:03:18.373432Z" + } + }, + "outputs": [], + "source": [ + "# 基于商品的召回i2i\n", + "def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click):\n", + " \"\"\"\n", + " 基于文章协同过滤的召回\n", + " :param user_id: 用户id\n", + " :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", + " :param i2i_sim: 字典,文章相似性矩阵\n", + " :param 
sim_item_topk: 整数, 选择与当前文章最相似的前k篇文章\n", + " :param recall_item_num: 整数, 最后的召回文章数量\n", + " :param item_topk_click: 列表,点击次数最多的文章列表,用户召回补全 \n", + " return: 召回的文章列表 {item1:score1, item2: score2...}\n", + " 注意: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习), 在多路召回部分会加上关联规则的召回策略\n", + " \"\"\"\n", + " \n", + " # 获取用户历史交互的文章\n", + " user_hist_items = user_item_time_dict[user_id]\n", + " user_hist_items_ = {user_id for user_id, _ in user_hist_items}\n", + " \n", + " item_rank = {}\n", + " for loc, (i, click_time) in enumerate(user_hist_items):\n", + " for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:\n", + " if j in user_hist_items_:\n", + " continue\n", + " \n", + " item_rank.setdefault(j, 0)\n", + " item_rank[j] += wij\n", + " \n", + " # 不足10个,用热门商品补全\n", + " if len(item_rank) < recall_item_num:\n", + " for i, item in enumerate(item_topk_click):\n", + " if item in item_rank.items(): # 填充的item应该不在原来的列表中\n", + " continue\n", + " item_rank[item] = - i - 100 # 随便给个负数就行\n", + " if len(item_rank) == recall_item_num:\n", + " break\n", + " \n", + " item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]\n", + " \n", + " return item_rank" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 给每个用户根据物品的协同过滤推荐文章" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:15:01.109798Z", + "start_time": "2020-11-16T08:11:07.233787Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [43:19<00:00, 96.18it/s] \n" + ] + } + ], + "source": [ + "# 定义\n", + "user_recall_items_dict = collections.defaultdict(dict)\n", + "\n", + "# 获取 用户 - 文章 - 点击时间的字典\n", + "user_item_time_dict = get_user_item_time(all_click_df)\n", + "\n", + "# 去取文章相似度\n", + "i2i_sim = pickle.load(open(save_path + 'itemcf_i2i_sim.pkl', 'rb'))\n", + "\n", + "# 相似文章的数量\n", + "sim_item_topk 
= 10\n", + "\n", + "# 召回文章数量\n", + "recall_item_num = 10\n", + "\n", + "# 用户热度补全\n", + "item_topk_click = get_item_topk_click(all_click_df, k=50)\n", + "\n", + "for user in tqdm(all_click_df['user_id'].unique()):\n", + " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, \n", + " sim_item_topk, recall_item_num, item_topk_click)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 召回字典转换成df" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:16:36.647466Z", + "start_time": "2020-11-16T10:16:24.791219Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [00:04<00:00, 53319.08it/s]\n" + ] + } + ], + "source": [ + "# 将字典的形式转换成df\n", + "user_item_score_list = []\n", + "\n", + "for user, items in tqdm(user_recall_items_dict.items()):\n", + " for item, score in items:\n", + " user_item_score_list.append([user, item, score])\n", + "\n", + "recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 生成提交文件" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:16:46.268341Z", + "start_time": "2020-11-16T10:16:46.259293Z" + } + }, + "outputs": [], + "source": [ + "# 生成提交文件\n", + "def submit(recall_df, topk=5, model_name=None):\n", + " recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])\n", + " recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + " \n", + " # 判断是不是每个用户都有5篇文章及以上\n", + " tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())\n", + " assert tmp.min() >= topk\n", + " \n", + " del recall_df['pred_score']\n", + " submit = recall_df[recall_df['rank'] <= 
topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()\n", + " \n", + " submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]\n", + " # 按照提交格式定义列名\n", + " submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', \n", + " 3: 'article_3', 4: 'article_4', 5: 'article_5'})\n", + " \n", + " save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'\n", + " submit.to_csv(save_name, index=False, header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:17:42.254328Z", + "start_time": "2020-11-16T10:17:32.211862Z" + } + }, + "outputs": [], + "source": [ + "# 获取测试集\n", + "tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", + "tst_users = tst_click['user_id'].unique()\n", + "\n", + "# 从所有的召回数据中将测试集中的用户选出来\n", + "tst_recall = recall_df[recall_df['user_id'].isin(tst_users)]\n", + "\n", + "# 生成提交文件\n", + "submit(tst_recall, topk=5, model_name='itemcf_baseline')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 总结\n", + "本节内容主要包括赛题简介,数据概况,评价方式以及对该赛题进行了一个总体上的思路分析,作为竞赛前的预热,旨在帮助学习者们能够更好切入该赛题,为后面的学习内容打下一个良好的基础。最后我们给出了关于本赛题的一个简易Baseline, 帮助学习者们先了解一下新闻推荐比赛的一个整理流程, 接下来我们就对于流程中的每个步骤进行详细的介绍。\n", + "\n", + "今天的学习比较简单,下面整理一下关于赛题理解的一些经验:\n", + "\n", + "* 赛题理解究竟是在理解什么? 
\n", + "\n", + ">**理解赛题**:从直观上对问题进行梳理, 分析问题的目标,到底要让做什么事情, **这个非常重要**\n", + ">\n", + ">**理解数据**:对赛题数据有一个初步了解,知道和任务相关的数据字段和数据字段的类型, 数据之间的内在关联等,大体梳理一下哪些数据会对我们解决问题非常有用,方便后面我们的数据分析和特征工程。\n", + ">\n", + ">**理解评估指标**:评估指标是检验我们提出的方法,我们给出结果好坏的标准,只有正确的理解了评估指标,我们才能进行更好的训练模型,更好的进行预测。此外,很多情况下,线上验证是有一定的时间和次数限制的,**所以在比赛中构建一个合理的本地的验证集和验证的评价指标是很关键的步骤,能有效的节省很多时间**。 不同的指标对于同样的预测结果是具有误差敏感的差异性的所以不同的评价指标会影响后续一些预测的侧重点。\n", + "\n", + "* 有了赛题理解之后,我们该做什么?\n", + "\n", + " >在对于赛题有了一定的了解后,分析清楚了问题的类型性质和对于数据理解 的这一基础上,我们可以梳理一个解决赛题的一个大题思路和框架\n", + " >\n", + " >我们至少要有一些相应的理解分析,比如**这题的难点可能在哪里,关键点可能在哪里,哪些地方可以挖掘更好的特征**.\n", + " >\n", + " >用什么样得线下验证方式更为稳定,**出现了过拟合或者其他问题,估摸可以用什么方法去解决这些问题**\n", + "\n", + " 这时是在一个宏观的大体下分析的,有助于摸清整个题的思路脉络,以及后续的分析方向\n", + "\n", + "**关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", + "\n", + "![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" + ] } - }, - "outputs": [], - "source": [ - "# 获取测试集\n", - "tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", - "tst_users = tst_click['user_id'].unique()\n", - "\n", - "# 从所有的召回数据中将测试集中的用户选出来\n", - "tst_recall = recall_df[recall_df['user_id'].isin(tst_users)]\n", - "\n", - "# 生成提交文件\n", - "submit(tst_recall, topk=5, model_name='itemcf_baseline')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 总结\n", - "本节内容主要包括赛题简介,数据概况,评价方式以及对该赛题进行了一个总体上的思路分析,作为竞赛前的预热,旨在帮助学习者们能够更好切入该赛题,为后面的学习内容打下一个良好的基础。最后我们给出了关于本赛题的一个简易Baseline, 帮助学习者们先了解一下新闻推荐比赛的一个整理流程, 接下来我们就对于流程中的每个步骤进行详细的介绍。\n", - "\n", - "今天的学习比较简单,下面整理一下关于赛题理解的一些经验:\n", - "\n", - "* 赛题理解究竟是在理解什么? 
\n", - "\n", - ">**理解赛题**:从直观上对问题进行梳理, 分析问题的目标,到底要让做什么事情, **这个非常重要**\n", - ">\n", - ">**理解数据**:对赛题数据有一个初步了解,知道和任务相关的数据字段和数据字段的类型, 数据之间的内在关联等,大体梳理一下哪些数据会对我们解决问题非常有用,方便后面我们的数据分析和特征工程。\n", - ">\n", - ">**理解评估指标**:评估指标是检验我们提出的方法,我们给出结果好坏的标准,只有正确的理解了评估指标,我们才能进行更好的训练模型,更好的进行预测。此外,很多情况下,线上验证是有一定的时间和次数限制的,**所以在比赛中构建一个合理的本地的验证集和验证的评价指标是很关键的步骤,能有效的节省很多时间**。 不同的指标对于同样的预测结果是具有误差敏感的差异性的所以不同的评价指标会影响后续一些预测的侧重点。\n", - "\n", - "* 有了赛题理解之后,我们该做什么?\n", - "\n", - " >在对于赛题有了一定的了解后,分析清楚了问题的类型性质和对于数据理解 的这一基础上,我们可以梳理一个解决赛题的一个大题思路和框架\n", - " >\n", - " >我们至少要有一些相应的理解分析,比如**这题的难点可能在哪里,关键点可能在哪里,哪些地方可以挖掘更好的特征**.\n", - " >\n", - " >用什么样得线下验证方式更为稳定,**出现了过拟合或者其他问题,估摸可以用什么方法去解决这些问题**\n", - "\n", - " 这时是在一个宏观的大体下分析的,有助于摸清整个题的思路脉络,以及后续的分析方向\n", - "\n", - "**关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", - "\n", - "![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - }, - "latex_envs": { - "LaTeX_envs_menu_present": true, - "autoclose": false, - "autocomplete": true, - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 1, - "hotkeys": { - "equation": "Ctrl-E", - "itemize": "Ctrl-I" - }, - "labels_anchors": false, - "latex_user_defs": false, - "report_style_numbering": false, - "user_envs_cfg": false - }, - "tianchi_metadata": { - "competitions": [], - "datasets": [], - 
"description": "", - "notebookId": "130006", - "source": "dsw" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "170px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "tianchi_metadata": { + "competitions": [], + "datasets": [], + "description": "", + "notebookId": "130006", + "source": "dsw" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + 
"toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "170px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/docs/ch03/ch3.1/markdown/ch3.1.1.md b/docs/ch03/ch3.1/markdown/ch3.1.1.md index 645152157..5c3930fe0 100644 --- a/docs/ch03/ch3.1/markdown/ch3.1.1.md +++ b/docs/ch03/ch3.1/markdown/ch3.1.1.md @@ -377,7 +377,7 @@ submit(tst_recall, topk=5, model_name='itemcf_baseline') **关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale: -![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) +![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) diff --git a/docs/ch03/ch3.1/markdown/ch3.1.2.md b/docs/ch03/ch3.1/markdown/ch3.1.2.md index 173d95002..5584973fa 100644 --- a/docs/ch03/ch3.1/markdown/ch3.1.2.md +++ b/docs/ch03/ch3.1/markdown/ch3.1.2.md @@ -66,7 +66,7 @@ trn_click = 
trn_click.merge(item_df, how='left', on=['click_article_id']) trn_click.head() ``` -![image-20201119112706647](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112706647.png) +![image-20201119112706647](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112706647.png) **train_click_log.csv文件数据中每个字段的含义** @@ -86,7 +86,7 @@ trn_click.head() trn_click.info() ``` -![image-20201119112622939](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112622939.png) +![image-20201119112622939](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112622939.png) @@ -94,7 +94,7 @@ trn_click.info() trn_click.describe() ``` -![image-20201119112649376](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112649376.png) +![image-20201119112649376](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112649376.png) ```python @@ -133,7 +133,7 @@ plt.tight_layout() plt.show() ``` -![在这里插入图片描述](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/20201118000820300.png) +![在这里插入图片描述](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/20201118000820300.png) **从点击时间clik_timestamp来看,分布较为平均,可不做特殊处理。由于时间戳是13位的,后续将时间格式转换成10位方便计算。** @@ -149,14 +149,14 @@ tst_click = tst_click.merge(item_df, how='left', on=['click_article_id']) tst_click.head() ``` -![image-20201119112952261](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112952261.png) +![image-20201119112952261](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112952261.png) ```python tst_click.describe() ``` -![image-20201119113015529](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113015529.png) +![image-20201119113015529](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113015529.png) **我们可以看出训练集和测试集的用户是完全不一样的** @@ -187,14 +187,14 @@ tst_click.groupby('user_id')['click_article_id'].count().min() # 注意测试集 item_df.head().append(item_df.tail()) ``` -![image-20201119113118388](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113118388.png) 
+![image-20201119113118388](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113118388.png) ```python item_df['words_count'].value_counts() ``` -![image-20201119113147240](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113147240.png) +![image-20201119113147240](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113147240.png) ```python @@ -219,7 +219,7 @@ item_df.shape # 364047篇文章 item_emb_df.head() ``` -![image-20201119113253455](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113253455.png) +![image-20201119113253455](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113253455.png) ```python item_emb_df.shape @@ -245,21 +245,21 @@ user_click_count = user_click_merge.groupby(['user_id', 'click_article_id'])['cl user_click_count[:10] ``` -![image-20201119113334727](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113334727.png) +![image-20201119113334727](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113334727.png) ```python user_click_count[user_click_count['count']>7] ``` -![image-20201119113351807](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113351807.png) +![image-20201119113351807](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113351807.png) ```python user_click_count['count'].unique() ``` -![image-20201119113429769](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113429769.png) +![image-20201119113429769](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113429769.png) ```python @@ -267,7 +267,7 @@ user_click_count['count'].unique() user_click_count.loc[:,'count'].value_counts() ``` -![image-20201119113414785](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113414785.png) +![image-20201119113414785](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113414785.png) **可以看出:有1605541(约占99.2%)的用户未重复阅读过文章,仅有极少数用户重复点击过某篇文章。 这个也可以单独制作成特征** @@ -301,15 +301,15 @@ for _, user_df in sample_users.groupby('user_id'): 
plot_envs(user_df, cols, 2, 3) ``` -![image-20201119113624424](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113624424.png) +![image-20201119113624424](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113624424.png) -![image-20201119113637746](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113637746.png) +![image-20201119113637746](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113637746.png) -![image-20201119113652132](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113652132.png) +![image-20201119113652132](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113652132.png) -![image-20201119113702034](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113702034.png) +![image-20201119113702034](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113702034.png) -![image-20201119113714135](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113714135.png) +![image-20201119113714135](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113714135.png) **可以看出绝大多数数的用户的点击环境是比较固定的。思路:可以基于这些环境的统计特征来代表该用户本身的属性** @@ -322,7 +322,7 @@ plt.plot(user_click_item_count) ``` -![image-20201119113759490](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113759490.png) +![image-20201119113759490](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113759490.png) **可以根据用户的点击文章次数看出用户的活跃度** @@ -332,7 +332,7 @@ plt.plot(user_click_item_count) plt.plot(user_click_item_count[:50]) ``` -![image-20201119113825586](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113825586.png) +![image-20201119113825586](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113825586.png) **点击次数排前50的用户的点击次数都在100次以上。思路:我们可以定义点击次数大于等于100次的用户为活跃用户,这是一种简单的处理思路, 判断用户活跃度,更加全面的是再结合上点击时间,后面我们会基于点击次数和点击时间两个方面来判断用户活跃度。** @@ -342,7 +342,7 @@ plt.plot(user_click_item_count[:50]) plt.plot(user_click_item_count[25000:50000]) ``` 
-![image-20201119113844946](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113844946.png) +![image-20201119113844946](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113844946.png) **可以看出点击次数小于等于两次的用户非常的多,这些用户可以认为是非活跃用户** @@ -358,14 +358,14 @@ item_click_count = sorted(user_click_merge.groupby('click_article_id')['user_id' plt.plot(item_click_count) ``` -![image-20201119113912912](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113912912.png) +![image-20201119113912912](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113912912.png) ```python plt.plot(item_click_count[:100]) ``` -![image-20201119113930745](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113930745.png) +![image-20201119113930745](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113930745.png) **可以看出点击次数最多的前100篇新闻,点击次数大于1000次** @@ -374,7 +374,7 @@ plt.plot(item_click_count[:100]) plt.plot(item_click_count[:20]) ``` -![image-20201119113958254](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113958254.png) +![image-20201119113958254](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113958254.png) **点击次数最多的前20篇新闻,点击次数大于2500。思路:可以定义这些新闻为热门新闻, 这个也是简单的处理方式,后面我们也是根据点击次数和时间进行文章热度的一个划分。** @@ -383,7 +383,7 @@ plt.plot(item_click_count[:20]) plt.plot(item_click_count[3500:]) ``` -![image-20201119114017762](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114017762.png) +![image-20201119114017762](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114017762.png) **可以发现很多新闻只被点击过一两次。思路:可以定义这些新闻是冷门新闻。** @@ -397,7 +397,7 @@ union_item = tmp.groupby(['click_article_id','next_item'])['click_timestamp'].ag union_item[['count']].describe() ``` -![image-20201119114044351](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114044351.png) +![image-20201119114044351](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114044351.png) **由统计数据可以看出,平均共现次数2.88,最高为1687。** @@ -411,14 +411,14 @@ y = 
union_item['count'] plt.scatter(x, y) ``` -![image-20201119114106223](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114106223.png) +![image-20201119114106223](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114106223.png) ```python plt.plot(union_item['count'].values[40000:]) ``` -![image-20201119114122557](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114122557.png) +![image-20201119114122557](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114122557.png) **大概有70000个pair至少共现一次。** @@ -432,7 +432,7 @@ plt.plot(union_item['count'].values[40000:]) plt.plot(user_click_merge['category_id'].value_counts().values) ``` -![image-20201119114144058](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114144058.png) +![image-20201119114144058](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114144058.png) ```python @@ -440,7 +440,7 @@ plt.plot(user_click_merge['category_id'].value_counts().values) plt.plot(user_click_merge['category_id'].value_counts().values[150:]) ``` -![image-20201119114201764](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114201764.png) +![image-20201119114201764](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114201764.png) ```python @@ -455,7 +455,7 @@ user_click_merge['words_count'].describe() plt.plot(user_click_merge['words_count'].values) ``` -![image-20201119114241194](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114241194.png) +![image-20201119114241194](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114241194.png) @@ -469,7 +469,7 @@ plt.plot(sorted(user_click_merge.groupby('user_id')['category_id'].nunique(), re ``` -![image-20201119114300286](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114300286.png) +![image-20201119114300286](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114300286.png) **从上图中可以看出有一小部分用户阅读类型是极其广泛的,大部分人都处在20个新闻类型以下。** @@ -478,7 +478,7 @@ 
plt.plot(sorted(user_click_merge.groupby('user_id')['category_id'].nunique(), re user_click_merge.groupby('user_id')['category_id'].nunique().reset_index().describe() ``` -![image-20201119114318523](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114318523.png) +![image-20201119114318523](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114318523.png) ### 用户查看文章的长度的分布 @@ -490,7 +490,7 @@ plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), rever ``` -![image-20201119114337448](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114337448.png) +![image-20201119114337448](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114337448.png) @@ -504,7 +504,7 @@ plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), rever plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True)[1000:45000]) ``` -![image-20201119114355195](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114355195.png) +![image-20201119114355195](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114355195.png) **可以发现大多数人都是看250字以下的文章** @@ -514,7 +514,7 @@ plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), rever user_click_merge.groupby('user_id')['words_count'].mean().reset_index().describe() ``` -![image-20201119114418911](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114418911.png) +![image-20201119114418911](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114418911.png) @@ -536,7 +536,7 @@ user_click_merge = user_click_merge.sort_values('click_timestamp') user_click_merge.head() ``` -![image-20201119114447904](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114447904.png) +![image-20201119114447904](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114447904.png) ```python @@ -558,7 +558,7 @@ mean_diff_click_time = user_click_merge.groupby('user_id')['click_timestamp', 'c 
plt.plot(sorted(mean_diff_click_time.values, reverse=True)) ``` -![image-20201119114505086](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114505086.png) +![image-20201119114505086](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114505086.png) **从上图可以发现不同用户点击文章的时间差是有差异的。** @@ -573,7 +573,7 @@ mean_diff_created_time = user_click_merge.groupby('user_id')['click_timestamp', plt.plot(sorted(mean_diff_created_time.values, reverse=True)) ``` -![image-20201119122227666](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122227666.png) +![image-20201119122227666](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122227666.png) **从图中可以发现用户先后点击文章,文章的创建时间也是有差异的** @@ -602,7 +602,7 @@ sub_user_info = user_click_merge[user_click_merge['user_id'].isin(sub_user_ids)] sub_user_info.head() ``` -![image-20201119122251274](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122251274.png) +![image-20201119122251274](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122251274.png) ```python @@ -625,7 +625,7 @@ for _, user_df in sub_user_info.groupby('user_id'): ``` -![image-20201119122310969](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122310969.png) +![image-20201119122310969](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122310969.png) @@ -654,5 +654,5 @@ for _, user_df in sub_user_info.groupby('user_id'): **关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale: -![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) +![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) diff --git a/docs/ch03/ch3.1/markdown/ch3.1.3.md b/docs/ch03/ch3.1/markdown/ch3.1.3.md index 323cf46fe..9bf554093 100644 
--- a/docs/ch03/ch3.1/markdown/ch3.1.3.md +++ b/docs/ch03/ch3.1/markdown/ch3.1.3.md @@ -2,7 +2,7 @@ 所谓的“多路召回”策略,就是指采用不同的策略、特征或简单模型,分别召回一部分候选集,然后把候选集混合在一起供后续排序模型使用,可以明显的看出,“多路召回策略”是在“计算速度”和“召回率”之间进行权衡的结果。其中,各种简单策略保证候选集的快速召回,从不同角度设计的策略保证召回率接近理想的状态,不至于损伤排序效果。如下图是多路召回的一个示意图,在多路召回中,每个策略之间毫不相关,所以一般可以写并发多线程同时进行,这样可以更加高效。 -image-20201119132726873 +image-20201119132726873 上图只是一个多路召回的例子,也就是说可以使用多种不同的策略来获取用户排序的候选商品集合,而具体使用哪些召回策略其实是与业务强相关的 ,针对不同的任务就会有对于该业务真实场景下需要考虑的召回规则。例如新闻推荐,召回规则可以是“热门视频”、“导演召回”、“演员召回”、“最近上映“、”流行趋势“、”类型召回“等等。 @@ -1344,4 +1344,4 @@ final_recall_items_dict_rank = combine_recall_results(user_multi_recall_dict, we **关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale: -![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) +![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) diff --git a/docs/ch03/ch3.1/markdown/ch3.1.4.md b/docs/ch03/ch3.1/markdown/ch3.1.4.md index 197765e8b..e5e267f0e 100644 --- a/docs/ch03/ch3.1/markdown/ch3.1.4.md +++ b/docs/ch03/ch3.1/markdown/ch3.1.4.md @@ -193,7 +193,7 @@ Word2Vec主要思想是:一个词的上下文可以很好的表达出词的语 - skip-gram:已知中心词预测周围词。 - cbow:已知周围词预测中心词。 -![image-20201106225233086](http://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20201106225233086.png) +![image-20201106225233086](https://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20201106225233086.png) 在使用gensim训练word2vec的时候,有几个比较重要的参数 - size: 表示词向量的维度。 @@ -985,5 +985,5 @@ tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=Fa **关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 
用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale: -![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) +![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) diff --git a/docs/ch03/ch3.1/markdown/ch3.1.5.md b/docs/ch03/ch3.1/markdown/ch3.1.5.md index 9fef3fda5..0e8f45abe 100644 --- a/docs/ch03/ch3.1/markdown/ch3.1.5.md +++ b/docs/ch03/ch3.1/markdown/ch3.1.5.md @@ -407,7 +407,7 @@ tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_be 我们下面尝试使用DIN模型, DIN的全称是Deep Interest Network, 这是阿里2018年基于前面的深度学习模型无法表达用户多样化的兴趣而提出的一个模型, 它可以通过考虑【给定的候选广告】和【用户的历史行为】的相关性,来计算用户兴趣的表示向量。具体来说就是通过引入局部激活单元,通过软搜索历史行为的相关部分来关注相关的用户兴趣,并采用加权和来获得有关候选广告的用户兴趣的表示。与候选广告相关性较高的行为会获得较高的激活权重,并支配着用户兴趣。该表示向量在不同广告上有所不同,大大提高了模型的表达能力。所以该模型对于此次新闻推荐的任务也比较适合, 我们在这里通过当前的候选文章与用户历史点击文章的相关性来计算用户对于文章的兴趣。 该模型的结构如下: -![image-20201116201646983](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201116201646983.png) +![image-20201116201646983](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201116201646983.png) 我们这里直接调包来使用这个模型, 关于这个模型的详细细节部分我们会在下一期的推荐系统组队学习中给出。下面说一下该模型如何具体使用:deepctr的函数原型如下: @@ -949,4 +949,4 @@ submit(rank_results, topk=5, model_name='ensumble_staking') 关于Datawhale: Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale: -![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) \ No newline at end of file +![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) \ No newline at end of file diff --git a/docs/ch03/ch3.2/3.2.1.3.md b/docs/ch03/ch3.2/3.2.1.3.md index 2d79c1fbf..9153d9f15 100644 --- a/docs/ch03/ch3.2/3.2.1.3.md +++ 
b/docs/ch03/ch3.2/3.2.1.3.md @@ -20,7 +20,7 @@ sudo apt-get install redis-server 下载完成的结果 -![image-20211030164414594](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164414594.png) +![image-20211030164414594](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164414594.png) **启动Redis服务:** @@ -30,7 +30,7 @@ sudo apt-get install redis-server service redis-server status ``` -![image-20211030164432589](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164432589.png) +![image-20211030164432589](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164432589.png) 检查当前进程,查看redis是否启动。(ps: 可以看到redis服务正在监听6379端口) @@ -38,7 +38,7 @@ service redis-server status ps -aux|grep redis-server ``` -![image-20211030164448713](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164448713.png) +![image-20211030164448713](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164448713.png) 或者进入redis客户端,与服务器进行通信,当输入ping命令,如果返回 PONG 表示Redis已成功安装。 @@ -46,7 +46,7 @@ ps -aux|grep redis-server redis-cli ``` -![image-20211030164455928](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164455928.png) +![image-20211030164455928](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164455928.png) 上面的127.0.0.1 是redis服务器的 IP 地址,6379 是 Redis 服务器运行的端口。 diff --git a/docs/ch03/ch3.2/3.2.1.4.md b/docs/ch03/ch3.2/3.2.1.4.md index 8a74c546e..dc29a96f1 100644 --- a/docs/ch03/ch3.2/3.2.1.4.md +++ b/docs/ch03/ch3.2/3.2.1.4.md @@ -129,7 +129,7 @@ class QuotesSpider(scrapy.Spider): 因为新闻爬取项目和新闻推荐系统是放在一起的,为了方便提前学习,下面直接给出项目的目录结构以及重要文件中的代码实现,最终的项目将会和新闻推荐系统一起开源出来 -image-20211103214124327 +image-20211103214124327 1. 
**创建一个scrapy项目:** @@ -164,7 +164,7 @@ class SinanewsItem(scrapy.Item): 这里需要注意的一点,这里在爬取新闻的时候选择的是一个比较简洁的展示网站进行爬取的,相比直接去最新的新浪新闻观光爬取新闻简单很多,简洁的网站大概的链接:https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1 -image-20211103213354334 +image-20211103213354334 ```python # -*- coding: utf-8 -*- @@ -497,7 +497,7 @@ sh run_scrapy_sina.sh 最终查看数据库中的数据: -image-20211103214611171 +image-20211103214611171 ### 参考资料 diff --git a/docs/ch03/ch3.2/3.2.1.5.md b/docs/ch03/ch3.2/3.2.1.5.md index 4cc60eda6..bd9d70acc 100644 --- a/docs/ch03/ch3.2/3.2.1.5.md +++ b/docs/ch03/ch3.2/3.2.1.5.md @@ -1,4 +1,4 @@ -![image-20211203145147649](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211203145147649.png) +![image-20211203145147649](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211203145147649.png) # 自动化构建用户及物料画像 @@ -19,13 +19,13 @@ 首先说一下新物料添加到物料库的逻辑是什么,新物料添加到物料库这件事情肯定是发生在新闻爬取之后的,然后要将新物料添加到物料库还需要对新物料做一些简单的画像处理,目前我们定义的画像字段如下(处理后的画像存储在Mongodb): -image-20211203150212960 +image-20211203150212960 具体的逻辑就是遍历今天爬取的所有文章,然后通过文章的title来判断这篇文章是否已经在物料库中(新闻网站有可能有些相同的文章会出现在多天)来去重。然后再根据我们定义的一些字段,给画像相应的字段初始化,最后就是存入画像物料池中。 关于旧物料画像的更新,这里就需要先了解一下旧物料哪些字段会被用户的行为更新。下面是新闻列表展示页,我们会发现前端会展示新闻的阅读、喜欢及收藏次数。而用户的交互(阅读、点赞和收藏)会改变这些值。 -image-20211203150835056 +image-20211203150835056 为了能够实时的在前端显示新闻的这些动态行为信息,我们提前将新闻的动态信息存储到了redis中,线上获取的时候是直接从redis中获取新闻的数据,并且如果用户对新闻产生了交互,那么这些动态信息就会被更新,我们也是直接更新redis中的值,这样做主要是为了能够让前端可以实时的获取的新闻最新的动态画像信息。 @@ -175,9 +175,9 @@ if __name__ == "__main__": 上面的内容说完了物料的更新,接下来介绍一下对于更新完的物料是如何添加到redis数据库中去的。关于新闻内容在redis中的存储,我们将新闻的信息拆成了两部分,一部分是新闻不会发生变化的属性(例如,创建时间、标题、新闻内容等),还有一部分是物料的动态属性,在redis中存储的key的标识分别为:static_news_detail:news_id和dynamic_news_detail:news_id 下面是redis中存储的真实内容 -image-20211203153841222 +image-20211203153841222 -image-20211203153958220 +image-20211203153958220 这么做的目的是为了线上实时更改物料动态信息的时候更加高效一点。当需要获取某篇新闻的详细信息的时候需要查这两份数据并将数据这两部分数据拼起来最终才发送给前端展示。这部分的代码逻辑如下: @@ -306,11 +306,11 @@ if __name__ == "__main__": 
由于我们系统中将所有注册过的用户都放到了一个表里面(新、老用户),所以每次更新画像的话只需要遍历一遍注册表中的所有用户。再说具体的画像构建逻辑之前,得先了解一下用户画像中包含哪些字段,下面是直接从mongo中查出来的 -image-20211203163848668 +image-20211203163848668 从上面可以看出,主要是用户的基本信息和用户历史信息相关的一些标签,对于用户的基本属性特征这个可以直接从注册表中获取,那么对于跟用户历史阅读相关的信息,需要统计用户历史的所有阅读、喜欢和收藏的新闻详细信息。为了得到跟用户历史兴趣相关的信息,我们需要对用户的历史阅读、喜欢和收藏这几个历史记录给存起来,其实这些信息都可以从日志信息中获取得到,但是这里有个工程上的事情得先说明一下,先看下面这个图,对于每个用户点进一篇新闻的详情页 -image-20211203164332062 +image-20211203164332062 最底部有个喜欢和收藏,这个前端展示的结果是从后端获取的数据,那就意味着后端需要维护一个用户历史点击及收藏过的文章列表,这里我们使用了mysql来存储,主要是怕redis不够用。其实这两个表不仅仅可以用来前端展示用的,还可以用来分析用户的画像,这都给我们整理好了用户历史喜欢和收藏了。 @@ -622,7 +622,7 @@ echo " " **crontab定时任务:** -![image-20211203172613512](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211203172613512.png) +![image-20211203172613512](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211203172613512.png) 将定时任务拆解一下: diff --git a/docs/ch03/ch3.2/3.2.2.3.md b/docs/ch03/ch3.2/3.2.2.3.md index e251e6515..e736c0fa1 100644 --- a/docs/ch03/ch3.2/3.2.2.3.md +++ b/docs/ch03/ch3.2/3.2.2.3.md @@ -6,7 +6,7 @@ 下面主要展现的是项目的整体部分,主要分为推荐页,热门页以及新闻详情页。 -image-20211203154557244image-20211203155028564image-20211203155058020 +image-20211203154557244image-20211203155028564image-20211203155058020 diff --git a/docs/ch03/ch3.2/3.2.3.md b/docs/ch03/ch3.2/3.2.3.md index 3beac83f1..b2cce14f1 100644 --- a/docs/ch03/ch3.2/3.2.3.md +++ b/docs/ch03/ch3.2/3.2.3.md @@ -1,6 +1,6 @@ -![](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片Untitled.png) +![](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片Untitled.png) 本篇文章主要是讲解推荐系统流程构建,主要包括Offline和Online两个部分。 diff --git a/docs/ch03/ch3.2/3.2.4.3.md b/docs/ch03/ch3.2/3.2.4.3.md index 957e4719c..8da93411b 100644 --- a/docs/ch03/ch3.2/3.2.4.3.md +++ b/docs/ch03/ch3.2/3.2.4.3.md @@ -12,7 +12,7 @@ DSSM(Deep Structured Semantic Model)是由微软研究院于CIKM在2013年提出 ### **DSSM 模型结构** -![image-20220224100424897](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20220224100424897.png) +![image-20220224100424897](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20220224100424897.png) 
上图是DSSM模型的结构,该网络结构比较简单,是一个由几层DNN组成网络,我们将要搜索文本(Query)和要匹配的文本(Document)的 embedding 输入到网络,网络输出为 128 维的向量,然后通过向量之间计算余弦相似度来计算向量之间距离,可以看作每一个 query 和 document 之间相似分数,然后在做 softmax。 @@ -28,7 +28,7 @@ DSSM(Deep Structured Semantic Model)是由微软研究院于CIKM在2013年提出 该模型主要是将上述模型中的两个“塔”改为独立的 user 和 item 两个子网络,大概结构如下: -![img](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片v2-f7ecbf1faf7899c6e2999182055470fb_720w.jpg) +![img](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片v2-f7ecbf1faf7899c6e2999182055470fb_720w.jpg) 其结构非常简单,如上图所示,左侧是用户塔,右侧是Item塔。在用户侧结构中,其输入为用户侧特征(用户画像信息、统计属性以及历史行为序列等);在用户侧结构中,其输入为Item相关特征(Item基本信息、属性信息等)。对于这两个塔本身,则是经典的DNN模型,在训练过程中,其输入由特征OneHot到特征Embedding,再经过几层DNN隐层,两个塔分别输出user embedding和item embedding,最后这两个embedding做内积或者Cosine相似度计算,使得user和item在embedding映射到共同维度的语义空间中。 @@ -38,7 +38,7 @@ DSSM(Deep Structured Semantic Model)是由微软研究院于CIKM在2013年提出 该模型主要的改进是在user塔和Item塔的特征Embedding层上,各自加入一个SENet模块,借助SENet网络用来动态地学习特征的重要性,根据得到的特征权重与对应特征的embedding相乘,进而达到放大重要特征或抑制无效特征的目的,模型大致结构如下所示: -![img](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片v2-8766fee1b442ed17111d5822033f960f_720w.jpg) +![img](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片v2-8766fee1b442ed17111d5822033f960f_720w.jpg) 其模型和朴素DSSM模型的区别在于多加了一个SENet网络,该网络主要是将特征的 embedding 通过 Squeeze 和Excitation 两个阶段得到一个权重向量,在用该向量与特征的embeding对应为相乘,挑选出最要特征之后在进入到朴素的DSSM网络中。 而 SENet 之所以起作用的原因,张俊林老师的解释是 SENet 可以突出那些对高层 User embedding 和 Item embedding 的特征交叉起重要作用的特征,更有利于表达两侧的特征交互,避免单侧无效特征经过DNN双塔非线性融合时带来的噪声,同时又带有非线性的作用。关于SENet网络详细内容可以查看[原文](https://arxiv.org/abs/1709.01507) @@ -48,7 +48,7 @@ DSSM(Deep Structured Semantic Model)是由微软研究院于CIKM在2013年提出 该模型是Youtube于2019年在RecSys发表的一篇工作,这个模型从结构上来看是最普通的双塔。左边是user塔,输入包括两部分,第一部分是user当前正在观看的视频的特征,第二部分user的特征是用户历史行为的统计量,例如用户最近观看的N条视频的id embedding均值,这两部分融合起来一起输入user侧的输入。右边是item塔,将候选视频的特征作为输入,计算item的 embedding。之后也是再计算两侧embedding的相似度,进行学习。 模型的大致结构如下所示: -![image-20220224100307472](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20220224100307472.png) 
+![image-20220224100307472](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20220224100307472.png) 对于该模型,重点并不在于结构上的改变,而是对于负采样问题。因为召回的过程可以被视为是一个多分类问题,模型的输出层选择softmax计算后再计算交叉熵损失。但问题是当候选item特别多的时候,无法对所有的item进行softmax,因此通常的做法是随机从全量item中采样出一个batch的item进行softmax。但是使用batch内的样本作为彼此负样本会带来非常大的偏置问题,即对于热门的样本,被当作负样本的概率更高,因此该模型的贡献在于如何减小batch内负采样所带来的偏置问题? 关于paper的详细内容可以查看[原文](https://dl.acm.org/doi/10.1145/3298689.3346996) diff --git a/docs/ch03/ch3.2/3.2.8.1.md b/docs/ch03/ch3.2/3.2.8.1.md index 15c96e8c0..58553a9a9 100644 --- a/docs/ch03/ch3.2/3.2.8.1.md +++ b/docs/ch03/ch3.2/3.2.8.1.md @@ -45,30 +45,30 @@ - 问:在执行`Scrapy`进行新闻爬取实战的时候,写不进去`mongdb`数据库
- image-20210308142624189 + image-20210308142624189
- image-20210308142624189 + image-20210308142624189
答:`mongodb`安装是否成功?有没有报错之类的。 问:成功安装。爬虫已经成功,我看`title content`已经有数据了
- image-20210308142624189 + image-20210308142624189
答:你这里是不是什么都没有,你退出`mongo`命令行重新进入查看一下呢?
- image-20210308142624189 + image-20210308142624189
问:对,我是在`windows`下做的,还是没有
- image-20210308142624189 + image-20210308142624189
答:你看下这个路径是不是有问题,我这里好像忘记改成`fun-rec`的路径了,你改成`fun-rec`下的路径再试试,有可能这里没有的参数没有导入进去。 @@ -86,13 +86,13 @@ 答:不过应该不影响,代码你是自己单独写呢?还是运行的`fun-rec`下的`code代`码?你检查下pipline下面,看参数配置是否有问题,写一点print查看一下,然后在这里单独使用`insert`方法插入点东西查看是否有问题。
- image-20210308142624189 + image-20210308142624189
问(解决):找到问题了,在`copy piplines`文件的时候,`def`类没有对齐。
- image-20210308142624189 + image-20210308142624189
- 问:`linux`一般软件安装都放在哪个目录下面啊?是`usr/local`吗? @@ -125,16 +125,16 @@ - 问:服务没启动问题
- image-20210308142624189 + image-20210308142624189
答:对,需要安装,启动这个服务,已经加入到文档中。
- image-20210308142624189 + image-20210308142624189
- image-20210308142624189 + image-20210308142624189
- 问:`redis key`的问题如何处理? @@ -278,7 +278,7 @@ - 问:运行后端`server`遇到过这个报错吗?
- image-20210308142624189 + image-20210308142624189
答:重新安装下`cryptography`这个包 diff --git a/docs/ch03/ch3.2/3.2.8.2.md b/docs/ch03/ch3.2/3.2.8.2.md index 6ea8ce49b..940d6fec3 100644 --- a/docs/ch03/ch3.2/3.2.8.2.md +++ b/docs/ch03/ch3.2/3.2.8.2.md @@ -26,7 +26,7 @@ - 问:请问这个报错是缺少什么?
- image-20210308142624189 + image-20210308142624189
答:需要下载`drive`驱动才可以正常运行。 @@ -42,7 +42,7 @@ 问:应该是有
- image-20210308142624189 + image-20210308142624189
@@ -67,7 +67,7 @@ - 问:`python process material.py`需要`redis`验证怎么解决,有没有除了取消密码之外的解决方式。
- image-20210308142624189 + image-20210308142624189
答:估计是设置了`redis`的用户和密码,这个没有办法,只能取消密码。或者修改代码,连接`redis` @@ -98,7 +98,7 @@ 答:修改此处代码。
- image-20210308142624189 + image-20210308142624189
diff --git a/docs/ch03/ch3.2/3.2.8.3.md b/docs/ch03/ch3.2/3.2.8.3.md index 8d82a64ff..d05688ff3 100644 --- a/docs/ch03/ch3.2/3.2.8.3.md +++ b/docs/ch03/ch3.2/3.2.8.3.md @@ -18,7 +18,7 @@ - 问:请问这样处理会不会时间复杂度较大?
- image-20210308142624189 + image-20210308142624189
答:不容易吧,爬取的文章判断重复怎么用`id`啊?如果式唯一性`id`必然是跟时间相关的。 @@ -32,7 +32,7 @@ - 问:请教下大家,正常这两个`col`的大小是不是一样的?
- image-20210308142624189 + image-20210308142624189
答:不是一样大,你看一下具体内容就知道了, @@ -50,7 +50,7 @@ 答:这一步
- image-20210308142624189 + image-20210308142624189
@@ -65,7 +65,7 @@ 问:`update_redis_mongo_protrail_data`这个函数是遍历`material_collection`,也就是`mongo_server.get_feature_protrail_collection()`也就是`featureprotrail`应该是和`featureprotrail`一样多的。
- image-20210308142624189 + image-20210308142624189
答:理解一样多没有问题,后面会修改。 @@ -74,7 +74,7 @@ - 问:用户的喜欢,收藏,点击是直接落到`mysql`里面吗?
- image-20210308142624189 + image-20210308142624189
答:是的,前端点击阅读、喜欢、收藏会实时更新。 @@ -83,7 +83,7 @@ - 问:这个关键词属于长尾是什么意思?
- image-20210308142624189 + image-20210308142624189
答:个别关键词的类别占了大量数目,以至于前三一直是那几个,长尾现象。 @@ -93,7 +93,7 @@ - 问:请教下大家,这个`user_exposure.py`是用来建`exposure_日期`这个表的么
- image-20210308142624189 + image-20210308142624189
答:是的。 \ No newline at end of file diff --git a/docs/ch03/ch3.2/3.2.md b/docs/ch03/ch3.2/3.2.md index bb37dc334..e73b904fd 100644 --- a/docs/ch03/ch3.2/3.2.md +++ b/docs/ch03/ch3.2/3.2.md @@ -11,7 +11,7 @@ **新闻推荐系统实践前端展示和后端逻辑(项目没有任何商用价值仅供入门者学习)**
- image-20211205142026937 - Fun-Rec新闻推荐系统 + image-20211205142026937 + Fun-Rec新闻推荐系统
diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.2 \351\200\273\350\276\221\345\233\236\345\275\222.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.2 \351\200\273\350\276\221\345\233\236\345\275\222.md" index b3abf16d1..af3404357 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.2 \351\200\273\350\276\221\345\233\236\345\275\222.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.2 \351\200\273\350\276\221\345\233\236\345\275\222.md" @@ -8,7 +8,7 @@ f(x)=\frac{1}{1+e^{-x}} $$ **sigmoid函数图像:** -![Sigmoid_function](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片Sigmoid_function.png) +![Sigmoid_function](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片Sigmoid_function.png) 由于 sigmoid 函数的定义域是 $(-∞,+∞)$,而值域为 $(0, 1)$。Logistic 回归通过 sigmoid 联结函数可以将变量映射到 $ (0, 1) $ 之间,这也是为什么最基本的 LR 分类器适合于对二分类(类 0,类 1)目标进行分类。 @@ -324,7 +324,7 @@ $$ 关于该模型的详细原理和实现,可以参考资料[4]。 -![Sigmoid_function_01](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片gbdt_lr.png) +![Sigmoid_function_01](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片gbdt_lr.png) diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.3 \347\245\236\347\273\217\347\275\221\347\273\234.md" "b/docs/\347\254\254\344\270\200\347\253\240 
\346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.3 \347\245\236\347\273\217\347\275\221\347\273\234.md" index 0f2f9c426..e037e909c 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.3 \347\245\236\347\273\217\347\275\221\347\273\234.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.3 \347\245\236\347\273\217\347\275\221\347\273\234.md" @@ -12,7 +12,7 @@ + 轴突(Axon)可以把自身的兴奋状态从胞体传送到另一个神经元或其他组织,每个神经元只有一个轴突; -image-20211210174234519 +image-20211210174234519 神经元可以接收其他神经元的信息,也可以发送信息给其他神经元。神经元之间没有物理连接,两个“连接”的神经元之间留有 20 纳米左右的缝隙,并靠突触进行互联来传递信息,形成一个神经网络,即神经系统。 @@ -30,7 +30,7 @@ 1943 年,心理学家 McCulloch 和数学家 Pitts 根据生物神经元的结构,提出了一种非常简单的神经元模型,MP神经元。现代神经网络中的神经元和 MP 神经元的结构并无太多变化。不同的是,MP 神经元中的激活函数 $f$ 为 $0$ 或 $1$ 的阶跃函数,而现代神经元中的激活函数通常要求是连续可导的函数。 -MP模型 +MP模型 假设一个神经元接收到了 $n$ 个输入 $x_1, ... 
,x_n$,令向量 $x=[x_1;x_2;...;x_n]$ 来表示这组输入,并用净输出 $z \in \mathbb{R}$ 表示一个神经元所获得的输入信号 $x$ 的加权和: @@ -54,7 +54,7 @@ $$ 理想中的激活函数为阶跃函数,它可以将输入值映射为 $0$ 或 $1$ ,这里 $1$ 对应神经元兴奋, $0$ 对应神经元抑制。但是阶跃函数具有不连续,不光滑等不太好的性质。 -![img](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片threshold.png) +![img](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片threshold.png) 为了增强网络的表示能力和学习能力,激活函数需要具备以下几点性质: @@ -72,7 +72,7 @@ $$ 常用的 Sigmoid 函数有 Logistic 函数和 Tanh 函数,它们的形状都呈 S 型,均为两端饱和函数。所谓两端饱和,指的是当变量 $x$ 趋无穷时,函数 $f(x)$ 的导数 $f'(x)$ 趋向于 $0$ 。 -image-20211210185902605 +image-20211210185902605 + Logistic 函数定义: $$ @@ -155,7 +155,7 @@ $$ $$ 其中, $\boldsymbol{W}^{(l)} \in \mathbb{R}^{N_{l-1} \times N_{l}}$ 为第 $l$ 层的参数矩阵,$\boldsymbol{b}^{(l)}\in \mathbb{R}^{N_{l}}$ 为第 $l$ 层的偏置向量,$f_l$ 为第 $l$ 层的激活函数,$\boldsymbol{a}^{(l)}\in \mathbb{R}^{N_{l}}$为第 $l$ 层的输出。示例如下: -image-20211210200057126 +image-20211210200057126 这样,前馈神经网络可以通过逐层进行信息传递,得到网络最后的输出 $a^{L}$: $$ @@ -172,7 +172,7 @@ $$ 假设存在 $y=f(x)$ 的计算,则该计算的反向传播如下: -![image-20211210204402301](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211210204402301.png) +![image-20211210204402301](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211210204402301.png) 反正传播的计算顺序是,将信号 $E$ 乘以节点的局部导数 $\frac{\partial y}{\partial x}$, 然后将结果传递给下一个节点。 @@ -197,9 +197,9 @@ $$ $$ 将链式法则用计算图表示,如下: -image-20211212115850464 +image-20211212115850464 -image-20211212115902802 +image-20211212115902802 ### 2.4.3 反向传播 @@ -231,13 +231,13 @@ y=\frac{1}{1+exp(-x)} $$ 用计算图可以表示为: -image-20211212121646115 +image-20211212121646115 可以看到,复杂的公式经过拆解,已经没那么复杂了。现在,按照前面总结的反向传播规则,可以得到加上反向传播后的计算图: -image-20211212121715218 +image-20211212121715218 + 以最后一个运算 $/$ (除法)为例,令输入 $t=1+exp(-x)$,则输出 $y=\frac{1}{t}$,有: $$ @@ -262,7 +262,7 @@ $$ 对于一些层数较深的神经网络模型,在训练时可能会出现一些问题,其中就包括梯度消失问题(gradient vanishing problem)和梯度爆炸问题(gradient exploding problem)。梯度消失问题和梯度爆炸问题一般随着网络层数的增加会变得越来越明显。 -![preview](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片v2-82873a89ff3c14c1d3b42d1862917f35_r.jpg) 
+![preview](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片v2-82873a89ff3c14c1d3b42d1862917f35_r.jpg) 我们知道前馈神经网络的传播公式如下: $$ @@ -304,7 +304,7 @@ $$ 根据Sigmoid 函数的表达式,其函数图像(左)及其导数图像(右)如下: -深度学习中的激活函数 +深度学习中的激活函数 可以看出,Sigmoid 函数的导数取值范围在 $(0, 0.25]$ 之间,而权重矩阵在初始化时通常 $|| \boldsymbol{W}||<1$,则有 $|| f'(\cdot) \times \boldsymbol{W}||\le 0.25 $ 。由链式法则可得,由于连乘效应,梯度 $\large \frac{\partial \boldsymbol{L}}{\partial \boldsymbol{W}^{(0)}}$ 会越来越小,从而引发梯度消失的问题。 @@ -326,7 +326,7 @@ Youtube 作为全球最大的 UGC 的视频网站,需要在百万量级的视 -image-20211214212744049 +image-20211214212744049 从上图可以看出,YoutubeDNN 包含了两个阶段分别为: @@ -341,7 +341,7 @@ Youtube 作为全球最大的 UGC 的视频网站,需要在百万量级的视 -image-20211214212755758 +image-20211214212755758 从模型的结构来看,召回阶段使用的模型并不复杂,为包含多层神经网络的 DNN 模型。下面简单分析模型的流程: @@ -375,7 +375,7 @@ Youtube 作为全球最大的 UGC 的视频网站,需要在百万量级的视 -image-20211214212806574 +image-20211214212806574 可以看出,该阶段使用的模型与前面召回阶段相同,均为 DNN 模型。不同的是: @@ -406,7 +406,7 @@ Wide&Deep是谷歌发表在 DLRS 2016 上的文章《Wide & Deep Learning for Wide & Deep 已成功应用到了 Google Play 的 app 推荐业务,具体的模型结构如下: -![image-20211214203653752](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211214203653752.png) +![image-20211214203653752](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211214203653752.png) 从结构图上看,Wide&Deep 由两部分组成,分别为 Wide 部分和 Deep 部分。简单来说,Wide 部分就是一个线性层,Deep 部分为多层前馈神经网络层,下面先对原理进行介绍: @@ -465,7 +465,7 @@ $$ 下图,是谷歌在应用商店中的推荐模型架构: -image-20211214212332877 +image-20211214212332877 + Deep 部分的输入是全量的特征向量,包括用户年龄(Age)、已安装应用数量(#App Installs)、设备类型(Device Class)、已安装应用(User Installed App)、曝光应用( Impression App)等特征。已安装应用、曝光应用等类别型特征,需要经过Embedding层输入连接层(Concatenated Embedding),拼接成1200维的Embedding向量,再依次经过3层ReLU全连接层,最终输入LogLoss输出层。 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.4 \345\270\270\347\224\250\344\274\230\345\214\226\347\256\227\346\263\225.md" "b/docs/\347\254\254\344\270\200\347\253\240 
\346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.4 \345\270\270\347\224\250\344\274\230\345\214\226\347\256\227\346\263\225.md" index 3abc29527..72199db46 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.4 \345\270\270\347\224\250\344\274\230\345\214\226\347\256\227\346\263\225.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.4 \345\270\270\347\224\250\344\274\230\345\214\226\347\256\227\346\263\225.md" @@ -12,7 +12,7 @@ $$ \theta x+(1-\theta) y \in S $$ -![convex_set](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片convex_set.png) +![convex_set](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片convex_set.png) ### 1.1.2 凸函数 @@ -22,7 +22,7 @@ f(\lambda x+(1-\lambda)y) \le \lambda f(x)+(1-\lambda)f(y) $$ 直观上来看,对于 $z \in (x, y)$,对应的坐标点 $\left (z,f(z)\right)$ 的位置,均处于点 $(x, f(x))$ 和点 $(y,f(y))$ 连接成的线段的下方。 -![convex_func](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片convex_func.png) +![convex_func](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片convex_func.png) 凸函数具有一个重要的性质: **局部极小值点为全局极小值点**。 @@ -169,7 +169,7 @@ $$ $$ 一旦达到收敛条件的话,迭代就结束。从梯度下降法的迭代公式来看,下一个点的选择与当前点的位置和它的梯度相关。 -![optimization](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片optimization.gif) +![optimization](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片optimization.gif) 不同的优化算法,由于优化目标函数时有着不同的出发点,所以函数在寻找局部极小值点的时对应的轨迹也有所不同。 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.5 
\346\267\261\345\272\246\345\255\246\344\271\240\346\250\241\345\236\213\346\220\255\345\273\272\345\237\272\347\241\200.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.5 \346\267\261\345\272\246\345\255\246\344\271\240\346\250\241\345\236\213\346\220\255\345\273\272\345\237\272\347\241\200.md" index b3b5025c7..453b5aee1 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.5 \346\267\261\345\272\246\345\255\246\344\271\240\346\250\241\345\236\213\346\220\255\345\273\272\345\237\272\347\241\200.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.5 \346\267\261\345\272\246\345\255\246\344\271\240\346\250\241\345\236\213\346\220\255\345\273\272\345\237\272\347\241\200.md" @@ -122,7 +122,7 @@ model = keras.Model( keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True) ``` -![image-20210226174510287](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20210226174510287.png) +![image-20210226174510287](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20210226174510287.png) 从上面这个图就可以看出,模型多输入,多输出,共享层的结构,并且也会发现搭建的过程也是非常的简单。 @@ -134,7 +134,7 @@ keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=Tr 先说答案:将输入的数据转换成字典的形式,定义输入层的时候让输入层的name和字典中特征的key一致,就可以使得输入的数据和对应的Input层对应,后面搭建模型就是和上面介绍的一样的了。 -image-20210226175546548 +image-20210226175546548 直接看个例子吧: @@ -176,7 +176,7 @@ model.fit(x, y, batch_size=1, epochs=2, validation_split=0.2) keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=True) ``` -image-20210226181357167 +image-20210226181357167 
上面就是举了个简单的例子说明,当多输入特别多的时候,构建模型我们可以将数据转换成字典的形式,然后字典中特征的名称与其对应的Input层的名称一致就行,这里是为了后面搭建复杂模型打基础。 @@ -184,7 +184,7 @@ keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=Tr 相信大家对DeepCTR开源项目应该是有点了解,DeepCTR通过对现有的基于深度学习的点击率预测模型的结构进行抽象总结,在设计过程中采用模块化的思路,各个模块自身具有高复用性,各个模块之间互相独立。 基于深度学习的点击率预测模型按模型内部组件的功能可以划分成以下4个模块:输入模块,嵌入模块,特征提取模块,预测输出模块。关于DeepCTR的介绍可以参考这个文章[DeepCTR:易用可扩展的深度学习点击率预测算法包](https://zhuanlan.zhihu.com/p/53231955) -image-20210221193056946 +image-20210221193056946 这个开源项目做的非常好反而不是特别适合初学者学习,但是又非常适合推荐系统领域的小白去学习,所以本次内容设计我们借鉴了DeepCTR的设计思想,复现课程中的代码,复现的代码中包含了大量的注释,使得学习者在了解了上述所说的函数式API构建模型的基础上,快速看懂源码的设计,以及模型的原理。下面主要说一下我们代码参考DeepCTR项目实现需要注意的几个点。 @@ -232,7 +232,7 @@ keras.utils.plot_model(model, "multi_input_and_output_model.png", show_shapes=Tr 上面在说了类别特征和可变长的序列特征,在这两个Input层之后都需要将其转化成Embedding向量或者Embedding矩阵,在keras中转化成Embedding向量和Embedding矩阵只是相差一个参数的问题 -image-20210226191552184 +image-20210226191552184 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.6 Word2vec.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.6 Word2vec.md" index 2ed19ec8b..2883242cf 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.6 Word2vec.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.0 \346\234\272\345\231\250\345\255\246\344\271\240\345\237\272\347\241\200/1.0.6 Word2vec.md" @@ -56,7 +56,7 @@ one-hot向量的维度是词汇表的大小(如:500,000) 如果我们可以使用某种方法为每个单词构建一个合适的dense vector,如下图,那么通过点积等数学计算就可以获得单词之间的某种联系
-在这里插入图片描述 +在这里插入图片描述
# Word2vec @@ -71,7 +71,7 @@ one-hot向量的维度是词汇表的大小(如:500,000) 我们先引入上下文context的概念:当单词 w 出现在文本中时,其**上下文context**是出现在w附近的一组单词(在固定大小的窗口内),如下图
-在这里插入图片描述 +在这里插入图片描述
这些上下文单词context words决定了banking的意义 @@ -97,13 +97,13 @@ Word2vec包含两个模型,**Skip-gram与CBOW**。下面,我们先讲**Skip- 下图展示了以“into”为中心词,窗口大小为2的情况下它的上下文词。以及相对应的$P(o|c)$
-在这里插入图片描述 +在这里插入图片描述
我们滑动窗口,再以banking为中心词
- +
那么,如果我们在整个语料库上不断地滑动窗口,我们可以得到所有位置的$P(o|c)$,我们希望在所有位置上**最大化单词o在单词c周围出现了这一事实**,由极大似然法,可得: @@ -115,13 +115,13 @@ $$ 此式还可以依图3写为:
-在这里插入图片描述 +在这里插入图片描述
加log,加负号,缩放大小可得:
-在这里插入图片描述 +在这里插入图片描述
上式即为**skip-gram的损失函数**,最小化损失函数,就可以得到合适的词向量 @@ -141,7 +141,7 @@ $$ 又P(o|c)是一个概率,所以我们在整个语料库上使用**softmax**将点积的值映射到概率,如图6
-在这里插入图片描述 +在这里插入图片描述
注:注意到上图,中心词词向量为$v_{c}$,而上下文词词向量为$u_{o}$。也就是说每个词会对应两个词向量,**在词w做中心词时,使用$v_{w}$作为词向量,而在它做上下文词时,使用$u_{w}$作为词向量**。这样做的原因是为了求导等操作时计算上的简便。当整个模型训练完成后,我们既可以使用$v_{w}$作为词w的词向量,也可以使用$u_{w}$作为词w的词向量,亦或是将二者平均。在下一部分的模型结构中,我们将更清楚地看到两个词向量究竟在模型的哪个位置。 @@ -153,7 +153,7 @@ $$ ## Word2vec模型结构
-在这里插入图片描述 +在这里插入图片描述
如图八所示,这是一个输入为1 X V维的one-hot向量(V为整个词汇表的长度,这个向量只有一个1值,其余为0值表示一个词),单隐藏层(**隐藏层的维度为N,这里是一个超参数,这个参数由我们定义,也就是词向量的维度**),输出为1 X V维的softmax层的模型。 @@ -175,13 +175,13 @@ $W^{I}$为V X N的参数矩阵,$W^{O}$为N X V的参数矩阵。 如上文所述,Skip-gram为给定中心词,预测周围的词,即求P(o|c),如下图所示:
-在这里插入图片描述 +在这里插入图片描述
而CBOW为给定周围的词,预测中心词,即求P(c|o),如下图所示:
-在这里插入图片描述 +在这里插入图片描述
@@ -194,7 +194,7 @@ $W^{I}$为V X N的参数矩阵,$W^{O}$为N X V的参数矩阵。 我们再看一眼,通过softmax得到的$P(o|c)$,如图:
-在这里插入图片描述 +在这里插入图片描述
@@ -209,7 +209,7 @@ $W^{I}$为V X N的参数矩阵,$W^{O}$为N X V的参数矩阵。 我们首先给出负采样的损失函数:
-在这里插入图片描述 +在这里插入图片描述
diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.1 \346\246\202\350\277\260.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.1 \346\246\202\350\277\260.md" index 3c75e7052..45bbe183e 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.1 \346\246\202\350\277\260.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.1 \346\246\202\350\277\260.md" @@ -7,8 +7,8 @@ **传统推荐系统及深度学习推荐系统的演化关系图(图来自《深度学习推荐系统》)**
-image-20200923143443499 -image-20200923143559968 +image-20200923143443499 +image-20200923143559968
传统推荐系统(左),深度学习推荐系统(右)
@@ -119,7 +119,7 @@ 在讲AUC前需要理解混淆矩阵,召回率,精确率,ROC曲线等概念 - img + img TP:真的真了(真实值是真的,预测也是真) @@ -136,7 +136,7 @@ $$ ROC(**Receiver Operating Characteristic Curve**)曲线: - img + img ROC曲线的横坐标为假阳性率(False Positive Rate, FPR),N是真实负样本的个数, FP是N个负样本中被分类器预测为正样本的个数。 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.2 \345\215\217\345\220\214\350\277\207\346\273\244-UserCF.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.2 \345\215\217\345\220\214\350\277\207\346\273\244-UserCF.md" index 2a34353e3..72a3c8617 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.2 \345\215\217\345\220\214\350\277\207\346\273\244-UserCF.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.2 \345\215\217\345\220\214\350\277\207\346\273\244-UserCF.md" @@ -99,13 +99,13 @@ + 例如,我们要对用户 $A$ 进行物品推荐,可以先找到和他有相似兴趣的其他用户。 + 然后,将共同兴趣用户喜欢的,但用户 $A$ 未交互过的物品推荐给 $A$。 -image-20210629232540289 +image-20210629232540289 ## 计算过程 以下图为例,给用户推荐物品的过程可以形象化为一个猜测用户对物品进行打分的任务,表格里面是5个用户对于5件物品的一个打分情况,就可以理解为用户对物品的喜欢程度。 -![image-20210629232622758](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20210629232622758.png) +![image-20210629232622758](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20210629232622758.png) UserCF算法的两个步骤: @@ -164,7 +164,7 @@ UserCF算法的两个步骤: + 基于 sklearn 计算所有用户之间的皮尔逊相关系数。可以看出,与 Alice 相似度最高的用户为用户1和用户2。 - 图片 + 图片 2. 
**根据相似度用户计算 Alice对物品5的最终得分** 用户1对物品5的评分是3, 用户2对物品5的打分是5, 那么根据上面的计算公式, 可以计算出 Alice 对物品5的最终得分是 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.3 \345\215\217\345\220\214\350\277\207\346\273\244-ItemCF.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.3 \345\215\217\345\220\214\350\277\207\346\273\244-ItemCF.md" index 14c97119d..97ff8df44 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.3 \345\215\217\345\220\214\350\277\207\346\273\244-ItemCF.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/1.1.3 \345\215\217\345\220\214\350\277\207\346\273\244-ItemCF.md" @@ -9,13 +9,13 @@ 举例来说,如果用户 1 喜欢物品 A ,而物品 A 和 C 非常相似,则可以将物品 C 推荐给用户1。ItemCF算法并不利用物品的内容属性计算物品之间的相似度, 主要通过分析用户的行为记录计算物品之间的相似度, 该算法认为, 物品 A 和物品 C 具有很大的相似度是因为喜欢物品 A 的用户极可能喜欢物品 C。 -![图片](http://ryluo.oss-cn-chengdu.aliyuncs.com/JavagdvaYX0HSW4PdssV.png!thumbnail) +![图片](https://ryluo.oss-cn-chengdu.aliyuncs.com/JavagdvaYX0HSW4PdssV.png!thumbnail) ## 计算过程 基于物品的协同过滤算法和基于用户的协同过滤算法很像, 所以我们这里直接还是拿上面 Alice 的那个例子来看。 -![图片](http://ryluo.oss-cn-chengdu.aliyuncs.com/JavaE306yXB4mGmjIxbn.png!thumbnail) +![图片](https://ryluo.oss-cn-chengdu.aliyuncs.com/JavaE306yXB4mGmjIxbn.png!thumbnail) 如果想知道 Alice 对物品5打多少分, 基于物品的协同过滤算法会这么做: @@ -41,7 +41,7 @@ 2. 基于 `sklearn` 计算物品之间的皮尔逊相关系数: -图片 +图片 3. 
根据皮尔逊相关系数, 可以找到与物品5最相似的2个物品是 item1 和 item4, 下面基于上面的公式计算最终得分: @@ -196,7 +196,7 @@ $$ 比如下面这个例子: -![图片](http://ryluo.oss-cn-chengdu.aliyuncs.com/JavaxxhHm3BAtMfsy2AV.png!thumbnail) +![图片](https://ryluo.oss-cn-chengdu.aliyuncs.com/JavaxxhHm3BAtMfsy2AV.png!thumbnail) + 左边矩阵中,$A, B, C, D$ 表示的是物品。 + 可以看出,$D $ 是一件热门物品,其与 $A、B、C$ 的相似度比较大。因此,推荐系统更可能将 $D$ 推荐给用过 $A、B、C$ 的用户。 @@ -242,7 +242,7 @@ $$ > > 举例来说明,如下图(`X,Y,Z` 表示物品,`d,e,f`表示用户): > -> ![图片](http://ryluo.oss-cn-chengdu.aliyuncs.com/JavaWKvITKBhYOkfXrzs.png!thumbnail) +> ![图片](https://ryluo.oss-cn-chengdu.aliyuncs.com/JavaWKvITKBhYOkfXrzs.png!thumbnail) > > + 如果使用余弦相似度进行计算,用户 d 和 e 之间较为相似。但是实际上,用户 d 和 f 之间应该更加相似。只不过由于 d 倾向于打高分,e 倾向于打低分导致二者之间的余弦相似度更高。 > + 这种情况下,可以考虑使用皮尔逊相关系数计算用户之间的相似性关系。 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/readme.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/readme.md" index 4a7245455..8deeac97c 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/readme.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.1 \345\237\272\347\241\200\346\216\250\350\215\220\347\256\227\346\263\225/readme.md" @@ -19,11 +19,11 @@ 传统推荐系统: -![](http://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20200923143443499.png) +![](https://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20200923143443499.png) 深度学习推荐系统: -![](http://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20200923143559968.png) +![](https://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20200923143559968.png) **本开源内容的目标是掌握以下算法:** diff 
--git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.1 NeuralCF.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.1 NeuralCF.md" index 1a110dc75..1658781ea 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.1 NeuralCF.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.1 NeuralCF.md" @@ -86,11 +86,11 @@ def NCF(dnn_feature_columns): 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播。 -![image-20210307191533086](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20210307191533086.png) +![image-20210307191533086](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20210307191533086.png) 下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。 -![NCF](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片NCF.png) +![NCF](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片NCF.png) diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.10 DIEN.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.10 DIEN.md" index cc24dde81..d2e865ef6 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 
\346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.10 DIEN.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.10 DIEN.md" @@ -8,7 +8,7 @@ DIN模型考虑了用户兴趣,并且强调用户兴趣是多样的,该模 ## 2. DIEN模型原理 -image-20210218155901144 +image-20210218155901144 模型的输入可以分成两大部分,一部分是用户的行为序列(这部分会通过兴趣提取层及兴趣演化层转换成与用户当前兴趣相关的embedding),另一部分就是除了用户行为以外的其他所有特征,如Target id, Coontext Feature, UserProfile Feature,这些特征都转化成embedding的类型然后concat在一起(形成一个大的embedding)作为非行为相关的特征(这里可能也会存在一些非id类特征,应该可以直接进行concat)。最后DNN输入的部分由行为序列embedding和非行为特征embedding(多个特征concat到一起之后形成的一个大的向量)组成,将两者concat之后输入到DNN中。 @@ -26,11 +26,11 @@ DIN模型考虑了用户兴趣,并且强调用户兴趣是多样的,该模 首先需要明确的就是辅助损失是计算哪两个量的损失。计算的是用户每个时刻的兴趣表示(GRU每个时刻输出的隐藏状态形成的序列)与用户当前时刻实际点击的物品表示(输入的embedding序列)之间的损失,相当于是行为序列中的第t+1个物品与用户第t时刻的兴趣表示之间的损失**(为什么这里用户第t时刻的兴趣与第t+1时刻的真实点击做损失呢?我的理解是,只有知道了用户第t+1真实点击的商品,才能更好的确定用户第t时刻的兴趣)。** -image-20210218163742638 +image-20210218163742638 当然,如果只计算用户点击物品与其点击前一次的兴趣之间的损失,只能认为是正样本之间的损失,那么用户第t时刻的兴趣其实还有很多其他的未点击的商品,这些未点击的商品就是负样本,负样本一般通过从用户点击序列中采样得到,这样一来辅助损失中就包含了用户某个时刻下的兴趣及与该时刻兴趣相关的正负物品。所以最终的损失函数表示如下。 -image-20210218162447125 +image-20210218162447125 其中$h_t^i$表示的是用户$i$第$t$时刻的隐藏状态,可以表示用户第$t$时刻的兴趣向量,$e_b^i,\hat{e_b^i}$分别表示的是正负样本,$e_b^i[t+1]$表示的是用户$i$第$t+1$时刻点击的物品向量。 @@ -61,7 +61,7 @@ $$ 由于用户的兴趣是多样的,但是用户的每一种兴趣都有自己的发展过程,即使兴趣发生漂移我们可以只考虑用户与target item(广告或者商品)相关的兴趣演化过程,这样就不用考虑用户多样化的兴趣的问题了,而如何只获取与target item相关的信息,作者使用了与DIN模型中提取与target item相同的方法,来计算用户历史兴趣与target item之间的相似度,即这里也使用了DIN中介绍的局部激活单元(就是下图中的Attention模块)。 -image-20210218180755462 +image-20210218180755462 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.2 DeepCrossing.md" "b/docs/\347\254\254\344\270\200\347\253\240 
\346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.2 DeepCrossing.md" index 0583e6b57..db001c511 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.2 DeepCrossing.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.2 DeepCrossing.md" @@ -18,7 +18,7 @@ DeepCrossing模型应用场景是微软搜索引擎Bing中的搜索广告推荐 DeepCrossing分别设置了不同神经网络层解决上述问题。模型结构如下 -image-20210217173154706 +image-20210217173154706 下面分别介绍一下各层的作用: @@ -48,7 +48,7 @@ dnn_inputs = Concatenate(axis=1)([dense_dnn_inputs, sparse_dnn_inputs]) # B x (n 该层的主要结构是MLP, 但DeepCrossing采用了残差网络进行的连接。通过多层残差网络对特征向量各个维度充分的交叉组合, 使得模型能够抓取更多的非线性特征和组合特征信息, 增加模型的表达能力。残差网络结构如下图所示: -image-20210217174914659 +image-20210217174914659 Deep Crossing模型使用稍微修改过的残差单元,它不使用卷积内核,改为了两层神经网络。我们可以看到,残差单元是通过两层ReLU变换再将原输入特征相加回来实现的。具体代码实现如下: @@ -136,11 +136,11 @@ def DeepCrossing(dnn_feature_columns): 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播。(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。 -image-20210304222328047 +image-20210304222328047 下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。 -![DeepCrossing](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片DeepCrossing.png) +![DeepCrossing](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片DeepCrossing.png) ## 5. 
参考资料 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.3 PNN.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.3 PNN.md" index 7b9bb4ccb..c513ee8d0 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.3 PNN.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.3 PNN.md" @@ -12,13 +12,13 @@ PNN模型其实是对IPNN和OPNN的总称,两者分别对应的是不同的Pro PNN模型的整体架构如下图所示: -image-20210308142624189 +image-20210308142624189 一共分为五层,其中除了Product Layer别的layer都是比较常规的处理方法,均可以从前面的章节进一步了解。模型中最重要的部分就是通过Product层对embedding特征进行交叉组合,也就是上图中红框所显示的部分。 Product层主要有线性部分和非线性部分组成,分别用$l_z$和$l_p$来表示, -image-20210308143101261 +image-20210308143101261 1. 线性模块,一阶特征(未经过显示特征交叉处理),对应论文中的$l_z=(l_z^1,l_z^2, ..., l_z^{D_1})$ 2. 非线性模块,高阶特征(经过显示特征交叉处理),对应论文中的$l_p=(l_p^1,l_p^2, ..., l_p^{D_1})$ @@ -236,7 +236,7 @@ class ProductLayer(Layer): 下面是一个通过keras画的模型结构图,为了更好的显示,类别特征都只是选择了一小部分,画图的代码也在github中。 -![PNN](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片PNN.png) +![PNN](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片PNN.png) ## 4. 
思考题 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.4 Wide&Deep.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.4 Wide&Deep.md" index 4c5e7d8f3..1078d72e9 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.4 Wide&Deep.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.4 Wide&Deep.md" @@ -14,7 +14,7 @@ Wide&Deep模型就是围绕记忆性和泛化性进行讨论的,模型能够 ## 2. 模型结构及原理
-image-20200910214310877 +image-20200910214310877
其实wide&deep模型本身的结构是非常简单的,对于有点机器学习基础和深度学习基础的人来说都非常的容易看懂,但是如何根据自己的场景去选择那些特征放在Wide部分,哪些特征放在Deep部分就需要理解这篇论文提出者当时对于设计该模型不同结构时的意图了,所以这也是用好这个模型的一个前提。 @@ -92,11 +92,11 @@ def WideNDeep(linear_feature_columns, dnn_feature_columns): 关于每一块的细节,这里就不解释了,在我们给出的GitHub代码中,我们已经加了非常详细的注释,大家看那个应该很容易看明白, 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播。(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。 -image-20210228160557072 +image-20210228160557072 下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。 -![Wide&Deep](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片Wide&Deep.png) +![Wide&Deep](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片Wide&Deep.png) ## 4. 思考 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.5 DeepFM.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.5 DeepFM.md" index 78e62ceb8..f4f569c95 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.5 DeepFM.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.5 DeepFM.md" @@ -9,15 +9,15 @@ - ==DNN局限== 当我们使用DNN网络解决推荐问题的时候存在网络参数过于庞大的问题,这是因为在进行特征处理的时候我们需要使用one-hot编码来处理离散特征,这会导致输入的维度猛增。这里借用AI大会的一张图片: - + 这样庞大的参数量也是不实际的。为了解决DNN参数量过大的局限性,可以采用非常经典的Field思想,将OneHot特征转换为Dense Vector - + 此时通过增加全连接层就可以实现高阶的特征组合,如下图所示: - + 但是仍然缺少低阶的特征组合,于是增加FM来表示低阶的特征组合。 @@ -25,7 +25,7 @@ 结合FM和DNN其实有两种方式,可以并行结合也可以串行结合。这两种方式各有几种代表模型。在DeepFM之前有FNN,虽然在影响力上可能并不如DeepFM,但是了解FNN的思想对我们理解DeepFM的特点和优点是很有帮助的。 - + 
FNN是使用预训练好的FM模块,得到隐向量,然后把隐向量作为DNN的输入,但是经过实验进一步发现,在Embedding layer和hidden layer1之间增加一个product层(如上图所示)可以提高模型的表现,所以提出了PNN,使用product layer替换FM预训练层。 @@ -33,7 +33,7 @@ FNN是使用预训练好的FM模块,得到隐向量,然后把隐向量作为 FNN和PNN模型仍然有一个比较明显的尚未解决的缺点:对于低阶组合特征学习到的比较少,这一点主要是由于FM和DNN的串行方式导致的,也就是虽然FM学到了低阶特征组合,但是DNN的全连接结构导致低阶特征并不能在DNN的输出端较好的表现。看来我们已经找到问题了,将串行方式改进为并行方式能比较好的解决这个问题。于是Google提出了Wide&Deep模型(将前几章),但是如果深入探究Wide&Deep的构成方式,虽然将整个模型的结构调整为了并行结构,在实际的使用中Wide Module中的部分需要较为精巧的特征工程,换句话说人工处理对于模型的效果具有比较大的影响(这一点可以在Wide&Deep模型部分得到验证)。 -image-20200910214310877 +image-20200910214310877 如上图所示,该模型仍然存在问题:**在output Units阶段直接将低阶和高阶特征进行组合,很容易让模型最终偏向学习到低阶或者高阶的特征,而不能做到很好的结合。** @@ -41,7 +41,7 @@ FNN和PNN模型仍然有一个比较明显的尚未解决的缺点:对于低 ## 2. 模型的结构与原理 -image-20210225180556628 +image-20210225180556628 前面的Field和Embedding处理是和前面的方法是相同的,如上图中的绿色部分;DeepFM将Wide部分替换为了FM layer如上图中的蓝色部分 @@ -57,13 +57,13 @@ FNN和PNN模型仍然有一个比较明显的尚未解决的缺点:对于低 $$ \hat{y}_{FM}(x) = w_0+\sum_{i=1}^N w_ix_i + \sum_{i=1}^N \sum_{j=i+1}^N v_i^T v_j x_ix_j $$ -image-20210225181340313 +image-20210225181340313 ### 2.2 Deep Deep架构图 -image-20210225181010107 +image-20210225181010107 Deep Module是为了学习高阶的特征组合,在上图中使用用全连接的方式将Dense Embedding输入到Hidden Layer,这里面Dense Embeddings就是为了解决DNN中的参数爆炸问题,这也是推荐模型中常用的处理方法。 @@ -132,11 +132,11 @@ def DeepFM(linear_feature_columns, dnn_feature_columns): 关于每一块的细节,这里就不解释了,在我们给出的GitHub代码中,我们已经加了非常详细的注释,大家看那个应该很容易看明白, 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。 -image-20210228161135777 +image-20210228161135777 下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。 -![DeepFM](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片DeepFM.png) +![DeepFM](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片DeepFM.png) @@ -146,7 +146,7 @@ def DeepFM(linear_feature_columns, dnn_feature_columns): 2. 
对于下图所示,根据你的理解Sparse Feature中的不同颜色节点分别表示什么意思 -image-20210225180556628 +image-20210225180556628 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.6 NFM.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.6 NFM.md" index c569925de..fdd26f9a9 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.6 NFM.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.6 NFM.md" @@ -10,11 +10,11 @@ $$ $$ 我们对比FM, 就会发现变化的是第三项,前两项还是原来的, 因为我们说FM的一个问题,就是只能到二阶交叉, 且是线性模型, 这是他本身的一个局限性, 而如果想突破这个局限性, 就需要从他的公式本身下点功夫, 于是乎,作者在这里改进的思路就是**用一个表达能力更强的函数来替代原FM中二阶隐向量内积的部分**。 - + 而这个表达能力更强的函数呢, 我们很容易就可以想到神经网络来充当,因为神经网络理论上可以拟合任何复杂能力的函数, 所以作者真的就把这个$f(x)$换成了一个神经网络,当然不是一个简单的DNN, 而是依然底层考虑了交叉,然后高层使用的DNN网络, 这个也就是我们最终的NFM网络了: - + 这个结构,如果前面看过了PNN的伙伴会发现,这个结构和PNN非常像,只不过那里是一个product_layer, 而这里换成了Bi-Interaction Pooling了, 这个也是NFM的核心结构了。这里注意, 这个结构中,忽略了一阶部分,只可视化出来了$f(x)$, 我们还是下面从底层一点点的对这个网络进行剖析。 @@ -137,11 +137,11 @@ def NFM(linear_feature_columns, dnn_feature_columns): 有了上面的解释,这个模型的宏观层面相信就很容易理解了。关于这每一块的细节,这里就不解释了,在我们给出的GitHub代码中,我们已经加了非常详细的注释,大家看那个应该很容易看明白, 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播。(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。 -NFM_aaaa +NFM_aaaa 下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。 -![nfm](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片nfm.png) +![nfm](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片nfm.png) ## 4. 
思考题 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.7 DCN.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.7 DCN.md" index 023824d7c..483b4fa76 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.7 DCN.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.7 DCN.md" @@ -8,7 +8,7 @@ Wide&Deep模型的提出不仅综合了“记忆能力”和“泛化能力” 这个模型的结构是这个样子的: - + 这个模型的结构也是比较简洁的, 从下到上依次为:Embedding和Stacking层, Cross网络层与Deep网络层并列, 以及最后的输出层。下面也是一一为大家剖析。 @@ -35,7 +35,7 @@ $$ $$ 可以看到, 交叉层的二阶部分非常类似PNN提到的外积操作, 在此基础上增加了外积操作的权重向量$w_l$, 以及原输入向量$x_l$和偏置向量$b_l$。 交叉层的可视化如下: - + 可以看到, 每一层增加了一个$n$维的权重向量$w_l$(n表示输入向量维度), 并且在每一层均保留了输入向量, 因此输入和输出之间的变化不会特别明显。关于这一层, 原论文里面有个具体的证明推导Cross Network为啥有效, 不过比较复杂,这里我拿一个式子简单的解释下上面这个公式的伟大之处: @@ -139,7 +139,7 @@ def DCN(linear_feature_columns, dnn_feature_columns): 下面是一个通过keras画的模型结构图,为了更好的显示,类别特征都只是选择了一小部分,画图的代码也在github中。 -![DCN](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片DCN.png) +![DCN](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片DCN.png) ## 4. 
思考 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.8 AFM.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.8 AFM.md" index 59c749d9f..c1999ee6e 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.8 AFM.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.8 AFM.md" @@ -10,7 +10,7 @@ $$ ## 2. AFM模型原理 -image-20210131092744905 +image-20210131092744905 上图表示的就是AFM交叉特征部分的模型结构(非交叉部分与FM是一样的,图中并没有给出)。AFM最核心的两个点分别是Pair-wise Interaction Layer和Attention-based Pooling。前者将输入的非零特征的隐向量两两计算element-wise product(哈达玛积,两个向量对应元素相乘,得到的还是一个向量),假如输入的特征中的非零向量的数量为m,那么经过Pair-wise Interaction Layer之后输出的就是$\frac{m(m-1)}{2}$个向量,再将前面得到的交叉特征向量组输入到Attention-based Pooling,该pooling层会先计算出每个特征组合的自适应权重(通过Attention Net进行计算),通过加权求和的方式将向量组压缩成一个向量,由于最终需要输出的是一个数值,所以还需要将前一步得到的向量通过另外一个向量将其映射成一个值,得到最终的基于注意力加权的二阶交叉特征的输出。(对于这部分如果不是很清楚,可以先看下面对两个核心层的介绍) @@ -109,11 +109,11 @@ def AFM(linear_feature_columns, dnn_feature_columns): 关于每一块的细节,这里就不解释了,在我们给出的GitHub代码中,我们已经加了非常详细的注释,大家看那个应该很容易看明白, 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。 -image-20210307200304199 +image-20210307200304199 下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。 -![AFM](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片AFM.png) +![AFM](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片AFM.png) ## 4. 
思考 diff --git "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.9 DIN.md" "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.9 DIN.md" index 34cf89f8c..083521174 100644 --- "a/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.9 DIN.md" +++ "b/docs/\347\254\254\344\270\200\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\237\272\347\241\200/1.2 \346\267\261\345\272\246\346\216\250\350\215\220\346\250\241\345\236\213/1.2.9 DIN.md" @@ -159,11 +159,11 @@ def DIN(feature_columns, behavior_feature_list, behavior_seq_feature_list): 关于每一块的细节,这里就不解释了,在我们给出的GitHub代码中,我们已经加了非常详细的注释,大家看那个应该很容易看明白, 为了方便大家的阅读,我们这里还给大家画了一个整体的模型架构图,帮助大家更好的了解每一块以及前向传播。(画的图不是很规范,先将就看一下,后面我们会统一在优化一下这个手工图)。 -DIN_aaaa +DIN_aaaa 下面是一个通过keras画的模型结构图,为了更好的显示,数值特征和类别特征都只是选择了一小部分,画图的代码也在github中。 -![din](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片din.png) +![din](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片din.png) ## 思考 diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.1 \350\265\233\351\242\230\347\220\206\350\247\243+Baseline.ipynb" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.1 \350\265\233\351\242\230\347\220\206\350\247\243+Baseline.ipynb" index 1dae6308d..1567babe2 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 
\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.1 \350\265\233\351\242\230\347\220\206\350\247\243+Baseline.ipynb" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.1 \350\265\233\351\242\230\347\220\206\350\247\243+Baseline.ipynb" @@ -1,664 +1,664 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 赛题理解\n", - "赛题理解是切入一道赛题的基础,会影响后续特征工程和模型构建等各种工作,也影响着后续发展工作的方向,正确了解赛题背后的思想以及赛题业务逻辑的清晰,有利于花费更少时间构建更为有效的特征模型, 在各种比赛中, 赛题理解都是极其重要且必须走好的第一步, 今天我们就从赛题的理解出发, 首先了解一下这次赛题的概况和数据,从中分析赛题以及大致的处理方式, 其次我们了解模型评测的指标,最后对赛题的理解整理一些经验。\n", - "\n", - "## 赛题简介\n", - "此次比赛是新闻推荐场景下的用户行为预测挑战赛, 该赛题是以新闻APP中的新闻推荐为背景, 目的是**要求我们根据用户历史浏览点击新闻文章的数据信息预测用户未来的点击行为, 即用户的最后一次点击的新闻文章**, 这道赛题的设计初衷是引导大家了解推荐系统中的一些业务背景, 解决实际问题。 \n", - "\n", - "## 数据概况\n", - "该数据来自某新闻APP平台的用户交互数据,包括30万用户,近300万次点击,共36万多篇不同的新闻文章,同时每篇新闻文章有对应的embedding向量表示。为了保证比赛的公平性,从中抽取20万用户的点击日志数据作为训练集,5万用户的点击日志数据作为测试集A,5万用户的点击日志数据作为测试集B。具体数据表和参数, 大家可以参考赛题说明。下面说一下拿到这样的数据如何进行理解, 来有效的开展下一步的工作。

\n", - "## 评价方式理解\n", - "理解评价方式, 我们需要结合着最后的提交文件来看, 根据sample.submit.csv, 我们最后提交的格式是针对每个用户, 我们都会给出五篇文章的推荐结果,按照点击概率从前往后排序。 而真实的每个用户最后一次点击的文章只会有一篇的真实答案, 所以我们就看我们推荐的这五篇里面是否有命中真实答案的。比如对于user1来说, 我们的提交会是:\n", - ">user1, article1, article2, article3, article4, article5.\n", - "\n", - "评价指标的公式如下:\n", - "$$\n", - "score(user) = \\sum_{k=1}^5 \\frac{s(user, k)}{k}\n", - "$$\n", - "\n", - "假如article1就是真实的用户点击文章,也就是article1命中, 则s(user1,1)=1, s(user1,2-4)都是0, 如果article2是用户点击的文章, 则s(user,2)=1/2,s(user,1,3,4,5)都是0。也就是score(user)=命中第几条的倒数。如果都没中, 则score(user1)=0。 这个是合理的, 因为我们希望的就是命中的结果尽量靠前, 而此时分数正好比较高。\n", - "\n", - "## 赛题理解\n", - "根据赛题简介,我们首先要明确我们此次比赛的目标: 根据用户历史浏览点击新闻的数据信息预测用户最后一次点击的新闻文章。从这个目标上看, 会发现此次比赛和我们之前遇到的普通的结构化比赛不太一样, 主要有两点:\n", - " \n", - "- 首先是目标上, 要预测最后一次点击的新闻文章,也就是我们给用户推荐的是新闻文章, 并不是像之前那种预测一个数或者预测数据哪一类那样的问题\n", - "- 数据上, 通过给出的数据我们会发现, 这种数据也不是我们之前遇到的那种特征+标签的数据,而是基于了真实的业务场景, 拿到的用户的点击日志\n", - "\n", - "所以拿到这个题目,我们的思考方向就是结合我们的目标,**把该预测问题转成一个监督学习的问题(特征+标签),然后我们才能进行ML,DL等建模预测**。那么我们自然而然的就应该在心里会有这么几个问题:如何转成一个监督学习问题呢? 转成一个什么样的监督学习问题呢? 我们能利用的特征又有哪些呢? 又有哪些模型可以尝试呢? 此次面对数万级别的文章推荐,我们又有哪些策略呢? \n", - "\n", - "当然这些问题不会在我们刚看到赛题之后就一下出来答案, 但是只要有了问题之后, 我们就能想办法解决问题了, 比如上面的第二个问题,转成一个什么样的监督学习问题? 由于我们是预测用户最后一次点击的新闻文章,从36万篇文章中预测某一篇的话我们首先可能会想到这可能是一个多分类的问题(36万类里面选1), 但是如此庞大的分类问题, 我们做起来可能比较困难, 那么能不能转化一下? 既然是要预测最后一次点击的文章, 那么如果我们能预测出某个用户最后一次对于某一篇文章会进行点击的概率, 是不是就间接性的解决了这个问题呢?概率最大的那篇文章不就是用户最后一次可能点击的新闻文章吗? 这样就把原问题变成了一个点击率预测的问题(用户, 文章) --> 点击的概率(软分类), 而这个问题, 就是我们所熟悉的监督学习领域分类问题了, 这样我们后面建模的时候, 对于模型的选择就基本上有大致方向了,比如最简单的逻辑回归模型。

\n", - "这样, 我们对于该赛题的解决方案应该有了一个大致的解决思路,要先转成一个分类问题来做, 而分类的标签就是用户是否会点击某篇文章,分类问题的特征中会有用户和文章,我们要训练一个分类模型, 对某用户最后一次点击某篇文章的概率进行预测。 那么又会有几个问题:如何转成监督学习问题? 训练集和测试集怎么制作? 我们又能利用哪些特征? 我们又可以尝试哪些模型? 面对36万篇文章, 20多万用户的推荐, 我们又有哪些策略来缩减问题的规模?如何进行最后的预测? " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Baseline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 导包" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:46:49.678700Z", - "start_time": "2020-11-16T07:46:49.673336Z" - } - }, - "outputs": [], - "source": [ - "# import packages\n", - "import time, math, os\n", - "from tqdm import tqdm\n", - "import gc\n", - "import pickle\n", - "import random\n", - "from datetime import datetime\n", - "from operator import itemgetter\n", - "import numpy as np\n", - "import pandas as pd\n", - "import warnings\n", - "from collections import defaultdict\n", - "import collections\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:48:34.240098Z", - "start_time": "2020-11-16T07:48:34.236370Z" - } - }, - "outputs": [], - "source": [ - "# data_path = './data_raw/'\n", - "data_path = '/home/admin/jupyter/data/' # 天池平台路径\n", - "save_path = '/home/admin/jupyter/temp_results/' # 天池平台路径" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## df节省内存函数" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# 节约内存的一个标配函数\n", - "def reduce_mem(df):\n", - " starttime = time.time()\n", - " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", - " start_mem = df.memory_usage().sum() / 1024**2\n", - " for col in df.columns:\n", - " col_type = df[col].dtypes\n", - " if col_type in numerics:\n", - " c_min = df[col].min()\n", - " c_max = df[col].max()\n", - " if 
pd.isnull(c_min) or pd.isnull(c_max):\n", - " continue\n", - " if str(col_type)[:3] == 'int':\n", - " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", - " df[col] = df[col].astype(np.int8)\n", - " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", - " df[col] = df[col].astype(np.int16)\n", - " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", - " df[col] = df[col].astype(np.int32)\n", - " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", - " df[col] = df[col].astype(np.int64)\n", - " else:\n", - " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", - " df[col] = df[col].astype(np.float16)\n", - " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", - " df[col] = df[col].astype(np.float32)\n", - " else:\n", - " df[col] = df[col].astype(np.float64)\n", - " end_mem = df.memory_usage().sum() / 1024**2\n", - " print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,\n", - " 100*(start_mem-end_mem)/start_mem,\n", - " (time.time()-starttime)/60))\n", - " return df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取采样或全量数据" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:48:50.619963Z", - "start_time": "2020-11-16T07:48:50.611667Z" - } - }, - "outputs": [], - "source": [ - "# debug模式:从训练集中划出一部分数据来调试代码\n", - "def get_all_click_sample(data_path, sample_nums=10000):\n", - " \"\"\"\n", - " 训练集中采样一部分数据调试\n", - " data_path: 原数据的存储路径\n", - " sample_nums: 采样数目(这里由于机器的内存限制,可以采样用户做)\n", - " \"\"\"\n", - " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " all_user_ids = all_click.user_id.unique()\n", - "\n", - " sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False) \n", - " all_click = 
all_click[all_click['user_id'].isin(sample_user_ids)]\n", - " \n", - " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", - " return all_click\n", - "\n", - "# 读取点击数据,这里分成线上和线下,如果是为了获取线上提交结果应该讲测试集中的点击数据合并到总的数据中\n", - "# 如果是为了线下验证模型的有效性或者特征的有效性,可以只使用训练集\n", - "def get_all_click_df(data_path='./data_raw/', offline=True):\n", - " if offline:\n", - " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " else:\n", - " trn_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", - "\n", - " all_click = trn_click.append(tst_click)\n", - " \n", - " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", - " return all_click" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# 全量训练集\n", - "all_click_df = get_all_click_df(data_path, offline=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 获取 用户 - 文章 - 点击时间字典" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:56:39.800240Z", - "start_time": "2020-11-16T07:56:39.793541Z" - } - }, - "outputs": [], - "source": [ - "# 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", - "def get_user_item_time(click_df):\n", - " \n", - " click_df = click_df.sort_values('click_timestamp')\n", - " \n", - " def make_item_time_pair(df):\n", - " return list(zip(df['click_article_id'], df['click_timestamp']))\n", - " \n", - " user_item_time_df = click_df.groupby('user_id')['click_article_id', 'click_timestamp'].apply(lambda x: make_item_time_pair(x))\\\n", - " .reset_index().rename(columns={0: 'item_time_list'})\n", - " user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))\n", - " \n", - " return user_item_time_dict" - ] - }, - { - "cell_type": "markdown", 
- "metadata": {}, - "source": [ - "## 获取点击最多的topk个文章" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# 获取近期点击最多的文章\n", - "def get_item_topk_click(click_df, k):\n", - " topk_click = click_df['click_article_id'].value_counts().index[:k]\n", - " return topk_click" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## itemcf的物品相似度计算" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:51:07.577037Z", - "start_time": "2020-11-16T07:51:07.568098Z" - } - }, - "outputs": [], - "source": [ - "def itemcf_sim(df):\n", - " \"\"\"\n", - " 文章与文章之间的相似性矩阵计算\n", - " :param df: 数据表\n", - " :item_created_time_dict: 文章创建时间的字典\n", - " return : 文章与文章的相似性矩阵\n", - " 思路: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习), 在多路召回部分会加上关联规则的召回策略\n", - " \"\"\"\n", - " \n", - " user_item_time_dict = get_user_item_time(df)\n", - " \n", - " # 计算物品相似度\n", - " i2i_sim = {}\n", - " item_cnt = defaultdict(int)\n", - " for user, item_time_list in tqdm(user_item_time_dict.items()):\n", - " # 在基于商品的协同过滤优化的时候可以考虑时间因素\n", - " for i, i_click_time in item_time_list:\n", - " item_cnt[i] += 1\n", - " i2i_sim.setdefault(i, {})\n", - " for j, j_click_time in item_time_list:\n", - " if(i == j):\n", - " continue\n", - " i2i_sim[i].setdefault(j, 0)\n", - " \n", - " i2i_sim[i][j] += 1 / math.log(len(item_time_list) + 1)\n", - " \n", - " i2i_sim_ = i2i_sim.copy()\n", - " for i, related_items in i2i_sim.items():\n", - " for j, wij in related_items.items():\n", - " i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])\n", - " \n", - " # 将得到的相似性矩阵保存到本地\n", - " pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))\n", - " \n", - " return i2i_sim_" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:53:10.038470Z", - "start_time": "2020-11-16T07:51:11.281176Z" - } - }, - "outputs": [ + 
"cells": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [00:23<00:00, 10802.38it/s]\n" - ] - } - ], - "source": [ - "i2i_sim = itemcf_sim(all_click_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## itemcf 的文章推荐" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T08:03:18.383215Z", - "start_time": "2020-11-16T08:03:18.373432Z" - } - }, - "outputs": [], - "source": [ - "# 基于商品的召回i2i\n", - "def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click):\n", - " \"\"\"\n", - " 基于文章协同过滤的召回\n", - " :param user_id: 用户id\n", - " :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", - " :param i2i_sim: 字典,文章相似性矩阵\n", - " :param sim_item_topk: 整数, 选择与当前文章最相似的前k篇文章\n", - " :param recall_item_num: 整数, 最后的召回文章数量\n", - " :param item_topk_click: 列表,点击次数最多的文章列表,用户召回补全 \n", - " return: 召回的文章列表 {item1:score1, item2: score2...}\n", - " 注意: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习), 在多路召回部分会加上关联规则的召回策略\n", - " \"\"\"\n", - " \n", - " # 获取用户历史交互的文章\n", - " user_hist_items = user_item_time_dict[user_id]\n", - " user_hist_items_ = {user_id for user_id, _ in user_hist_items}\n", - " \n", - " item_rank = {}\n", - " for loc, (i, click_time) in enumerate(user_hist_items):\n", - " for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:\n", - " if j in user_hist_items_:\n", - " continue\n", - " \n", - " item_rank.setdefault(j, 0)\n", - " item_rank[j] += wij\n", - " \n", - " # 不足10个,用热门商品补全\n", - " if len(item_rank) < recall_item_num:\n", - " for i, item in enumerate(item_topk_click):\n", - " if item in item_rank.items(): # 填充的item应该不在原来的列表中\n", - " continue\n", - " item_rank[item] = - i - 100 # 随便给个负数就行\n", - " if len(item_rank) == recall_item_num:\n", - " break\n", - " \n", - " item_rank = sorted(item_rank.items(), 
key=lambda x: x[1], reverse=True)[:recall_item_num]\n", - " \n", - " return item_rank" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 给每个用户根据物品的协同过滤推荐文章" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:15:01.109798Z", - "start_time": "2020-11-16T08:11:07.233787Z" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 赛题理解\n", + "赛题理解是切入一道赛题的基础,会影响后续特征工程和模型构建等各种工作,也影响着后续发展工作的方向,正确了解赛题背后的思想以及赛题业务逻辑的清晰,有利于花费更少时间构建更为有效的特征模型, 在各种比赛中, 赛题理解都是极其重要且必须走好的第一步, 今天我们就从赛题的理解出发, 首先了解一下这次赛题的概况和数据,从中分析赛题以及大致的处理方式, 其次我们了解模型评测的指标,最后对赛题的理解整理一些经验。\n", + "\n", + "## 赛题简介\n", + "此次比赛是新闻推荐场景下的用户行为预测挑战赛, 该赛题是以新闻APP中的新闻推荐为背景, 目的是**要求我们根据用户历史浏览点击新闻文章的数据信息预测用户未来的点击行为, 即用户的最后一次点击的新闻文章**, 这道赛题的设计初衷是引导大家了解推荐系统中的一些业务背景, 解决实际问题。 \n", + "\n", + "## 数据概况\n", + "该数据来自某新闻APP平台的用户交互数据,包括30万用户,近300万次点击,共36万多篇不同的新闻文章,同时每篇新闻文章有对应的embedding向量表示。为了保证比赛的公平性,从中抽取20万用户的点击日志数据作为训练集,5万用户的点击日志数据作为测试集A,5万用户的点击日志数据作为测试集B。具体数据表和参数, 大家可以参考赛题说明。下面说一下拿到这样的数据如何进行理解, 来有效的开展下一步的工作。

\n", + "## 评价方式理解\n", + "理解评价方式, 我们需要结合着最后的提交文件来看, 根据sample.submit.csv, 我们最后提交的格式是针对每个用户, 我们都会给出五篇文章的推荐结果,按照点击概率从前往后排序。 而真实的每个用户最后一次点击的文章只会有一篇的真实答案, 所以我们就看我们推荐的这五篇里面是否有命中真实答案的。比如对于user1来说, 我们的提交会是:\n", + ">user1, article1, article2, article3, article4, article5.\n", + "\n", + "评价指标的公式如下:\n", + "$$\n", + "score(user) = \\sum_{k=1}^5 \\frac{s(user, k)}{k}\n", + "$$\n", + "\n", + "假如article1就是真实的用户点击文章,也就是article1命中, 则s(user1,1)=1, s(user1,2-5)都是0, 如果article2是用户点击的文章, 则s(user,2)=1(该项得分为1/2),s(user,1,3,4,5)都是0。也就是score(user)=命中第几条的倒数。如果都没中, 则score(user1)=0。 这个是合理的, 因为我们希望的就是命中的结果尽量靠前, 而此时分数正好比较高。\n", + "\n", + "## 赛题理解\n", + "根据赛题简介,我们首先要明确我们此次比赛的目标: 根据用户历史浏览点击新闻的数据信息预测用户最后一次点击的新闻文章。从这个目标上看, 会发现此次比赛和我们之前遇到的普通的结构化比赛不太一样, 主要有两点:\n", + " \n", + "- 首先是目标上, 要预测最后一次点击的新闻文章,也就是我们给用户推荐的是新闻文章, 并不是像之前那种预测一个数或者预测数据哪一类那样的问题\n", + "- 数据上, 通过给出的数据我们会发现, 这种数据也不是我们之前遇到的那种特征+标签的数据,而是基于了真实的业务场景, 拿到的用户的点击日志\n", + "\n", + "所以拿到这个题目,我们的思考方向就是结合我们的目标,**把该预测问题转成一个监督学习的问题(特征+标签),然后我们才能进行ML,DL等建模预测**。那么我们自然而然的就应该在心里会有这么几个问题:如何转成一个监督学习问题呢? 转成一个什么样的监督学习问题呢? 我们能利用的特征又有哪些呢? 又有哪些模型可以尝试呢? 此次面对数十万级别的文章推荐,我们又有哪些策略呢? \n", + "\n", + "当然这些问题不会在我们刚看到赛题之后就一下出来答案, 但是只要有了问题之后, 我们就能想办法解决问题了, 比如上面的第二个问题,转成一个什么样的监督学习问题? 由于我们是预测用户最后一次点击的新闻文章,从36万篇文章中预测某一篇的话我们首先可能会想到这可能是一个多分类的问题(36万类里面选1), 但是如此庞大的分类问题, 我们做起来可能比较困难, 那么能不能转化一下? 既然是要预测最后一次点击的文章, 那么如果我们能预测出某个用户最后一次对于某一篇文章会进行点击的概率, 是不是就间接性的解决了这个问题呢?概率最大的那篇文章不就是用户最后一次可能点击的新闻文章吗? 这样就把原问题变成了一个点击率预测的问题(用户, 文章) --> 点击的概率(软分类), 而这个问题, 就是我们所熟悉的监督学习领域分类问题了, 这样我们后面建模的时候, 对于模型的选择就基本上有大致方向了,比如最简单的逻辑回归模型。

\n", + "这样, 我们对于该赛题的解决方案应该有了一个大致的解决思路,要先转成一个分类问题来做, 而分类的标签就是用户是否会点击某篇文章,分类问题的特征中会有用户和文章,我们要训练一个分类模型, 对某用户最后一次点击某篇文章的概率进行预测。 那么又会有几个问题:如何转成监督学习问题? 训练集和测试集怎么制作? 我们又能利用哪些特征? 我们又可以尝试哪些模型? 面对36万篇文章, 20多万用户的推荐, 我们又有哪些策略来缩减问题的规模?如何进行最后的预测? " + ] }, - "scrolled": true - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [43:19<00:00, 96.18it/s] \n" - ] - } - ], - "source": [ - "# 定义\n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "\n", - "# 获取 用户 - 文章 - 点击时间的字典\n", - "user_item_time_dict = get_user_item_time(all_click_df)\n", - "\n", - "# 去取文章相似度\n", - "i2i_sim = pickle.load(open(save_path + 'itemcf_i2i_sim.pkl', 'rb'))\n", - "\n", - "# 相似文章的数量\n", - "sim_item_topk = 10\n", - "\n", - "# 召回文章数量\n", - "recall_item_num = 10\n", - "\n", - "# 用户热度补全\n", - "item_topk_click = get_item_topk_click(all_click_df, k=50)\n", - "\n", - "for user in tqdm(all_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, \n", - " sim_item_topk, recall_item_num, item_topk_click)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 召回字典转换成df" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:16:36.647466Z", - "start_time": "2020-11-16T10:16:24.791219Z" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Baseline" + ] }, - "scrolled": true - }, - "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [00:04<00:00, 53319.08it/s]\n" - ] - } - ], - "source": [ - "# 将字典的形式转换成df\n", - "user_item_score_list = []\n", - "\n", - "for user, items in tqdm(user_recall_items_dict.items()):\n", - " for item, score in items:\n", - " user_item_score_list.append([user, item, score])\n", - "\n", - "recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])" - ] 
- }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 生成提交文件" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:16:46.268341Z", - "start_time": "2020-11-16T10:16:46.259293Z" - } - }, - "outputs": [], - "source": [ - "# 生成提交文件\n", - "def submit(recall_df, topk=5, model_name=None):\n", - " recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])\n", - " recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - " \n", - " # 判断是不是每个用户都有5篇文章及以上\n", - " tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())\n", - " assert tmp.min() >= topk\n", - " \n", - " del recall_df['pred_score']\n", - " submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()\n", - " \n", - " submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]\n", - " # 按照提交格式定义列名\n", - " submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', \n", - " 3: 'article_3', 4: 'article_4', 5: 'article_5'})\n", - " \n", - " save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'\n", - " submit.to_csv(save_name, index=False, header=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:17:42.254328Z", - "start_time": "2020-11-16T10:17:32.211862Z" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 导包" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:46:49.678700Z", + "start_time": "2020-11-16T07:46:49.673336Z" + } + }, + "outputs": [], + "source": [ + "# import packages\n", + "import time, math, os\n", + "from tqdm import tqdm\n", + "import gc\n", + "import pickle\n", + "import random\n", + "from datetime import datetime\n", + "from operator import 
itemgetter\n", + "import numpy as np\n", + "import pandas as pd\n", + "import warnings\n", + "from collections import defaultdict\n", + "import collections\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:48:34.240098Z", + "start_time": "2020-11-16T07:48:34.236370Z" + } + }, + "outputs": [], + "source": [ + "# data_path = './data_raw/'\n", + "data_path = '/home/admin/jupyter/data/' # 天池平台路径\n", + "save_path = '/home/admin/jupyter/temp_results/' # 天池平台路径" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## df节省内存函数" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# 节约内存的一个标配函数\n", + "def reduce_mem(df):\n", + " starttime = time.time()\n", + " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", + " start_mem = df.memory_usage().sum() / 1024**2\n", + " for col in df.columns:\n", + " col_type = df[col].dtypes\n", + " if col_type in numerics:\n", + " c_min = df[col].min()\n", + " c_max = df[col].max()\n", + " if pd.isnull(c_min) or pd.isnull(c_max):\n", + " continue\n", + " if str(col_type)[:3] == 'int':\n", + " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", + " df[col] = df[col].astype(np.int8)\n", + " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", + " df[col] = df[col].astype(np.int16)\n", + " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", + " df[col] = df[col].astype(np.int32)\n", + " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", + " df[col] = df[col].astype(np.int64)\n", + " else:\n", + " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", + " df[col] = df[col].astype(np.float16)\n", + " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", + " df[col] = 
df[col].astype(np.float32)\n", + " else:\n", + " df[col] = df[col].astype(np.float64)\n", + " end_mem = df.memory_usage().sum() / 1024**2\n", + " print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,\n", + " 100*(start_mem-end_mem)/start_mem,\n", + " (time.time()-starttime)/60))\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取采样或全量数据" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:48:50.619963Z", + "start_time": "2020-11-16T07:48:50.611667Z" + } + }, + "outputs": [], + "source": [ + "# debug模式:从训练集中划出一部分数据来调试代码\n", + "def get_all_click_sample(data_path, sample_nums=10000):\n", + " \"\"\"\n", + " 训练集中采样一部分数据调试\n", + " data_path: 原数据的存储路径\n", + " sample_nums: 采样数目(这里由于机器的内存限制,可以采样用户做)\n", + " \"\"\"\n", + " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " all_user_ids = all_click.user_id.unique()\n", + "\n", + " sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False) \n", + " all_click = all_click[all_click['user_id'].isin(sample_user_ids)]\n", + " \n", + " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", + " return all_click\n", + "\n", + "# 读取点击数据,这里分成线上和线下,如果是为了获取线上提交结果应该讲测试集中的点击数据合并到总的数据中\n", + "# 如果是为了线下验证模型的有效性或者特征的有效性,可以只使用训练集\n", + "def get_all_click_df(data_path='./data_raw/', offline=True):\n", + " if offline:\n", + " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " else:\n", + " trn_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", + "\n", + " all_click = trn_click.append(tst_click)\n", + " \n", + " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", + " return all_click" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + 
"outputs": [], + "source": [ + "# 全量训练集\n", + "all_click_df = get_all_click_df(data_path, offline=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 获取 用户 - 文章 - 点击时间字典" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:56:39.800240Z", + "start_time": "2020-11-16T07:56:39.793541Z" + } + }, + "outputs": [], + "source": [ + "# 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", + "def get_user_item_time(click_df):\n", + " \n", + " click_df = click_df.sort_values('click_timestamp')\n", + " \n", + " def make_item_time_pair(df):\n", + " return list(zip(df['click_article_id'], df['click_timestamp']))\n", + " \n", + " user_item_time_df = click_df.groupby('user_id')['click_article_id', 'click_timestamp'].apply(lambda x: make_item_time_pair(x))\\\n", + " .reset_index().rename(columns={0: 'item_time_list'})\n", + " user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))\n", + " \n", + " return user_item_time_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 获取点击最多的topk个文章" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# 获取近期点击最多的文章\n", + "def get_item_topk_click(click_df, k):\n", + " topk_click = click_df['click_article_id'].value_counts().index[:k]\n", + " return topk_click" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## itemcf的物品相似度计算" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:51:07.577037Z", + "start_time": "2020-11-16T07:51:07.568098Z" + } + }, + "outputs": [], + "source": [ + "def itemcf_sim(df):\n", + " \"\"\"\n", + " 文章与文章之间的相似性矩阵计算\n", + " :param df: 数据表\n", + " :item_created_time_dict: 文章创建时间的字典\n", + " return : 文章与文章的相似性矩阵\n", + " 思路: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习), 在多路召回部分会加上关联规则的召回策略\n", 
+ " \"\"\"\n", + " \n", + " user_item_time_dict = get_user_item_time(df)\n", + " \n", + " # 计算物品相似度\n", + " i2i_sim = {}\n", + " item_cnt = defaultdict(int)\n", + " for user, item_time_list in tqdm(user_item_time_dict.items()):\n", + " # 在基于商品的协同过滤优化的时候可以考虑时间因素\n", + " for i, i_click_time in item_time_list:\n", + " item_cnt[i] += 1\n", + " i2i_sim.setdefault(i, {})\n", + " for j, j_click_time in item_time_list:\n", + " if(i == j):\n", + " continue\n", + " i2i_sim[i].setdefault(j, 0)\n", + " \n", + " i2i_sim[i][j] += 1 / math.log(len(item_time_list) + 1)\n", + " \n", + " i2i_sim_ = i2i_sim.copy()\n", + " for i, related_items in i2i_sim.items():\n", + " for j, wij in related_items.items():\n", + " i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])\n", + " \n", + " # 将得到的相似性矩阵保存到本地\n", + " pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))\n", + " \n", + " return i2i_sim_" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:53:10.038470Z", + "start_time": "2020-11-16T07:51:11.281176Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [00:23<00:00, 10802.38it/s]\n" + ] + } + ], + "source": [ + "i2i_sim = itemcf_sim(all_click_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## itemcf 的文章推荐" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T08:03:18.383215Z", + "start_time": "2020-11-16T08:03:18.373432Z" + } + }, + "outputs": [], + "source": [ + "# 基于商品的召回i2i\n", + "def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click):\n", + " \"\"\"\n", + " 基于文章协同过滤的召回\n", + " :param user_id: 用户id\n", + " :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", + " :param i2i_sim: 字典,文章相似性矩阵\n", + " :param 
sim_item_topk: 整数, 选择与当前文章最相似的前k篇文章\n", + " :param recall_item_num: 整数, 最后的召回文章数量\n", + " :param item_topk_click: 列表,点击次数最多的文章列表,用户召回补全 \n", + " return: 召回的文章列表 {item1:score1, item2: score2...}\n", + " 注意: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习), 在多路召回部分会加上关联规则的召回策略\n", + " \"\"\"\n", + " \n", + " # 获取用户历史交互的文章\n", + " user_hist_items = user_item_time_dict[user_id]\n", + " user_hist_items_ = {user_id for user_id, _ in user_hist_items}\n", + " \n", + " item_rank = {}\n", + " for loc, (i, click_time) in enumerate(user_hist_items):\n", + " for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:\n", + " if j in user_hist_items_:\n", + " continue\n", + " \n", + " item_rank.setdefault(j, 0)\n", + " item_rank[j] += wij\n", + " \n", + " # 不足10个,用热门商品补全\n", + " if len(item_rank) < recall_item_num:\n", + " for i, item in enumerate(item_topk_click):\n", + " if item in item_rank.items(): # 填充的item应该不在原来的列表中\n", + " continue\n", + " item_rank[item] = - i - 100 # 随便给个负数就行\n", + " if len(item_rank) == recall_item_num:\n", + " break\n", + " \n", + " item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]\n", + " \n", + " return item_rank" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 给每个用户根据物品的协同过滤推荐文章" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:15:01.109798Z", + "start_time": "2020-11-16T08:11:07.233787Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [43:19<00:00, 96.18it/s] \n" + ] + } + ], + "source": [ + "# 定义\n", + "user_recall_items_dict = collections.defaultdict(dict)\n", + "\n", + "# 获取 用户 - 文章 - 点击时间的字典\n", + "user_item_time_dict = get_user_item_time(all_click_df)\n", + "\n", + "# 去取文章相似度\n", + "i2i_sim = pickle.load(open(save_path + 'itemcf_i2i_sim.pkl', 'rb'))\n", + "\n", + "# 相似文章的数量\n", + "sim_item_topk 
= 10\n", + "\n", + "# 召回文章数量\n", + "recall_item_num = 10\n", + "\n", + "# 用户热度补全\n", + "item_topk_click = get_item_topk_click(all_click_df, k=50)\n", + "\n", + "for user in tqdm(all_click_df['user_id'].unique()):\n", + " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, \n", + " sim_item_topk, recall_item_num, item_topk_click)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 召回字典转换成df" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:16:36.647466Z", + "start_time": "2020-11-16T10:16:24.791219Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [00:04<00:00, 53319.08it/s]\n" + ] + } + ], + "source": [ + "# 将字典的形式转换成df\n", + "user_item_score_list = []\n", + "\n", + "for user, items in tqdm(user_recall_items_dict.items()):\n", + " for item, score in items:\n", + " user_item_score_list.append([user, item, score])\n", + "\n", + "recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_article_id', 'pred_score'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 生成提交文件" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:16:46.268341Z", + "start_time": "2020-11-16T10:16:46.259293Z" + } + }, + "outputs": [], + "source": [ + "# 生成提交文件\n", + "def submit(recall_df, topk=5, model_name=None):\n", + " recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])\n", + " recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + " \n", + " # 判断是不是每个用户都有5篇文章及以上\n", + " tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())\n", + " assert tmp.min() >= topk\n", + " \n", + " del recall_df['pred_score']\n", + " submit = recall_df[recall_df['rank'] <= 
topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()\n", + " \n", + " submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]\n", + " # 按照提交格式定义列名\n", + " submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', \n", + " 3: 'article_3', 4: 'article_4', 5: 'article_5'})\n", + " \n", + " save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'\n", + " submit.to_csv(save_name, index=False, header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:17:42.254328Z", + "start_time": "2020-11-16T10:17:32.211862Z" + } + }, + "outputs": [], + "source": [ + "# 获取测试集\n", + "tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", + "tst_users = tst_click['user_id'].unique()\n", + "\n", + "# 从所有的召回数据中将测试集中的用户选出来\n", + "tst_recall = recall_df[recall_df['user_id'].isin(tst_users)]\n", + "\n", + "# 生成提交文件\n", + "submit(tst_recall, topk=5, model_name='itemcf_baseline')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 总结\n", + "本节内容主要包括赛题简介,数据概况,评价方式以及对该赛题进行了一个总体上的思路分析,作为竞赛前的预热,旨在帮助学习者们能够更好切入该赛题,为后面的学习内容打下一个良好的基础。最后我们给出了关于本赛题的一个简易Baseline, 帮助学习者们先了解一下新闻推荐比赛的一个整理流程, 接下来我们就对于流程中的每个步骤进行详细的介绍。\n", + "\n", + "今天的学习比较简单,下面整理一下关于赛题理解的一些经验:\n", + "\n", + "* 赛题理解究竟是在理解什么? 
\n", + "\n", + ">**理解赛题**:从直观上对问题进行梳理, 分析问题的目标,到底要让做什么事情, **这个非常重要**\n", + ">\n", + ">**理解数据**:对赛题数据有一个初步了解,知道和任务相关的数据字段和数据字段的类型, 数据之间的内在关联等,大体梳理一下哪些数据会对我们解决问题非常有用,方便后面我们的数据分析和特征工程。\n", + ">\n", + ">**理解评估指标**:评估指标是检验我们提出的方法,我们给出结果好坏的标准,只有正确的理解了评估指标,我们才能进行更好的训练模型,更好的进行预测。此外,很多情况下,线上验证是有一定的时间和次数限制的,**所以在比赛中构建一个合理的本地的验证集和验证的评价指标是很关键的步骤,能有效的节省很多时间**。 不同的指标对于同样的预测结果是具有误差敏感的差异性的所以不同的评价指标会影响后续一些预测的侧重点。\n", + "\n", + "* 有了赛题理解之后,我们该做什么?\n", + "\n", + " >在对于赛题有了一定的了解后,分析清楚了问题的类型性质和对于数据理解 的这一基础上,我们可以梳理一个解决赛题的一个大题思路和框架\n", + " >\n", + " >我们至少要有一些相应的理解分析,比如**这题的难点可能在哪里,关键点可能在哪里,哪些地方可以挖掘更好的特征**.\n", + " >\n", + " >用什么样得线下验证方式更为稳定,**出现了过拟合或者其他问题,估摸可以用什么方法去解决这些问题**\n", + "\n", + " 这时是在一个宏观的大体下分析的,有助于摸清整个题的思路脉络,以及后续的分析方向\n", + "\n", + "**关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", + "\n", + "![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" + ] } - }, - "outputs": [], - "source": [ - "# 获取测试集\n", - "tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", - "tst_users = tst_click['user_id'].unique()\n", - "\n", - "# 从所有的召回数据中将测试集中的用户选出来\n", - "tst_recall = recall_df[recall_df['user_id'].isin(tst_users)]\n", - "\n", - "# 生成提交文件\n", - "submit(tst_recall, topk=5, model_name='itemcf_baseline')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 总结\n", - "本节内容主要包括赛题简介,数据概况,评价方式以及对该赛题进行了一个总体上的思路分析,作为竞赛前的预热,旨在帮助学习者们能够更好切入该赛题,为后面的学习内容打下一个良好的基础。最后我们给出了关于本赛题的一个简易Baseline, 帮助学习者们先了解一下新闻推荐比赛的一个整理流程, 接下来我们就对于流程中的每个步骤进行详细的介绍。\n", - "\n", - "今天的学习比较简单,下面整理一下关于赛题理解的一些经验:\n", - "\n", - "* 赛题理解究竟是在理解什么? 
\n", - "\n", - ">**理解赛题**:从直观上对问题进行梳理, 分析问题的目标,到底要让做什么事情, **这个非常重要**\n", - ">\n", - ">**理解数据**:对赛题数据有一个初步了解,知道和任务相关的数据字段和数据字段的类型, 数据之间的内在关联等,大体梳理一下哪些数据会对我们解决问题非常有用,方便后面我们的数据分析和特征工程。\n", - ">\n", - ">**理解评估指标**:评估指标是检验我们提出的方法,我们给出结果好坏的标准,只有正确的理解了评估指标,我们才能进行更好的训练模型,更好的进行预测。此外,很多情况下,线上验证是有一定的时间和次数限制的,**所以在比赛中构建一个合理的本地的验证集和验证的评价指标是很关键的步骤,能有效的节省很多时间**。 不同的指标对于同样的预测结果是具有误差敏感的差异性的所以不同的评价指标会影响后续一些预测的侧重点。\n", - "\n", - "* 有了赛题理解之后,我们该做什么?\n", - "\n", - " >在对于赛题有了一定的了解后,分析清楚了问题的类型性质和对于数据理解 的这一基础上,我们可以梳理一个解决赛题的一个大题思路和框架\n", - " >\n", - " >我们至少要有一些相应的理解分析,比如**这题的难点可能在哪里,关键点可能在哪里,哪些地方可以挖掘更好的特征**.\n", - " >\n", - " >用什么样得线下验证方式更为稳定,**出现了过拟合或者其他问题,估摸可以用什么方法去解决这些问题**\n", - "\n", - " 这时是在一个宏观的大体下分析的,有助于摸清整个题的思路脉络,以及后续的分析方向\n", - "\n", - "**关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", - "\n", - "![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - }, - "latex_envs": { - "LaTeX_envs_menu_present": true, - "autoclose": false, - "autocomplete": true, - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 1, - "hotkeys": { - "equation": "Ctrl-E", - "itemize": "Ctrl-I" - }, - "labels_anchors": false, - "latex_user_defs": false, - "report_style_numbering": false, - "user_envs_cfg": false - }, - "tianchi_metadata": { - "competitions": [], - "datasets": [], - 
"description": "", - "notebookId": "130006", - "source": "dsw" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "170px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "tianchi_metadata": { + "competitions": [], + "datasets": [], + "description": "", + "notebookId": "130006", + "source": "dsw" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + 
"toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "170px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.2 \346\225\260\346\215\256\345\210\206\346\236\220.ipynb" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.2 \346\225\260\346\215\256\345\210\206\346\236\220.ipynb" index c9cbc0c37..6bc2d7d2b 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.2 \346\225\260\346\215\256\345\210\206\346\236\220.ipynb" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.2 \346\225\260\346\215\256\345\210\206\346\236\220.ipynb" 
@@ -1,3980 +1,3980 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 数据分析\n", - "\n", - "数据分析的价值主要在于熟悉了解整个数据集的基本情况包括每个文件里有哪些数据,具体的文件中的每个字段表示什么实际含义,以及数据集中特征之间的相关性,在推荐场景下主要就是分析用户本身的基本属性,文章基本属性,以及用户和文章交互的一些分布,这些都有利于后面的召回策略的选择,以及特征工程。\n", - "\n", - "**建议:当特征工程和模型调参已经很难继续上分了,可以回来在重新从新的角度去分析这些数据,或许可以找到上分的灵感**\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 导包" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:13:59.322486Z", - "start_time": "2020-11-13T15:13:55.601445Z" - } - }, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "plt.rc('font', family='SimHei', size=13)\n", - "\n", - "import os,gc,re,warnings,sys\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取数据" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:14:18.918041Z", - "start_time": "2020-11-13T15:14:02.568798Z" - } - }, - "outputs": [], - "source": [ - "# path = './data/' # 自定义的路径\n", - "path = './' # 天池平台路径\n", - "\n", - "#####train\n", - "trn_click = pd.read_csv(path+'train_click_log.csv')\n", - "#trn_click = pd.read_csv(path+'train_click_log.csv', names=['user_id','item_id','click_time','click_environment','click_deviceGroup','click_os','click_country','click_region','click_referrer_type'])\n", - "item_df = pd.read_csv(path+'articles.csv')\n", - "item_df = item_df.rename(columns={'article_id': 'click_article_id'}) #重命名,方便后续match\n", - "item_emb_df = pd.read_csv(path+'articles_emb.csv')\n", - "\n", - "#####test\n", - "tst_click = pd.read_csv(path+'testA_click_log.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 数据预处理\n", - "计算用户点击rank和点击次数" - ] - 
}, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:14:31.746748Z", - "start_time": "2020-11-13T15:14:31.409643Z" - } - }, - "outputs": [], - "source": [ - "# 对每个用户的点击时间戳进行排序\n", - "trn_click['rank'] = trn_click.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)\n", - "tst_click['rank'] = tst_click.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:15:04.503079Z", - "start_time": "2020-11-13T15:15:04.394329Z" - } - }, - "outputs": [], - "source": [ - "#计算用户点击文章的次数,并添加新的一列count\n", - "trn_click['click_cnts'] = trn_click.groupby(['user_id'])['click_timestamp'].transform('count')\n", - "tst_click['click_cnts'] = tst_click.groupby(['user_id'])['click_timestamp'].transform('count')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 数据浏览" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户点击日志文件_训练集" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:16:07.764776Z", - "start_time": "2020-11-13T15:16:07.536342Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
019999916041715070295701904117113111112811506942089000173
11999995408150702957147841171131101141506994257000118
219999950823150702960147841171131911991507013614000213
319999815777015070295322004117125540402811506983935000201
41999989661315070296718314117125539402091506938444000185
\n", - "
" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 数据分析\n", + "\n", + "数据分析的价值主要在于熟悉了解整个数据集的基本情况包括每个文件里有哪些数据,具体的文件中的每个字段表示什么实际含义,以及数据集中特征之间的相关性,在推荐场景下主要就是分析用户本身的基本属性,文章基本属性,以及用户和文章交互的一些分布,这些都有利于后面的召回策略的选择,以及特征工程。\n", + "\n", + "**建议:当特征工程和模型调参已经很难继续上分了,可以回来在重新从新的角度去分析这些数据,或许可以找到上分的灵感**\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 导包" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:13:59.322486Z", + "start_time": "2020-11-13T15:13:55.601445Z" + } + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "plt.rc('font', family='SimHei', size=13)\n", + "\n", + "import os,gc,re,warnings,sys\n", + "warnings.filterwarnings(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取数据" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:14:18.918041Z", + "start_time": "2020-11-13T15:14:02.568798Z" + } + }, + "outputs": [], + "source": [ + "# path = './data/' # 自定义的路径\n", + "path = './' # 天池平台路径\n", + "\n", + "#####train\n", + "trn_click = pd.read_csv(path+'train_click_log.csv')\n", + "#trn_click = pd.read_csv(path+'train_click_log.csv', names=['user_id','item_id','click_time','click_environment','click_deviceGroup','click_os','click_country','click_region','click_referrer_type'])\n", + "item_df = pd.read_csv(path+'articles.csv')\n", + "item_df = item_df.rename(columns={'article_id': 'click_article_id'}) #重命名,方便后续match\n", + "item_emb_df = pd.read_csv(path+'articles_emb.csv')\n", + "\n", + "#####test\n", + "tst_click = pd.read_csv(path+'testA_click_log.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据预处理\n", + "计算用户点击rank和点击次数" + ] + }, + { + "cell_type": 
"code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:14:31.746748Z", + "start_time": "2020-11-13T15:14:31.409643Z" + } + }, + "outputs": [], + "source": [ + "# 对每个用户的点击时间戳进行排序\n", + "trn_click['rank'] = trn_click.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)\n", + "tst_click['rank'] = tst_click.groupby(['user_id'])['click_timestamp'].rank(ascending=False).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:15:04.503079Z", + "start_time": "2020-11-13T15:15:04.394329Z" + } + }, + "outputs": [], + "source": [ + "#计算用户点击文章的次数,并添加新的一列count\n", + "trn_click['click_cnts'] = trn_click.groupby(['user_id'])['click_timestamp'].transform('count')\n", + "tst_click['click_cnts'] = tst_click.groupby(['user_id'])['click_timestamp'].transform('count')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据浏览" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户点击日志文件_训练集" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:16:07.764776Z", + "start_time": "2020-11-13T15:16:07.536342Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
019999916041715070295701904117113111112811506942089000173
11999995408150702957147841171131101141506994257000118
219999950823150702960147841171131911991507013614000213
319999815777015070295322004117125540402811506983935000201
41999989661315070296718314117125539402091506938444000185
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id click_timestamp click_environment \\\n", + "0 199999 160417 1507029570190 4 \n", + "1 199999 5408 1507029571478 4 \n", + "2 199999 50823 1507029601478 4 \n", + "3 199998 157770 1507029532200 4 \n", + "4 199998 96613 1507029671831 4 \n", + "\n", + " click_deviceGroup click_os click_country click_region \\\n", + "0 1 17 1 13 \n", + "1 1 17 1 13 \n", + "2 1 17 1 13 \n", + "3 1 17 1 25 \n", + "4 1 17 1 25 \n", + "\n", + " click_referrer_type rank click_cnts category_id created_at_ts \\\n", + "0 1 11 11 281 1506942089000 \n", + "1 1 10 11 4 1506994257000 \n", + "2 1 9 11 99 1507013614000 \n", + "3 5 40 40 281 1506983935000 \n", + "4 5 39 40 209 1506938444000 \n", + "\n", + " words_count \n", + "0 173 \n", + "1 118 \n", + "2 213 \n", + "3 201 \n", + "4 185 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " user_id click_article_id click_timestamp click_environment \\\n", - "0 199999 160417 1507029570190 4 \n", - "1 199999 5408 1507029571478 4 \n", - "2 199999 50823 1507029601478 4 \n", - "3 199998 157770 1507029532200 4 \n", - "4 199998 96613 1507029671831 4 \n", - "\n", - " click_deviceGroup click_os click_country click_region \\\n", - "0 1 17 1 13 \n", - "1 1 17 1 13 \n", - "2 1 17 1 13 \n", - "3 1 17 1 25 \n", - "4 1 17 1 25 \n", - "\n", - " click_referrer_type rank click_cnts category_id created_at_ts \\\n", - "0 1 11 11 281 1506942089000 \n", - "1 1 10 11 4 1506994257000 \n", - "2 1 9 11 99 1507013614000 \n", - "3 5 40 40 281 1506983935000 \n", - "4 5 39 40 209 1506938444000 \n", - "\n", - " words_count \n", - "0 173 \n", - "1 118 \n", - "2 213 \n", - "3 201 \n", - "4 185 " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_click = trn_click.merge(item_df, how='left', on=['click_article_id'])\n", - "trn_click.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": 
[ - "#### train_click_log.csv文件数据中每个字段的含义\n", - "\n", - "1. user_id: 用户的唯一标识\n", - "2. click_article_id: 用户点击的文章唯一标识\n", - "3. click_timestamp: 用户点击文章时的时间戳\n", - "4. click_environment: 用户点击文章的环境\n", - "5. click_deviceGroup: 用户点击文章的设备组\n", - "6. click_os: 用户点击文章时的操作系统\n", - "7. click_country: 用户点击文章时的所在的国家\n", - "8. click_region: 用户点击文章时所在的区域\n", - "9. click_referrer_type: 用户点击文章时,文章的来源" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:16:18.536902Z", - "start_time": "2020-11-13T15:16:18.424203Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Int64Index: 1112623 entries, 0 to 1112622\n", - "Data columns (total 14 columns):\n", - "user_id 1112623 non-null int64\n", - "click_article_id 1112623 non-null int64\n", - "click_timestamp 1112623 non-null int64\n", - "click_environment 1112623 non-null int64\n", - "click_deviceGroup 1112623 non-null int64\n", - "click_os 1112623 non-null int64\n", - "click_country 1112623 non-null int64\n", - "click_region 1112623 non-null int64\n", - "click_referrer_type 1112623 non-null int64\n", - "rank 1112623 non-null int64\n", - "click_cnts 1112623 non-null int64\n", - "category_id 1112623 non-null int64\n", - "created_at_ts 1112623 non-null int64\n", - "words_count 1112623 non-null int64\n", - "dtypes: int64(14)\n", - "memory usage: 127.3 MB\n" - ] - } - ], - "source": [ - "#用户点击日志信息\n", - "trn_click.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
count1.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+06
mean1.221198e+051.951541e+051.507588e+123.947786e+001.815981e+001.301976e+011.310776e+001.813587e+011.910063e+007.118518e+001.323704e+013.056176e+021.506598e+122.011981e+02
std5.540349e+049.292286e+043.363466e+083.276715e-011.035170e+006.967844e+001.618264e+007.105832e+001.220012e+001.016095e+011.631503e+011.155791e+028.343066e+095.223881e+01
min0.000000e+003.000000e+001.507030e+121.000000e+001.000000e+002.000000e+001.000000e+001.000000e+001.000000e+001.000000e+002.000000e+001.000000e+001.166573e+120.000000e+00
25%7.934700e+041.239090e+051.507297e+124.000000e+001.000000e+002.000000e+001.000000e+001.300000e+011.000000e+002.000000e+004.000000e+002.500000e+021.507220e+121.700000e+02
50%1.309670e+052.038900e+051.507596e+124.000000e+001.000000e+001.700000e+011.000000e+002.100000e+012.000000e+004.000000e+008.000000e+003.280000e+021.507553e+121.970000e+02
75%1.704010e+052.777120e+051.507841e+124.000000e+003.000000e+001.700000e+011.000000e+002.500000e+012.000000e+008.000000e+001.600000e+014.100000e+021.507756e+122.280000e+02
max1.999990e+053.640460e+051.510603e+124.000000e+005.000000e+002.000000e+011.100000e+012.800000e+017.000000e+002.410000e+022.410000e+024.600000e+021.510666e+126.690000e+03
\n", - "
" + "source": [ + "trn_click = trn_click.merge(item_df, how='left', on=['click_article_id'])\n", + "trn_click.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### train_click_log.csv文件数据中每个字段的含义\n", + "\n", + "1. user_id: 用户的唯一标识\n", + "2. click_article_id: 用户点击的文章唯一标识\n", + "3. click_timestamp: 用户点击文章时的时间戳\n", + "4. click_environment: 用户点击文章的环境\n", + "5. click_deviceGroup: 用户点击文章的设备组\n", + "6. click_os: 用户点击文章时的操作系统\n", + "7. click_country: 用户点击文章时的所在的国家\n", + "8. click_region: 用户点击文章时所在的区域\n", + "9. click_referrer_type: 用户点击文章时,文章的来源" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:16:18.536902Z", + "start_time": "2020-11-13T15:16:18.424203Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 1112623 entries, 0 to 1112622\n", + "Data columns (total 14 columns):\n", + "user_id 1112623 non-null int64\n", + "click_article_id 1112623 non-null int64\n", + "click_timestamp 1112623 non-null int64\n", + "click_environment 1112623 non-null int64\n", + "click_deviceGroup 1112623 non-null int64\n", + "click_os 1112623 non-null int64\n", + "click_country 1112623 non-null int64\n", + "click_region 1112623 non-null int64\n", + "click_referrer_type 1112623 non-null int64\n", + "rank 1112623 non-null int64\n", + "click_cnts 1112623 non-null int64\n", + "category_id 1112623 non-null int64\n", + "created_at_ts 1112623 non-null int64\n", + "words_count 1112623 non-null int64\n", + "dtypes: int64(14)\n", + "memory usage: 127.3 MB\n" + ] + } ], - "text/plain": [ - " user_id click_article_id click_timestamp click_environment \\\n", - "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", - "mean 1.221198e+05 1.951541e+05 1.507588e+12 3.947786e+00 \n", - "std 5.540349e+04 9.292286e+04 3.363466e+08 3.276715e-01 \n", - "min 0.000000e+00 3.000000e+00 1.507030e+12 1.000000e+00 \n", - "25% 7.934700e+04 1.239090e+05 
1.507297e+12 4.000000e+00 \n", - "50% 1.309670e+05 2.038900e+05 1.507596e+12 4.000000e+00 \n", - "75% 1.704010e+05 2.777120e+05 1.507841e+12 4.000000e+00 \n", - "max 1.999990e+05 3.640460e+05 1.510603e+12 4.000000e+00 \n", - "\n", - " click_deviceGroup click_os click_country click_region \\\n", - "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", - "mean 1.815981e+00 1.301976e+01 1.310776e+00 1.813587e+01 \n", - "std 1.035170e+00 6.967844e+00 1.618264e+00 7.105832e+00 \n", - "min 1.000000e+00 2.000000e+00 1.000000e+00 1.000000e+00 \n", - "25% 1.000000e+00 2.000000e+00 1.000000e+00 1.300000e+01 \n", - "50% 1.000000e+00 1.700000e+01 1.000000e+00 2.100000e+01 \n", - "75% 3.000000e+00 1.700000e+01 1.000000e+00 2.500000e+01 \n", - "max 5.000000e+00 2.000000e+01 1.100000e+01 2.800000e+01 \n", - "\n", - " click_referrer_type rank click_cnts category_id \\\n", - "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", - "mean 1.910063e+00 7.118518e+00 1.323704e+01 3.056176e+02 \n", - "std 1.220012e+00 1.016095e+01 1.631503e+01 1.155791e+02 \n", - "min 1.000000e+00 1.000000e+00 2.000000e+00 1.000000e+00 \n", - "25% 1.000000e+00 2.000000e+00 4.000000e+00 2.500000e+02 \n", - "50% 2.000000e+00 4.000000e+00 8.000000e+00 3.280000e+02 \n", - "75% 2.000000e+00 8.000000e+00 1.600000e+01 4.100000e+02 \n", - "max 7.000000e+00 2.410000e+02 2.410000e+02 4.600000e+02 \n", - "\n", - " created_at_ts words_count \n", - "count 1.112623e+06 1.112623e+06 \n", - "mean 1.506598e+12 2.011981e+02 \n", - "std 8.343066e+09 5.223881e+01 \n", - "min 1.166573e+12 0.000000e+00 \n", - "25% 1.507220e+12 1.700000e+02 \n", - "50% 1.507553e+12 1.970000e+02 \n", - "75% 1.507756e+12 2.280000e+02 \n", - "max 1.510666e+12 6.690000e+03 " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_click.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": 
[ - "200000" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#训练集中的用户数量为20w\n", - "trn_click.user_id.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T16:03:01.378461Z", - "start_time": "2020-11-13T16:03:01.300712Z" - } - }, - "outputs": [ + "source": [ + "#用户点击日志信息\n", + "trn_click.info()" + ] + }, { - "data": { - "text/plain": [ - "2" + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
count1.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+061.112623e+06
mean1.221198e+051.951541e+051.507588e+123.947786e+001.815981e+001.301976e+011.310776e+001.813587e+011.910063e+007.118518e+001.323704e+013.056176e+021.506598e+122.011981e+02
std5.540349e+049.292286e+043.363466e+083.276715e-011.035170e+006.967844e+001.618264e+007.105832e+001.220012e+001.016095e+011.631503e+011.155791e+028.343066e+095.223881e+01
min0.000000e+003.000000e+001.507030e+121.000000e+001.000000e+002.000000e+001.000000e+001.000000e+001.000000e+001.000000e+002.000000e+001.000000e+001.166573e+120.000000e+00
25%7.934700e+041.239090e+051.507297e+124.000000e+001.000000e+002.000000e+001.000000e+001.300000e+011.000000e+002.000000e+004.000000e+002.500000e+021.507220e+121.700000e+02
50%1.309670e+052.038900e+051.507596e+124.000000e+001.000000e+001.700000e+011.000000e+002.100000e+012.000000e+004.000000e+008.000000e+003.280000e+021.507553e+121.970000e+02
75%1.704010e+052.777120e+051.507841e+124.000000e+003.000000e+001.700000e+011.000000e+002.500000e+012.000000e+008.000000e+001.600000e+014.100000e+021.507756e+122.280000e+02
max1.999990e+053.640460e+051.510603e+124.000000e+005.000000e+002.000000e+011.100000e+012.800000e+017.000000e+002.410000e+022.410000e+024.600000e+021.510666e+126.690000e+03
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id click_timestamp click_environment \\\n", + "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", + "mean 1.221198e+05 1.951541e+05 1.507588e+12 3.947786e+00 \n", + "std 5.540349e+04 9.292286e+04 3.363466e+08 3.276715e-01 \n", + "min 0.000000e+00 3.000000e+00 1.507030e+12 1.000000e+00 \n", + "25% 7.934700e+04 1.239090e+05 1.507297e+12 4.000000e+00 \n", + "50% 1.309670e+05 2.038900e+05 1.507596e+12 4.000000e+00 \n", + "75% 1.704010e+05 2.777120e+05 1.507841e+12 4.000000e+00 \n", + "max 1.999990e+05 3.640460e+05 1.510603e+12 4.000000e+00 \n", + "\n", + " click_deviceGroup click_os click_country click_region \\\n", + "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", + "mean 1.815981e+00 1.301976e+01 1.310776e+00 1.813587e+01 \n", + "std 1.035170e+00 6.967844e+00 1.618264e+00 7.105832e+00 \n", + "min 1.000000e+00 2.000000e+00 1.000000e+00 1.000000e+00 \n", + "25% 1.000000e+00 2.000000e+00 1.000000e+00 1.300000e+01 \n", + "50% 1.000000e+00 1.700000e+01 1.000000e+00 2.100000e+01 \n", + "75% 3.000000e+00 1.700000e+01 1.000000e+00 2.500000e+01 \n", + "max 5.000000e+00 2.000000e+01 1.100000e+01 2.800000e+01 \n", + "\n", + " click_referrer_type rank click_cnts category_id \\\n", + "count 1.112623e+06 1.112623e+06 1.112623e+06 1.112623e+06 \n", + "mean 1.910063e+00 7.118518e+00 1.323704e+01 3.056176e+02 \n", + "std 1.220012e+00 1.016095e+01 1.631503e+01 1.155791e+02 \n", + "min 1.000000e+00 1.000000e+00 2.000000e+00 1.000000e+00 \n", + "25% 1.000000e+00 2.000000e+00 4.000000e+00 2.500000e+02 \n", + "50% 2.000000e+00 4.000000e+00 8.000000e+00 3.280000e+02 \n", + "75% 2.000000e+00 8.000000e+00 1.600000e+01 4.100000e+02 \n", + "max 7.000000e+00 2.410000e+02 2.410000e+02 4.600000e+02 \n", + "\n", + " created_at_ts words_count \n", + "count 1.112623e+06 1.112623e+06 \n", + "mean 1.506598e+12 2.011981e+02 \n", + "std 8.343066e+09 5.223881e+01 \n", + "min 1.166573e+12 0.000000e+00 \n", + "25% 1.507220e+12 
1.700000e+02 \n", + "50% 1.507553e+12 1.970000e+02 \n", + "75% 1.507756e+12 2.280000e+02 \n", + "max 1.510666e+12 6.690000e+03 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trn_click.describe()" ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_click.groupby('user_id')['click_article_id'].count().min() # 训练集里面每个用户至少点击了两篇文章" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### 画直方图大体看一下基本的属性分布" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "findfont: Font family ['SimHei'] not found. Falling back to DejaVu Sans.\n", - "findfont: Font family ['SimHei'] not found. Falling back to DejaVu Sans.\n" - ] - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure()\n", - "plt.figure(figsize=(15, 20))\n", - "i = 1\n", - "for col in ['click_article_id', 'click_timestamp', 'click_environment', 'click_deviceGroup', 'click_os', 'click_country', \n", - " 'click_region', 'click_referrer_type', 'rank', 'click_cnts']:\n", - " plot_envs = plt.subplot(5, 2, i)\n", - " i += 1\n", - " v = trn_click[col].value_counts().reset_index()[:10]\n", - " fig = sns.barplot(x=v['index'], y=v[col])\n", - " for item in fig.get_xticklabels():\n", - " item.set_rotation(90)\n", - " plt.title(col)\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "注:此处click_cnts直方图表示的是每篇文章对应用户的点击次数累计图\n", - "\n", - "也可以以用户角度分析,画出每个用户点击文章次数的直方图" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4 1084627\n", - "2 25894\n", - "1 2102\n", - "Name: click_environment, dtype: int64" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_click['click_environment'].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从点击环境click_environment来看,仅有2102次(占0.19%)点击环境为1;仅有25894次(占2.3%)点击环境为2;剩余(占97.6%)点击环境为4。" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1 678187\n", - "3 395558\n", - "4 38731\n", - "5 141\n", - "2 6\n", - "Name: click_deviceGroup, dtype: int64" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_click['click_deviceGroup'].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从点击设备组click_deviceGroup来看,设备1占大部分(61%),设备3占36%。" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 测试集用户点击日志" 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
024999916097415069591428204117113219192811506912747000259
124999916041715069591728204117113218192811506942089000173
2249998160974150695905606641121132552811506912747000259
3249998202557150695908606641121132453271506938401000219
4249997183665150695908861341171155773011500895686000256
\n", - "
" + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200000" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " user_id click_article_id click_timestamp click_environment \\\n", - "0 249999 160974 1506959142820 4 \n", - "1 249999 160417 1506959172820 4 \n", - "2 249998 160974 1506959056066 4 \n", - "3 249998 202557 1506959086066 4 \n", - "4 249997 183665 1506959088613 4 \n", - "\n", - " click_deviceGroup click_os click_country click_region \\\n", - "0 1 17 1 13 \n", - "1 1 17 1 13 \n", - "2 1 12 1 13 \n", - "3 1 12 1 13 \n", - "4 1 17 1 15 \n", - "\n", - " click_referrer_type rank click_cnts category_id created_at_ts \\\n", - "0 2 19 19 281 1506912747000 \n", - "1 2 18 19 281 1506942089000 \n", - "2 2 5 5 281 1506912747000 \n", - "3 2 4 5 327 1506938401000 \n", - "4 5 7 7 301 1500895686000 \n", - "\n", - " words_count \n", - "0 259 \n", - "1 173 \n", - "2 259 \n", - "3 219 \n", - "4 256 " - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tst_click = tst_click.merge(item_df, how='left', on=['click_article_id'])\n", - "tst_click.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
count518010.000000518010.0000005.180100e+05518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.0000005.180100e+05518010.000000
mean227342.428169193803.7925501.507387e+123.9473001.73828513.6284671.34820918.2502501.81961415.52178530.043586305.3249611.506883e+12210.966331
std14613.90718888279.3881773.706127e+080.3239161.0208586.6255641.7035247.0607981.08265733.95770256.868021110.4115135.816668e+0983.040065
min200000.000000137.0000001.506959e+121.0000001.0000002.0000001.0000001.0000001.0000001.0000001.0000001.0000001.265812e+120.000000
25%214926.000000128551.0000001.507026e+124.0000001.00000012.0000001.00000013.0000001.0000004.00000010.000000252.0000001.506970e+12176.000000
50%229109.000000199197.0000001.507308e+124.0000001.00000017.0000001.00000021.0000002.0000008.00000019.000000323.0000001.507249e+12199.000000
75%240182.000000272143.0000001.507666e+124.0000003.00000017.0000001.00000025.0000002.00000018.00000035.000000399.0000001.507630e+12232.000000
max249999.000000364043.0000001.508832e+124.0000005.00000020.00000011.00000028.0000007.000000938.000000938.000000460.0000001.509949e+123082.000000
\n", - "
" + "source": [ + "#训练集中的用户数量为20w\n", + "trn_click.user_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T16:03:01.378461Z", + "start_time": "2020-11-13T16:03:01.300712Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " user_id click_article_id click_timestamp click_environment \\\n", - "count 518010.000000 518010.000000 5.180100e+05 518010.000000 \n", - "mean 227342.428169 193803.792550 1.507387e+12 3.947300 \n", - "std 14613.907188 88279.388177 3.706127e+08 0.323916 \n", - "min 200000.000000 137.000000 1.506959e+12 1.000000 \n", - "25% 214926.000000 128551.000000 1.507026e+12 4.000000 \n", - "50% 229109.000000 199197.000000 1.507308e+12 4.000000 \n", - "75% 240182.000000 272143.000000 1.507666e+12 4.000000 \n", - "max 249999.000000 364043.000000 1.508832e+12 4.000000 \n", - "\n", - " click_deviceGroup click_os click_country click_region \\\n", - "count 518010.000000 518010.000000 518010.000000 518010.000000 \n", - "mean 1.738285 13.628467 1.348209 18.250250 \n", - "std 1.020858 6.625564 1.703524 7.060798 \n", - "min 1.000000 2.000000 1.000000 1.000000 \n", - "25% 1.000000 12.000000 1.000000 13.000000 \n", - "50% 1.000000 17.000000 1.000000 21.000000 \n", - "75% 3.000000 17.000000 1.000000 25.000000 \n", - "max 5.000000 20.000000 11.000000 28.000000 \n", - "\n", - " click_referrer_type rank click_cnts category_id \\\n", - "count 518010.000000 518010.000000 518010.000000 518010.000000 \n", - "mean 1.819614 15.521785 30.043586 305.324961 \n", - "std 1.082657 33.957702 56.868021 110.411513 \n", - "min 1.000000 1.000000 1.000000 1.000000 \n", - "25% 1.000000 4.000000 10.000000 252.000000 \n", - "50% 2.000000 8.000000 19.000000 323.000000 \n", - "75% 2.000000 18.000000 35.000000 399.000000 \n", - "max 7.000000 938.000000 938.000000 460.000000 \n", - "\n", - 
" created_at_ts words_count \n", - "count 5.180100e+05 518010.000000 \n", - "mean 1.506883e+12 210.966331 \n", - "std 5.816668e+09 83.040065 \n", - "min 1.265812e+12 0.000000 \n", - "25% 1.506970e+12 176.000000 \n", - "50% 1.507249e+12 199.000000 \n", - "75% 1.507630e+12 232.000000 \n", - "max 1.509949e+12 3082.000000 " - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tst_click.describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "我们可以看出训练集和测试集的用户是完全不一样的\n", - "\n", - "训练集的用户ID由0 ~ 199999,而测试集A的用户ID由200000 ~ 249999。\n", - "\n", - "因此,也就是我们在训练时,需要把测试集的数据也包括在内,称为全量数据。\n", - "\n", - "!!!!!!!!!!!!!!!后续将对训练集和测试集合并分析!!!!!!!!!!!" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "50000" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#测试集中的用户数量为5w\n", - "tst_click.user_id.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:56:07.717463Z", - "start_time": "2020-11-13T15:56:07.693494Z" - } - }, - "outputs": [ + "source": [ + "trn_click.groupby('user_id')['click_article_id'].count().min() # 训练集里面每个用户至少点击了两篇文章" + ] + }, { - "data": { - "text/plain": [ - "1" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 画直方图大体看一下基本的属性分布" ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tst_click.groupby('user_id')['click_article_id'].count().min() # 注意测试集里面有只点击过一次文章的用户" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 新闻文章信息数据表" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:20:34.183761Z", - "start_time": "2020-11-13T15:20:34.164770Z" - } - }, - "outputs": [ - { - "data": { - 
"text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
click_article_idcategory_idcreated_at_tswords_count
0001513144419000168
1111405341936000189
2211408667706000250
3311408468313000230
4411407071171000162
3640423640424601434034118000144
3640433640434601434148472000463
3640443640444601457974279000177
3640453640454601515964737000126
3640463640464601505811330000479
\n", - "
" + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "findfont: Font family ['SimHei'] not found. Falling back to DejaVu Sans.\n", + "findfont: Font family ['SimHei'] not found. Falling back to DejaVu Sans.\n" + ] + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " click_article_id category_id created_at_ts words_count\n", - "0 0 0 1513144419000 168\n", - "1 1 1 1405341936000 189\n", - "2 2 1 1408667706000 250\n", - "3 3 1 1408468313000 230\n", - "4 4 1 1407071171000 162\n", - "364042 364042 460 1434034118000 144\n", - "364043 364043 460 1434148472000 463\n", - "364044 364044 460 1457974279000 177\n", - "364045 364045 460 1515964737000 126\n", - "364046 364046 460 1505811330000 479" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#新闻文章数据集浏览\n", - "item_df.head().append(item_df.tail())" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:28:13.084501Z", - "start_time": "2020-11-13T15:28:13.062561Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "176 3485\n", - "182 3480\n", - "179 3463\n", - "178 3458\n", - "174 3456\n", - "183 3432\n", - "184 3427\n", - "173 3414\n", - "180 3403\n", - "177 3391\n", - "170 3387\n", - "187 3355\n", - "169 3352\n", - "185 3348\n", - "175 3346\n", - "181 3330\n", - "186 3328\n", - "189 3327\n", - "171 3327\n", - "172 3322\n", - "165 3308\n", - "188 3288\n", - "167 3269\n", - "190 3261\n", - "192 3257\n", - "168 3248\n", - "193 3225\n", - "166 3199\n", - "191 3182\n", - "194 3164\n", - " ... 
\n", - "601 1\n", - "857 1\n", - "1977 1\n", - "1626 1\n", - "697 1\n", - "1720 1\n", - "696 1\n", - "706 1\n", - "592 1\n", - "1605 1\n", - "586 1\n", - "582 1\n", - "1606 1\n", - "972 1\n", - "716 1\n", - "584 1\n", - "1608 1\n", - "715 1\n", - "841 1\n", - "968 1\n", - "964 1\n", - "587 1\n", - "1099 1\n", - "1355 1\n", - "711 1\n", - "845 1\n", - "710 1\n", - "965 1\n", - "847 1\n", - "1535 1\n", - "Name: words_count, Length: 866, dtype: int64" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "item_df['words_count'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:28:59.029535Z", - "start_time": "2020-11-13T15:28:58.816106Z" - } - }, - "outputs": [ + "source": [ + "plt.figure()\n", + "plt.figure(figsize=(15, 20))\n", + "i = 1\n", + "for col in ['click_article_id', 'click_timestamp', 'click_environment', 'click_deviceGroup', 'click_os', 'click_country', \n", + " 'click_region', 'click_referrer_type', 'rank', 'click_cnts']:\n", + " plot_envs = plt.subplot(5, 2, i)\n", + " i += 1\n", + " v = trn_click[col].value_counts().reset_index()[:10]\n", + " fig = sns.barplot(x=v['index'], y=v[col])\n", + " for item in fig.get_xticklabels():\n", + " item.set_rotation(90)\n", + " plt.title(col)\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "461\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "注:此处click_cnts直方图表示的是每篇文章对应用户的点击次数累计图\n", + "\n", + "也可以以用户角度分析,画出每个用户点击文章次数的直方图" + ] }, { - "data": { - "text/plain": [ - "" + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4 1084627\n", + "2 25894\n", + "1 2102\n", + "Name: click_environment, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"trn_click['click_environment'].value_counts()" ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从点击环境click_environment来看,仅有2102次(占0.19%)点击环境为1;仅有25894次(占2.3%)点击环境为2;剩余(占97.6%)点击环境为4。" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "print(item_df['category_id'].nunique()) # 461个文章主题\n", - "item_df['category_id'].hist()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(364047, 4)" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "item_df.shape # 364047篇文章" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 新闻文章embedding向量表示" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
article_idemb_0emb_1emb_2emb_3emb_4emb_5emb_6emb_7emb_8...emb_240emb_241emb_242emb_243emb_244emb_245emb_246emb_247emb_248emb_249
00-0.161183-0.957233-0.1379440.0508550.8300550.901365-0.335148-0.559561-0.500603...0.3212480.3139990.6364120.1691790.540524-0.8131820.286870-0.2316860.5974160.409623
11-0.523216-0.9740580.7386080.1552340.6262940.485297-0.715657-0.897996-0.359747...-0.4878430.8231240.412688-0.3386540.3207860.588643-0.5941370.1828280.397090-0.834364
22-0.619619-0.972960-0.207360-0.1288610.044748-0.387535-0.730477-0.066126-0.754899...0.4547560.4731840.377866-0.863887-0.3833650.137721-0.810877-0.4475800.805932-0.285284
33-0.740843-0.9757490.3916980.641738-0.2686450.191745-0.825593-0.710591-0.040099...0.2715350.0360400.480029-0.7631730.0226270.565165-0.910286-0.5378380.243541-0.885329
44-0.279052-0.9723150.6853740.1130560.2383150.271913-0.5688160.341194-0.600554...0.2382860.8092680.427521-0.615932-0.5036970.614450-0.917760-0.4240610.185484-0.580292
\n", - "

5 rows × 251 columns

\n", - "
" + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 678187\n", + "3 395558\n", + "4 38731\n", + "5 141\n", + "2 6\n", + "Name: click_deviceGroup, dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " article_id emb_0 emb_1 emb_2 emb_3 emb_4 emb_5 \\\n", - "0 0 -0.161183 -0.957233 -0.137944 0.050855 0.830055 0.901365 \n", - "1 1 -0.523216 -0.974058 0.738608 0.155234 0.626294 0.485297 \n", - "2 2 -0.619619 -0.972960 -0.207360 -0.128861 0.044748 -0.387535 \n", - "3 3 -0.740843 -0.975749 0.391698 0.641738 -0.268645 0.191745 \n", - "4 4 -0.279052 -0.972315 0.685374 0.113056 0.238315 0.271913 \n", - "\n", - " emb_6 emb_7 emb_8 ... emb_240 emb_241 emb_242 \\\n", - "0 -0.335148 -0.559561 -0.500603 ... 0.321248 0.313999 0.636412 \n", - "1 -0.715657 -0.897996 -0.359747 ... -0.487843 0.823124 0.412688 \n", - "2 -0.730477 -0.066126 -0.754899 ... 0.454756 0.473184 0.377866 \n", - "3 -0.825593 -0.710591 -0.040099 ... 0.271535 0.036040 0.480029 \n", - "4 -0.568816 0.341194 -0.600554 ... 
0.238286 0.809268 0.427521 \n", - "\n", - " emb_243 emb_244 emb_245 emb_246 emb_247 emb_248 emb_249 \n", - "0 0.169179 0.540524 -0.813182 0.286870 -0.231686 0.597416 0.409623 \n", - "1 -0.338654 0.320786 0.588643 -0.594137 0.182828 0.397090 -0.834364 \n", - "2 -0.863887 -0.383365 0.137721 -0.810877 -0.447580 0.805932 -0.285284 \n", - "3 -0.763173 0.022627 0.565165 -0.910286 -0.537838 0.243541 -0.885329 \n", - "4 -0.615932 -0.503697 0.614450 -0.917760 -0.424061 0.185484 -0.580292 \n", - "\n", - "[5 rows x 251 columns]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "item_emb_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(295141, 251)" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "item_emb_df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 数据分析" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户重复点击" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:30:20.899771Z", - "start_time": "2020-11-13T15:30:20.750817Z" - } - }, - "outputs": [], - "source": [ - "#####merge\n", - "user_click_merge = trn_click.append(tst_click)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:30:26.290038Z", - "start_time": "2020-11-13T15:30:25.339579Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idcount
00307601
101575071
21637461
312891971
42361621
521684011
63361621
73506441
84398941
94425671
\n", - "
" + "source": [ + "trn_click['click_deviceGroup'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从点击设备组click_deviceGroup来看,设备1占大部分(61%),设备3占36%。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 测试集用户点击日志" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
024999916097415069591428204117113219192811506912747000259
124999916041715069591728204117113218192811506942089000173
2249998160974150695905606641121132552811506912747000259
3249998202557150695908606641121132453271506938401000219
4249997183665150695908861341171155773011500895686000256
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id click_timestamp click_environment \\\n", + "0 249999 160974 1506959142820 4 \n", + "1 249999 160417 1506959172820 4 \n", + "2 249998 160974 1506959056066 4 \n", + "3 249998 202557 1506959086066 4 \n", + "4 249997 183665 1506959088613 4 \n", + "\n", + " click_deviceGroup click_os click_country click_region \\\n", + "0 1 17 1 13 \n", + "1 1 17 1 13 \n", + "2 1 12 1 13 \n", + "3 1 12 1 13 \n", + "4 1 17 1 15 \n", + "\n", + " click_referrer_type rank click_cnts category_id created_at_ts \\\n", + "0 2 19 19 281 1506912747000 \n", + "1 2 18 19 281 1506942089000 \n", + "2 2 5 5 281 1506912747000 \n", + "3 2 4 5 327 1506938401000 \n", + "4 5 7 7 301 1500895686000 \n", + "\n", + " words_count \n", + "0 259 \n", + "1 173 \n", + "2 259 \n", + "3 219 \n", + "4 256 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " user_id click_article_id count\n", - "0 0 30760 1\n", - "1 0 157507 1\n", - "2 1 63746 1\n", - "3 1 289197 1\n", - "4 2 36162 1\n", - "5 2 168401 1\n", - "6 3 36162 1\n", - "7 3 50644 1\n", - "8 4 39894 1\n", - "9 4 42567 1" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#用户重复点击\n", - "user_click_count = user_click_merge.groupby(['user_id', 'click_article_id'])['click_timestamp'].agg({'count'}).reset_index()\n", - "user_click_count[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:34:27.418638Z", - "start_time": "2020-11-13T15:34:27.372761Z" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idcount
311242862957425410
311243862957626810
39376110323720594810
39376310323723568910
5769021348506946313
\n", - "
" + "source": [ + "tst_click = tst_click.merge(item_df, how='left', on=['click_article_id'])\n", + "tst_click.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
count518010.000000518010.0000005.180100e+05518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.000000518010.0000005.180100e+05518010.000000
mean227342.428169193803.7925501.507387e+123.9473001.73828513.6284671.34820918.2502501.81961415.52178530.043586305.3249611.506883e+12210.966331
std14613.90718888279.3881773.706127e+080.3239161.0208586.6255641.7035247.0607981.08265733.95770256.868021110.4115135.816668e+0983.040065
min200000.000000137.0000001.506959e+121.0000001.0000002.0000001.0000001.0000001.0000001.0000001.0000001.0000001.265812e+120.000000
25%214926.000000128551.0000001.507026e+124.0000001.00000012.0000001.00000013.0000001.0000004.00000010.000000252.0000001.506970e+12176.000000
50%229109.000000199197.0000001.507308e+124.0000001.00000017.0000001.00000021.0000002.0000008.00000019.000000323.0000001.507249e+12199.000000
75%240182.000000272143.0000001.507666e+124.0000003.00000017.0000001.00000025.0000002.00000018.00000035.000000399.0000001.507630e+12232.000000
max249999.000000364043.0000001.508832e+124.0000005.00000020.00000011.00000028.0000007.000000938.000000938.000000460.0000001.509949e+123082.000000
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id click_timestamp click_environment \\\n", + "count 518010.000000 518010.000000 5.180100e+05 518010.000000 \n", + "mean 227342.428169 193803.792550 1.507387e+12 3.947300 \n", + "std 14613.907188 88279.388177 3.706127e+08 0.323916 \n", + "min 200000.000000 137.000000 1.506959e+12 1.000000 \n", + "25% 214926.000000 128551.000000 1.507026e+12 4.000000 \n", + "50% 229109.000000 199197.000000 1.507308e+12 4.000000 \n", + "75% 240182.000000 272143.000000 1.507666e+12 4.000000 \n", + "max 249999.000000 364043.000000 1.508832e+12 4.000000 \n", + "\n", + " click_deviceGroup click_os click_country click_region \\\n", + "count 518010.000000 518010.000000 518010.000000 518010.000000 \n", + "mean 1.738285 13.628467 1.348209 18.250250 \n", + "std 1.020858 6.625564 1.703524 7.060798 \n", + "min 1.000000 2.000000 1.000000 1.000000 \n", + "25% 1.000000 12.000000 1.000000 13.000000 \n", + "50% 1.000000 17.000000 1.000000 21.000000 \n", + "75% 3.000000 17.000000 1.000000 25.000000 \n", + "max 5.000000 20.000000 11.000000 28.000000 \n", + "\n", + " click_referrer_type rank click_cnts category_id \\\n", + "count 518010.000000 518010.000000 518010.000000 518010.000000 \n", + "mean 1.819614 15.521785 30.043586 305.324961 \n", + "std 1.082657 33.957702 56.868021 110.411513 \n", + "min 1.000000 1.000000 1.000000 1.000000 \n", + "25% 1.000000 4.000000 10.000000 252.000000 \n", + "50% 2.000000 8.000000 19.000000 323.000000 \n", + "75% 2.000000 18.000000 35.000000 399.000000 \n", + "max 7.000000 938.000000 938.000000 460.000000 \n", + "\n", + " created_at_ts words_count \n", + "count 5.180100e+05 518010.000000 \n", + "mean 1.506883e+12 210.966331 \n", + "std 5.816668e+09 83.040065 \n", + "min 1.265812e+12 0.000000 \n", + "25% 1.506970e+12 176.000000 \n", + "50% 1.507249e+12 199.000000 \n", + "75% 1.507630e+12 232.000000 \n", + "max 1.509949e+12 3082.000000 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": 
"execute_result" + } ], - "text/plain": [ - " user_id click_article_id count\n", - "311242 86295 74254 10\n", - "311243 86295 76268 10\n", - "393761 103237 205948 10\n", - "393763 103237 235689 10\n", - "576902 134850 69463 13" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "user_click_count[user_click_count['count']>7]" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:32:53.298575Z", - "start_time": "2020-11-13T15:32:53.285611Z" - } - }, - "outputs": [ + "source": [ + "tst_click.describe()" + ] + }, { - "data": { - "text/plain": [ - "array([ 1, 2, 4, 3, 6, 5, 10, 7, 13])" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "我们可以看出训练集和测试集的用户是完全不一样的\n", + "\n", + "训练集的用户ID由0 ~ 199999,而测试集A的用户ID由200000 ~ 249999。\n", + "\n", + "因此,也就是我们在训练时,需要把测试集的数据也包括在内,称为全量数据。\n", + "\n", + "!!!!!!!!!!!!!!!后续将对训练集和测试集合并分析!!!!!!!!!!!" ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "user_click_count['count'].unique()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1 1605541\n", - "2 11621\n", - "3 422\n", - "4 77\n", - "5 26\n", - "6 12\n", - "10 4\n", - "7 3\n", - "13 1\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#用户点击新闻次数\n", - "user_click_count.loc[:,'count'].value_counts() " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "###### 可以看出:有1605541(约占99.2%)的用户未重复阅读过文章,仅有极少数用户重复点击过某篇文章。 这个也可以单独制作成特征" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户点击环境变化分析" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:39:41.961797Z", - "start_time": 
"2020-11-13T15:39:41.949829Z" - } - }, - "outputs": [], - "source": [ - "def plot_envs(df, cols, r, c):\n", - " plt.figure()\n", - " plt.figure(figsize=(10, 5))\n", - " i = 1\n", - " for col in cols:\n", - " plt.subplot(r, c, i)\n", - " i += 1\n", - " v = df[col].value_counts().reset_index()\n", - " fig = sns.barplot(x=v['index'], y=v[col])\n", - " for item in fig.get_xticklabels():\n", - " item.set_rotation(90)\n", - " plt.title(col)\n", - " plt.tight_layout()\n", - " plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:39:55.476626Z", - "start_time": "2020-11-13T15:39:48.764592Z" - } - }, - "outputs": [ + }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "50000" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#测试集中的用户数量为5w\n", + "tst_click.user_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:56:07.717463Z", + "start_time": "2020-11-13T15:56:07.693494Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tst_click.groupby('user_id')['click_article_id'].count().min() # 注意测试集里面有只点击过一次文章的用户" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 新闻文章信息数据表" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:20:34.183761Z", + "start_time": "2020-11-13T15:20:34.164770Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
click_article_idcategory_idcreated_at_tswords_count
0001513144419000168
1111405341936000189
2211408667706000250
3311408468313000230
4411407071171000162
3640423640424601434034118000144
3640433640434601434148472000463
3640443640444601457974279000177
3640453640454601515964737000126
3640463640464601505811330000479
\n", + "
" + ], + "text/plain": [ + " click_article_id category_id created_at_ts words_count\n", + "0 0 0 1513144419000 168\n", + "1 1 1 1405341936000 189\n", + "2 2 1 1408667706000 250\n", + "3 3 1 1408468313000 230\n", + "4 4 1 1407071171000 162\n", + "364042 364042 460 1434034118000 144\n", + "364043 364043 460 1434148472000 463\n", + "364044 364044 460 1457974279000 177\n", + "364045 364045 460 1515964737000 126\n", + "364046 364046 460 1505811330000 479" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#新闻文章数据集浏览\n", + "item_df.head().append(item_df.tail())" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:28:13.084501Z", + "start_time": "2020-11-13T15:28:13.062561Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "176 3485\n", + "182 3480\n", + "179 3463\n", + "178 3458\n", + "174 3456\n", + "183 3432\n", + "184 3427\n", + "173 3414\n", + "180 3403\n", + "177 3391\n", + "170 3387\n", + "187 3355\n", + "169 3352\n", + "185 3348\n", + "175 3346\n", + "181 3330\n", + "186 3328\n", + "189 3327\n", + "171 3327\n", + "172 3322\n", + "165 3308\n", + "188 3288\n", + "167 3269\n", + "190 3261\n", + "192 3257\n", + "168 3248\n", + "193 3225\n", + "166 3199\n", + "191 3182\n", + "194 3164\n", + " ... \n", + "601 1\n", + "857 1\n", + "1977 1\n", + "1626 1\n", + "697 1\n", + "1720 1\n", + "696 1\n", + "706 1\n", + "592 1\n", + "1605 1\n", + "586 1\n", + "582 1\n", + "1606 1\n", + "972 1\n", + "716 1\n", + "584 1\n", + "1608 1\n", + "715 1\n", + "841 1\n", + "968 1\n", + "964 1\n", + "587 1\n", + "1099 1\n", + "1355 1\n", + "711 1\n", + "845 1\n", + "710 1\n", + "965 1\n", + "847 1\n", + "1535 1\n", + "Name: words_count, Length: 866, dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_df['words_count'].value_counts()" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:28:59.029535Z", + "start_time": "2020-11-13T15:28:58.816106Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "461\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "print(item_df['category_id'].nunique()) # 461个文章主题\n", + "item_df['category_id'].hist()" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(364047, 4)" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_df.shape # 364047篇文章" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 新闻文章embedding向量表示" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
article_idemb_0emb_1emb_2emb_3emb_4emb_5emb_6emb_7emb_8...emb_240emb_241emb_242emb_243emb_244emb_245emb_246emb_247emb_248emb_249
00-0.161183-0.957233-0.1379440.0508550.8300550.901365-0.335148-0.559561-0.500603...0.3212480.3139990.6364120.1691790.540524-0.8131820.286870-0.2316860.5974160.409623
11-0.523216-0.9740580.7386080.1552340.6262940.485297-0.715657-0.897996-0.359747...-0.4878430.8231240.412688-0.3386540.3207860.588643-0.5941370.1828280.397090-0.834364
22-0.619619-0.972960-0.207360-0.1288610.044748-0.387535-0.730477-0.066126-0.754899...0.4547560.4731840.377866-0.863887-0.3833650.137721-0.810877-0.4475800.805932-0.285284
33-0.740843-0.9757490.3916980.641738-0.2686450.191745-0.825593-0.710591-0.040099...0.2715350.0360400.480029-0.7631730.0226270.565165-0.910286-0.5378380.243541-0.885329
44-0.279052-0.9723150.6853740.1130560.2383150.271913-0.5688160.341194-0.600554...0.2382860.8092680.427521-0.615932-0.5036970.614450-0.917760-0.4240610.185484-0.580292
\n", + "

5 rows × 251 columns

\n", + "
" + ], + "text/plain": [ + " article_id emb_0 emb_1 emb_2 emb_3 emb_4 emb_5 \\\n", + "0 0 -0.161183 -0.957233 -0.137944 0.050855 0.830055 0.901365 \n", + "1 1 -0.523216 -0.974058 0.738608 0.155234 0.626294 0.485297 \n", + "2 2 -0.619619 -0.972960 -0.207360 -0.128861 0.044748 -0.387535 \n", + "3 3 -0.740843 -0.975749 0.391698 0.641738 -0.268645 0.191745 \n", + "4 4 -0.279052 -0.972315 0.685374 0.113056 0.238315 0.271913 \n", + "\n", + " emb_6 emb_7 emb_8 ... emb_240 emb_241 emb_242 \\\n", + "0 -0.335148 -0.559561 -0.500603 ... 0.321248 0.313999 0.636412 \n", + "1 -0.715657 -0.897996 -0.359747 ... -0.487843 0.823124 0.412688 \n", + "2 -0.730477 -0.066126 -0.754899 ... 0.454756 0.473184 0.377866 \n", + "3 -0.825593 -0.710591 -0.040099 ... 0.271535 0.036040 0.480029 \n", + "4 -0.568816 0.341194 -0.600554 ... 0.238286 0.809268 0.427521 \n", + "\n", + " emb_243 emb_244 emb_245 emb_246 emb_247 emb_248 emb_249 \n", + "0 0.169179 0.540524 -0.813182 0.286870 -0.231686 0.597416 0.409623 \n", + "1 -0.338654 0.320786 0.588643 -0.594137 0.182828 0.397090 -0.834364 \n", + "2 -0.863887 -0.383365 0.137721 -0.810877 -0.447580 0.805932 -0.285284 \n", + "3 -0.763173 0.022627 0.565165 -0.910286 -0.537838 0.243541 -0.885329 \n", + "4 -0.615932 -0.503697 0.614450 -0.917760 -0.424061 0.185484 -0.580292 \n", + "\n", + "[5 rows x 251 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_emb_df.head()" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(295141, 251)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_emb_df.shape" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据分析" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户重复点击" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:30:20.899771Z", + "start_time": "2020-11-13T15:30:20.750817Z" + } + }, + "outputs": [], + "source": [ + "#####merge\n", + "user_click_merge = trn_click.append(tst_click)" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:30:26.290038Z", + "start_time": "2020-11-13T15:30:25.339579Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idcount
00307601
101575071
21637461
312891971
42361621
521684011
63361621
73506441
84398941
94425671
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id count\n", + "0 0 30760 1\n", + "1 0 157507 1\n", + "2 1 63746 1\n", + "3 1 289197 1\n", + "4 2 36162 1\n", + "5 2 168401 1\n", + "6 3 36162 1\n", + "7 3 50644 1\n", + "8 4 39894 1\n", + "9 4 42567 1" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#用户重复点击\n", + "user_click_count = user_click_merge.groupby(['user_id', 'click_article_id'])['click_timestamp'].agg({'count'}).reset_index()\n", + "user_click_count[:10]" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAs8AAAFgCAYAAABE0JQRAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAABDJUlEQVR4nO3de9xtY73//9fbyplEdMA6SCJKZEWnvUlyKIdSQtqhg713FLt0UP0QZXfSQakoy6GUU/ruVZREdiGxSIr2ynJqWZTzWWJ5//4Y181Yc837vudc6573mHPd7+fjMR73HNcY15if++azxmeOeY1ryDYRERERETG6pZoOICIiIiJiUKR4joiIiIjoUIrniIiIiIgOpXiOiIiIiOhQiueIiIiIiA6leI6IiIiI6FCK54iILknaR9LFtfWHJL1glD7TJFnSMxbzvW+WtM3iHKMcZ9SYIyJiYSme+0CTJ+KxkhNxTGS2V7J9Y9NxdGOsYpb0Bkm/kvSgpLslXS3pY5KWG4s4IwbdknCOjwWleO5DE/lE3CRJW0m6tek4IgaFpN2As4AfAFNtPxvYHVgbmDxMnxQDMaEtCefLiS7Fc/ScpElNxxCxqCRNlnS2pDvLldVvtNnHkl5YXi8v6WhJt0i6X9LFkpZv0+etZQjGS0Z5/38rx7pb0idbti0l6eOSbijbz5C0Wtn2M0kHtOz/B0m7dhOzpFdKulTSfaX/VqVdwJeBI2x/x/Y9ALZn2/6A7evLfodLOkvS9yU9AOwjaU1JMyXdI2mOpPfVYjxJ0mdq6wt8qC1/s0MkXSfpXkkn5ip3RIynFM/jrA9OxG1PhGXbRZKOlHRJ+Qr2F5JWL9u6ORGfJOlbks6V9DDwOkkvLse/T9K1knauHeckScdKOqe87+8krdvy93i/pOvL9iMlrVt+jwdKwbBMbf8dVX11fF/ZZ+PatpslHSzpmvL3PF3ScpJWBH4GrKnqK7WHJK050t8ylnyqPvj9FLgFmAasBZw2SrcvAZsBrwZWAz4KPNly3H2BzwPb2P7TCO+/IfAt4N+ANYFnU13VHfIB4M3AlmX7vcCxZdsPgT1bjjUVOKfTmCWtVfb/TGk/GPiRpDWA9UssPxrhbzFkF6or1M8CTqX6G95aYn4bcJSkrTs4zpC9gO2AdYEXAZ/qom9Ez/TBOX7nco69r5xzX1zb9jFJ88p5dLak14/F7zwh2c4yTgswCfgD8BVgRWA54LXAPsDFtf0MvLC8Pha4iOqkPYnq5LYs1YncwDOAfYE5Q31GeP+1gLuBN1J9cHpDWV+jbL8IuIHqZLR8Wf9c2fYu4JLasTYE7gOWbRPzScD9wGvK+6xc4vsEsAywNfAgsH5t/7uBzcvvcy
pwWsvf43+AZwIbAY8BFwAvAFYBrgP2LvtuCtwBbFH+XnsDN9fivBm4nOqkvRrwZ+A/yratgFub/v8kS/8swKuAO4FntLS3zdny//ujwMvaHGsoZw8u/8+u3cH7H9qSCysC/6Qquin//76+tv35wOMlj1YGHqYaTgHwWWBGlzF/DPheS9t5Ja9eW46xXG3baeXfhUeAfytthwO/ru0zGZgPrFxr+2/gpPL6JOAztW0L5GXJ4f+orb8RuKHp/1eyZKH5c/yLSs6/AVia6kPwHKrz7vrAXGDNsu80YN2m/2aDuuTK8/janKpo+4jth23/w/bFw+0saSng3cCBtufZnm/7UtuP1XY7CPgIsJXtOaO8/zuBc22fa/tJ2+cDs6hOPkNOtP0X248CZwCblPYfA5tImlrW9wLOboml7n9sX2L7yXKMlagK8X/avpDqat6etf1/bPty209QFc+btBzvC7YfsH0t8CfgF7ZvtH0/1RXjTct++wHH2f5d+XudTFVsv7J2rGNs3+bqa+aftHmviCGTgVvK/5edWJ3qhHnDCPt8BDjWdifj69ekOuEBYPthqg+aQ6YCPy5Xme6jKqbnA8+1/SDVVeM9yr57UuVWNzFPBXYbOn55j9dSFelDcTy/Ft8etp8FXEVVCAyZW3u9JnBPiW/ILVTFQ6fqx7ulHDOiaU2f43cHzrF9vu3Hqb5RWp6qIJ9PVZRvKGlp2zfbHunfqRhBiufx1fSJeKQT4ZC/1V4/QlX00sWJeEjryXJuKaSHtJ4s275vzd9rrx9tsz60/1Tgwy2/42QWPLmO9l4RQ+YCU9T5TW53Af+gGk4wnG2BT0l6awfHu53ajXeSVqAaulGPbwfbz6oty9meV7b/ENhT0quo/i35VZcxz6W68lw//oq2PwfMBuYBu3bwe7j2+jZgNUkr19qmlGNBdeVshdq257U5Xv1mxCnlmBFNa/ocvybVuRWAcs6dC6xVCu+DqL4JukPSaRmauOhSPI+vpk/EI50IO9HJiXhI68lycvmUPaR+shxLc4HPtvyOK9j+YQd9PfouMcFcTlXAfk7SimV8/GuG27mcrGYAX1Z1U9wkSa+StGxtt2uB7YFj62P/h3EWsKOk15Zx/Uew4L/b3wY+O/SNkKQ1JO1S234u1QfKI4DTWz7AdhLz94GdJG1X2pdTdQPf2qXfh4HDJL1P0qqqrAc8d4S/0VzgUuC/y/E2Bt5T3gvgauCNklaT9DyqE36r/SWtrermyE8Cp4/0R4wYJ02f42+jynfgqZt6J1POtbZ/YPu1ZR9T3XcRiyDF8/hq+kQ87Imww/hHPREP43dUV3g/KmlpVTcp7sToN14tiu8A/yFpi3IiX1HSm1qucg3n78CzJa3Sg7hiANmeT/X/6guBv1Ld5Lb7KN0OBv4IXAHcQ3WCWuDfWtt/AHYEviNphxHe/1pgf6qp4G6nuiGwfgXqa8BM4BeSHgQuoxrvP9T/MeBsYJtyjK5iLoXuLlT3K9xJVRx8ZOj3sX068HaqIWFzqYqBM4DjgTNHeL89qcZc3kY1JOww278s275HNW70ZuAXtC+Mf1C23Uh11e4zbfaJGG9Nn+PPAN4k6fWSlqb6cPsYcKmk9SVtXY79D6pvbDs9h0erpgZbT9SF6orr/6MaL3gXcAwj30ywPPBVqk+O9wO/Lm3Tyn7PKPtNpyr+dhjl/bcA/pfqBHkn1VCMKWXbRcB7a/suEFdpO6G87yta2ltvGPxMy/aNyvveT3Wz1Ftq2xbYn4VvEHrq2GX9YmCf2vpngO/W1renKgLuo/qH7EzKzUlUJ+RtavseDny/tj6j/Le5j3JjRZYsWfpnac3hLFn6aemDc/xbyjn2/nLO3ai0b0xV3D9Yzv8/zTlu0ReVP2pERETfk3Qz1Yf8X462b0REL2TYRkREgyTtpafnFq8v1zYdW0RELCxXnpcwkvYCjmuz6RbbG413PBERETE2co7vDymeIyIiIiI61Ol0KgNh9dVX97
Rp05oOI2JcXHnllXfZXqPpOBZHcjYmkkHP2eRrTCQj5esSVTxPmzaNWbNmNR1GxLiQdMvoe/W35GxMJIOes8nXmEhGytfcMBgRERER0aGeFs+SZki6Q9KfhtkuScdImiPpGkkvr23bW9L1Zdm7l3FGRCU5GzE4kq8Rzej1leeTqB5YMZwdgPXKsh/wLYDyyNXDqB7osTnV419X7WmkEQHJ2YhBchLJ14hx19Pi2favqZ5kM5xdgFNcuQx4lqTnA9sB59u+x/a9wPmM/A9ERIyB5GzE4Ei+RjSj6RsG1wLm1tZvLW3DtS9E0n5Un6iZMmXKmAW22UdOGbNjRQy58ovvajqExZWcjQllwHM2+RoTynjl68DfMGj7eNvTbU9fY42BnQEoYsJIzkYMjuRrxMKaLp7nAZNr62uXtuHaI6JZydmIwZF8jeiBpovnmcC7yh3BrwTut307cB6wraRVy00M25a2iGhWcjZicCRfI3qgp2OeJf0Q2ApYXdKtVHf3Lg1g+9vAucAbgTnAI8C+Zds9ko4EriiHOsL2SDdFRMQYSM5GDI7ka0Qzelo8295zlO0G9h9m2wxgRi/iioj2krMRgyP5GtGMpodtREREREQMjBTPEREREREdSvEcEREREdGhjotnSd/rpC0i+oekZ0pauek4IiIilhTdXHneqL4iaRKw2diGExFjQdIrJP0RuAb4k6Q/SEq+RkRELKZRi2dJh0h6ENhY0gNleRC4A/ifnkcYEYviBOD9tqfZnkp1x/2JDccUEREx8EYtnm3/t+2VgS/afmZZVrb9bNuHjEOMEdG9+bZ/M7Ri+2LgiQbjiYiIWCJ0PM+z7UMkrQVMrfez/eteBBYRi+V/JR0H/BAwsDtwkaSXA9i+qsngIiIiBlXHxbOkzwF7ANcB80uzgRTPEf3nZeXnYS3tm1Ll7dbjG05ERMSSoZsnDL4FWN/2Y70KJiLGhu3XNR1DRETEkqib4vlGYGkgxXNEn5N0aLt220eMdywRERFLkm6K50eAqyVdQK2Atv3BMY8qIhbXw7XXywE7An9uKJaIiIglRjfF88yyRESfs310fV3Sl4DzGgonIiJiidHNbBsnS1oemGJ7dg9jioixtwKwdtNBREREDLpuHs+9E3A18POyvomkXImO6EOS/ijpmrJcC8wGvtpwWBEREQOvm2EbhwObAxcB2L5a0gt6EFNELL4da6+fAP5uOw9JiYiIWEwdX3kGHrd9f0vbk2MZTESMDdu3AM8CdqKaZnLDRgOKiIhYQnRTPF8r6R3AJEnrSfo6cGmP4oqIxSDpQOBU4DllOVXSB5qNKiIiYvB1Uzx/ANiIapq6HwIPAAeN1knS9pJmS5oj6eNttn9F0tVl+Yuk+2rb5te2ZXx1ROfeA2xh+1DbhwKvBN43Wqfka8RgSc5GjL9uZtt4BPhkWToiaRJwLPAG4FbgCkkzbV9XO+5/1fb/ANXjg4c8anuTTt8vIp4iYH5tfX5pG75D8jVioCRnI5rRcfEsaTrwCWBavZ/tjUfotjkwx/aN5RinAbsA1w2z/57AYZ3GFBHDOhH4naQfl/U3AyeM0if5GjFYkrMRDehm2MapwEnAW6luQhpaRrIWMLe2fmtpW4ikqcA6wIW15uUkzZJ0maQ3D9Nvv7LPrDvvvLOT3yNiiSZpKeAyYF/gnrLsa/uro3Tteb6WvsnZiLGRc2xEA7qZqu5O270cE7UHcJbt+lfNU23PK1PiXSjpj7ZvqHeyfTxwPMD06dPdw/giBoLtJyUda3tT4Koevc0i5WuJLzkbMf5yjo0YI91ceT5M0ncl7Slp16FllD7zgMm19bVLWzt7UN2I+BTb88rPG6nml9504W4R0cYFkt4qacRxzi2SrxGDJTkb0YBuiud9gU2A7Xl6yMaOI3UArgDWk7SOpGWoknehq9eSNgBWBX5ba1tV0rLl9erAaxh+HFdELOjfgTOBxyQ9IOlBSQ+M0if5GjFYkrMRDehm2MYrbK
/fzcFtPyHpAOA8YBIww/a1ko4AZtWGgewBnGa7/pXQi4HjJD1JVeR/rn4HcUQMz/bKi9An+RoxQJKzEc3opni+VNKG3SaX7XOBc1vaDm1ZP7xNv0uBl3bzXhETXZm6annbD5X1VwLLlM2/t/3gSP2TrxGDJTkbMf66KZ5fCVwt6SaqB6UI8ChT1UXE+Po8cAfwhbL+Q+BPwHJUNw9+rKG4IiIilgjdFM/b9yyKiBgrrwdeUVu/z/ZO5cbB3zQUU0RExBKj4xsGbd9CNYfk44BrS0T0j6VsP1Fb/xhUXxEBKzUTUkR0qtzIl290I/pYN08Y/ADVk4n+DjxZmg0kySP6xzKSVh4a22z7FwCSVqEauhERfUbSRcDOVOfkK4E7JF1i+0ONBhYRbXUzVd2BwPq2N7L90rKkcI7oL98BTpc0ZaihPFnsh8B3G4sqIkayiu0HgF2BU2xvAWzTcEwRMYxuxjzPBe7vVSARsfhsf1nSI8DFklakurH3QappqL7VbHQRMYxnSHo+8Hbgk00HExEj66Z4vhG4SNI5VLNtANXJesyjiohFZvvbwLclrVzWR5yeLiIadwTVXM2X2L6iPC77+oZjiohhdFM8/7Usy/D0vLER0YckPRc4ClgT2EHShsCrbJ/QbGQR0cr2mVRPBB1avxF4a3MRRcRIOi6ebX8aQNJKZf2hXgUVEYvtJOBEnv4K+C/A6UCK54g+I2lt4OtUj8iGalrJA23f2lxUETGcjm8YlPQSSb8HrgWulXSlpI16F1pELIbVbZ9BmRmnTF83v9mQImIYJwIzqb4pWhP4SWmLiD7UzWwbxwMfsj3V9lTgw1R39kdE/3lY0rMpc7GXx3Tnht+I/rSG7RNtP1GWk4A1mg4qItrrZszzirZ/NbRi+6JyN39E9J8PUV3JWlfSJVQn4rc1G1JEDONuSe+kmlISYE/g7gbjiYgRdDXbhqT/D/heWX8n1QwcEdFnbF8laUtgfarp6mbbfrzhsCKivXdTjXn+CtW3RZcC+zYaUUQMq5thG++munp1NvAjYPXSFhF9RtL+wEq2r7X9J2AlSe9vOq6IWJjtW2zvbHsN28+x/Wbbfx3aLumQJuOLiAV1VDxLmgScbfuDtl9uezPbB9m+t8fxRcSieZ/t+4ZWSq6+r7lwImIx7NZ0ABHxtI6KZ9vzgSclrdLjeCJibEySpKGV8gE487NHDCaNvktEjJduxjw/BPxR0vnAw0ONtj845lFFxOL6OXC6pOPK+r+XtogYPG46gIh4WjfF89lliYj+9zGqgvk/y/r5wHebCyciFkOuPEf0kY5vGLR9crtltH6Stpc0W9IcSR9vs30fSXdKuros761t21vS9WXZu/NfK2Jis/2k7W/ZfltZjivDr0aUfI0Yf5JWa9O2Tm31zNbttf2SsxHjbNQrz5LOsP12SX+kzVdHtjceoe8k4FjgDcCtwBWSZtq+rmXX020f0NJ3NeAwYHp53ytL39ykGDGM5GvEQPqJpB1sPwAgaUPgDOAlALaPatcpORvRjE6GbRxYfu64CMffHJhj+0YASacBuwCtid3OdsD5tu8pfc8HtufpSeQjYmHJ14jBcxRVAf0mqrnZTwH26qBfcjaiAaMO27B9e3n5VuDxMh/lU8so3dcC5tbWby1trd4q6RpJZ0ma3E1fSftJmiVp1p133jnarxOxROv3fIXkbEQr2+dQPSDlF8BJwFtsX91B15xjIxrQzUNSVgbOl/QbSQdIeu4YxfATYFr5Ovl8YNRx1HW2j7c93fb0NdZYY4xCihh4fZmvkJyNGCLp65KOkXQMsDWwCnATcEBpGws5x0aMsW5uGPy07Y2A/YHnA/8r6ZejdJsHTK6tr13a6se92/ZjZfW7wGad9o2I9pKvEQNhFnBlbfkC1RN8h9ZHk5yNaEA3U9UNuQP4G3A38JxR9r0CWK/cNTwP2AN4R30HSc+vfdW8M/Dn8vo84ChJq5b1bYE8ojSiO8nXiD41NGOVpB
WBfwzNiFNuBFy2g0MkZyMa0HHxLOn9wNuBNaimzXlfmzt6F2D7CUkHUCXpJGCG7WslHQHMsj0T+KCknYEngHuAfUrfeyQdSfWPA8ARQzc2RMTIkq8RA+UCYBuqh5EBLE81/vnVI3VKzkY0o5srz5OBgzq8ieEpts8Fzm1pO7T2+hCG+bRrewYwo5v3iwgg+RoxSJazPVQ4Y/shSSt00jE5GzH+uhnzfAjV47nXlDRlaOlhbBGxiEq+riRpXwBJa7Q8dCEi+sfDkl4+tCJpM+DRBuOJiBF0M2zjAOBw4O/Ak6XZwLAPXYiIZkgaevjB+sCJwNLA94HXNBlXRLR1EHCmpNuoHsX9PGD3RiOKiGF1M2zjIGB923f3KJaIGDtvATYFrgKwfZuklZsNKSLasX2FpA2oPuwCzLb9eJMxRcTwuime5wL39yqQiBhT/7RtSYan7uaPiD4iaWvbF0ratWXTiyRh++xGAouIEXVTPN8IXCTpHGBozkhsf3nMo4qIxXWGpOOAZ0l6H/Bu4DsNxxQRC9oSuBDYqc02AymeI/pQN8XzX8uyTFkiok/Z/pKkNwAPUH0VfKjt8xsOKyJqbB9Wfu7bdCwR0bmOi2fbnwaQtILtR3oXUkSMhVIsp2CO6FOSPjTS9nyzG9Gfuplt41XACcBKwBRJLwP+3fb7exVcRHRH0oNUX/e2ZfuZ4xhORIxspJt4h83jiGhWN8M2vgpsB8wEsP0HSf/ai6AiYtHYXhmgPDnsduB7VFNf7QU8v8HQIqJF7Rvdk4EDbd9X1lcFjm4wtIgYQccPSQGwPbelaf4YxhIRY2dn29+0/aDtB2x/C9il6aAioq2NhwpnANv3Uk01GRF9qJviea6kVwOWtLSkg4E/9yiuiFg8D0vaS9IkSUtJ2gt4uOmgIqKtpcrVZgAkrUZ33wxHxDjqJjn/A/gasBYwD/gFsH8vgoqIxfYOqnz9GtXYyUtKW0T0n6OB30o6s6zvBny2wXgiYgTdzLZxF9W4ybYkHWL7v8ckqohYLLZvZoRhGsnXiP5h+xRJs4CtS9Outq9rMqaIGF5XY55HsdsYHisieiv5GtFHbF9n+xtlSeEc0cfGsnjWGB4rInor+RoREbEIxrJ4zpyUEYMj+RoREbEIcuU5YmJKvkZERCyCjovnMnVOa9s6tdUzW7dHRDOSrxEREb3RzZXnn0h66tG+kjYEfjK0bvuodp0kbS9ptqQ5kj7eZvuHJF0n6RpJF0iaWts2X9LVZZnZRawRE13yNWICSM5GjL9uiuejqE7IK0najOrK1TtH6iBpEnAssAOwIbBnOYnX/R6Ybntj4CzgC7Vtj9repCw7dxFrxESXfI1YwiVnI5rRzTzP50hamurhKCsDb7H9l1G6bQ7MsX0jgKTTqOaefWoaHtu/qu1/GaOc4CNidMnXiAkhORvRgFGLZ0lfZ8E781cBbgAOkITtD47QfS1gbm39VmCLEfZ/D/Cz2vpyZeL4J4DP2f5/beLbD9gPYMqUKSMcOmLJ1+/5WmJMzkaMjZxjIxrQyZXnWS3rV/YiEEnvBKYDW9aap9qeJ+kFwIWS/mj7hno/28cDxwNMnz4902/FRNfX+QrJ2Ygm5BwbMXZGLZ5tnwwgaUXgH7bnl/VJwLKjdJ8HTK6tr13aFiBpG+CTwJa2H6u997zy80ZJFwGbUl1Fi4g2kq8RE0pyNqIB3dwweAGwfG19eeCXo/S5AlhP0jqSlgH2ABa4o1fSpsBxwM6276i1rypp2fJ6deA11MZxRcSIkq8RS77kbEQDOr5hEFjO9kNDK7YfkrTCSB1sPyHpAOA8YBIww/a1ko4AZtmeCXwRWAk4UxLAX8tdvy8GjpP0JFWR/znbSeyIziRfI5ZwydmIZnRTPD8s6eW2rwIo0189Olon2+cC57a0HVp7vc0w/S4FXtpFfBHxtORrxASQnI0Yf90UzwdRfXK9jerRvs8Ddu9FUBGx2A4i+RoRETHmupnn+QpJGwDrl6
bZth/vTVgRsTiSrxEREb3RyTzPW9u+UNKuLZteVOaNPbtHsUVEl5KvERERvdXJlectgQuBndpsM5CTcUT/SL5GRET0UCfzPB9Wfu7b+3AiYnEkXyMiInqrk2EbHxppu+0vj104EbE4kq8RERG91cmwjZVH2JZHdUb0l+RrRERED3UybOPTAJJOBg60fV9ZXxU4uqfRRURXkq8RERG91c3juTceOhED2L4X2HTMI4qIsZB8jYiI6IFuiuelytUrACStRncPWYmI8ZN8jYiI6IFuTqZHA7+VdGZZ3w347NiHFBFjIPkaERHRA908YfAUSbOArUvTrrav601YEbE4kq8RERG90dXXuOXkmxNwxABIvkZERIy9bsY8R0RERERMaCmeIyIiIiI6lOI5IiIiIqJDKZ4jIiIiIjqU4jkiIiIiokM9L54lbS9ptqQ5kj7eZvuykk4v238naVpt2yGlfbak7Xoda8REl3yNGCzJ2Yjx19PiWdIk4FhgB2BDYE9JG7bs9h7gXtsvBL4CfL703RDYA9gI2B74ZjleRPRA8jVisCRnI5rR6yvPmwNzbN9o+5/AacAuLfvsApxcXp8FvF6SSvtpth+zfRMwpxwvInoj+RoxWJKzEQ3o6iEpi2AtYG5t/VZgi+H2sf2EpPuBZ5f2y1r6rtX6BpL2A/Yrqw9Jmj02oUcXVgfuajqIQaAv7T2Wh5s6lgdjHPIVkrN9IjnboYmes8nXvpB87dB45Wuvi+ees308cHzTcUxkkmbZnt50HDEYkrPNS85Gp5KvzUu+9p9eD9uYB0yura9d2truI+kZwCrA3R32jYixk3yNGCzJ2YgG9Lp4vgJYT9I6kpahujlhZss+M4Gh6+xvAy607dK+R7lTeB1gPeDyHscbMZElXyMGS3I2ogE9HbZRxlcdAJwHTAJm2L5W0hHALNszgROA70maA9xDlfyU/c4ArgOeAPa3Pb+X8cYiy1d6S4Dk64SSnF0CJGcnjORrn1H1ATQiIiIiIkaTJwxGRERERHQoxXNERERERIdSPEdEREREdCjFcyw2Sac0HUNERETEeBj4h6TE+JLUOg2SgNdJehaA7Z3HPaiIiIiIcZLiObq1NtXURt8FTFU8TweObjKoiOiepH1tn9h0HBERgyRT1UVXJC0FHAi8EfiI7asl3Wj7BQ2HFhFdkvRX21OajiMiKpKuAs4Gfmj7hqbjifZy5Tm6YvtJ4CuSziw//07+P4roW5KuGW4T8NzxjCUiRrUq8CzgV5L+BvwQON32bY1GFQvIledYLJLeBLzG9ieajiUiFlY+4G4H3Nu6CbjU9prjH1VEtCPpKtsvL6//BdgT2BX4M9XV6DxtsA+keI6IWIJJOgE40fbFbbb9wPY7GggrItqoF8+1tknAG4Ddbe/bTGRRl+I5IiIiog9IOs32Hk3HESPLPM8RERERfWCkwllSrjr3iRTPsRBJl3a5/1aSftqreCIiIoJPNx1AVDJLQizE9qubjiEiOiPp0m5yVtJWwMG2d+xZUBGxSDI7zmBI8RwLkfSQ7ZXKSfZw4C7gJcCVwDttW9L2wFeBR4CLa31XBL5e9l8aONz2/0j6GnC37SMkbQd8EtiqTH0XEYsoH3YjlijPZYTZccY/nGgnwzZiNJsCBwEbAi8AXiNpOeA7wE7AZsDzavt/ErjQ9ubA64AvloL6EGB3Sa8DjgH2TeEcsfgkPVR+biXpIklnSfo/SadKUtm2fWm7imraq6G+K0qaIelySb+XtEtp/5qkQ8vr7ST9ujwgKSJ666fASrZvaVluBi5qNrQYkivPMZrLbd8KIOlqYBrwEHCT7etL+/eB/cr+2wI7Szq4rC8HTLH9Z0nvA34N/FeenBTRE5sCGwG3AZdQfdidRfVhd2tgDnB6bf+hD7vvlvQs4HJJv6T6sHuFpN9Qfdh9Yz7sRvSe7feMsC3TSvaJFM8xmsdqr+cz+v8zAt5qe3abbS8F7gbyUIaI3siH3YiIHsvXcL
Eo/g+YJmndsr5nbdt5wAdqXxdvWn5OBT5MdWVsB0lbjGO8ERPFon7Y3aQsU2z/uWzLh92IiDZSPEfXbP+D6srVOWUM5R21zUdS3Sh4jaRrgSNLIX0C1R3+twHvAb5bxk5HRG/lw25En8hUsEuGDNuIhdheqfy8iNoNCrYPqL3+ObBBm76PAv/e5rDb1Pa5kuqqVkT0mO1/SBr6sPsI8Btg5bL5SKpZc64pNwTeJGknah92Jb0HOEnSK8oH54hYRJkdZ8mQx3NHREREjINFnAr2BbZ3zFSw/SNXniMiIiLGX2bHGVAZ8xwREREx/i63fWspdK+mmh1nA8rsOK6GBny/tv+2wMfLTDoX8fTsOI8A7wPOB76R2XF6L1eeIyIiIsZfpoIdULnyHBEREdEfMjvOAEjxHBEREdEHMhXsYMhsGxERERERHcqV54iIiIiIDqV4joiIiIjoUIrniIiIiIgOpXiOiIiIiOhQiueIiIiIiA6leI6IiIiI6FCK54iIiIiIDqV4joiIiIjoUIrniIiIiIgOpXiOiIiIiOhQiueIiIiIiA6leI6IiIiI6FCK5yWApH0kXVxbf0jSC0bpM02SJT2j9xFGxHAGNX8l7SXpF029f0Qn+jm/JL1G0vUlpjf38r1ibKVwWgLZXqnpGMaKpGnATcDStp9oOJyInhuU/LV9KnBq03FEdKPP8usI4Bu2v9Z0IK0kHQ680PY7m46lH+XKcwy8XD2P6F7yJqJ3OsyvqcC1Y3V8SZMW9xjRmRTPA0bSZElnS7pT0t2SvtFmH0t6YXm9vKSjJd0i6X5JF0tavk2ft0q6WdJLRnn/10q6VNJ9kuZK2qe0ryLplBLXLZI+JWmpsu1wSd+vHWOBr8QkXSTpSEmXSHpQ0i8krV52/3X5eV/5autV5Wu4SyR9RdLdwBGS7pH00tp7PEfSI5LW6ObvG9FLTeZvLe/eI+mvwIWl/d2S/izpXknnSZpa67OtpNnlvb8p6X8lvbdsa/06/NWSrij7XiHp1bVtI+V4xJgYpPySdAPwAuAn5dy2bDmPniDpdknzJH1GpSBuc947XNJJkr4l6VxJDwOvk7SmpB+Vv8FNkj5Yi/FwSWdJ+r6kB4B9hvldtgc+AexeYvuDpN0kXdmy34ck/U95fZKkb0s6v+T4/7b8W7JB2XZP+Tfl7cP9LQdBiucBUpLop8AtwDRgLeC0Ubp9CdgMeDWwGvBR4MmW4+4LfB7YxvafRnj/qcDPgK8DawCbAFeXzV8HVqH6x2BL4F3Avh3+agDvKPs/B1gGOLi0/2v5+SzbK9n+bVnfArgReC5wJNXfof710p7ABbbv7CKGiJ5pOn9rtgReDGwnaReqk+SuVDn9G+CH5birA2cBhwDPBmaXONr9bqsB5wDHlH2/DJwj6dm13YbL8YjFNmj5ZXtd4K/ATuXc9hhwEvAE8EJgU2Bb4L21Y9fPe58tbe8or1cGLgV+Avyh/P6vBw6StF3tGLtQ5fWzGGbYle2fA0cBp5fYXgbMBNaR9OLarv8GnFJb34vqfLw6VW1wKoCkFYHzgR9Q5f8ewDclbdj2LzgIbGcZkAV4FXAn8IyW9n2Ai2vrpkq+pYBHgZe1Oda0st/BwHXA2h28/yHAj9u0TwL+CWxYa/t34KLy+nDg+23e+xll/SLgU7Xt7wd+3m7f2u/715YYtqD6h0hlfRbw9qb/m2XJMrT0Qf4O9XlBre1nwHtq60sBj1B9nfwu4Le1bQLmAu9tjZvqJHp5y/v9FtinvB42x7NkGYtl0PKrrN9MVZRDVRA/Bixf239P4Fe136P1vHcScEptfYs2+xwCnFheHw78usO/5+HUztul7VvAZ8vrjYB7gWVrsZxW23clYD4wGdgd+E3LsY4DDmv6/5tFXXLlebBMBm5x5zfOrQ4sB9wwwj4fAY61fWuH79/uWKsDS1N94h9yC9Un3079rfb6EarEG8nc+ort35V+W0nagOofx5ldvH9ErzWdv0PquT
MV+JqqYVj3AfdQFclrAWvW93V1xhvufdZkwfyHhf8N6DbHI7oxaPnVairVefT22v7HUV2pbXfs4d5vzaH+5RifoCrMRzpGp04G3iFJVB+Yz3B1xXyhY9t+iOr3XbPEtUVLXHsBz1uMWBqVweKDZS4wRdIzOvwH4i7gH8C6VF/jtLMt8HNJf7P9ow7ef/Nh3udxqgS5rrRNAeaV1w8DK9T27yZh3EX7yVRDN/4GnGX7H128T0SvNZ2/Q+q5M5fqStJCX99KWg9Yu7au+nqL26jyv24K8PMOY4pYXAOVX23MpbryvPoI8bc777W+30221+swvpEstJ/tyyT9E/gXquEi72jZZfLQC0krUQ2Fua3E9b+239Dhe/e9XHkeLJcDtwOfk7SipOUkvWa4nW0/CcwAvlxuIpik6oa7ZWu7XQtsDxwraedR3v9UYBtJb5f0DEnPlrSJ7fnAGcBnJa1cxkZ/CBi6SfBq4F8lTZG0CtXXSJ26k2oM2ojzchbfB95CVUCfMsq+EeOt6fxt59vAIZI2gqdu/N2tbDsHeKmkN6u6uXd/hv/gey7wIknvKP827A5sSDUGNWI8DFp+tcZzO/AL4GhJz5S0lKR1JW3ZxftdDjwo6WOqboacJOklkl6xCLH/HZimcuN/zSnAN4DHbV/csu2NqiYVWIZq7PNltudS/TvwIkn/JmnpsryiZfz0QEnxPEBKkboT1ZCEv1J9hbr7KN0OBv4IXEH1FcrnafnvbvsPwI7AdyTtMML7/xV4I/DhcqyrgZeVzR+gusJ8I3Ax1Y0BM0q/84HTgWuAK+nihGr7EaqbIS4pX/e8coR95wJXUX1i/k2n7xExHprO32Fi+nE55mmq7r7/E7BD2XYXsBvwBeBuqmJ4FtXVsdbj3F1i+HDZ96PAjuUYET03aPk1jHdR3Ux7HdV44rOA53fxfvNLrJtQPR/hLuC7VDfzd+vM8vNuSVfV2r8HvISnL47V/QA4jOpvuRnlJn7bD1Jdxd+D6kr036j+Lsu2OcZAGLq5KmKJIGkGcJvtTzUdS8SSpFyBuhXYy/avmo4nIsafqqn87gBebvv6WvtJwK0T5dybMc+xxFD1NMJdqab4iYjFVKa4+h3VrAQfobrZ6bJGg4qIJv0ncEW9cJ6IMmwjFiBpL1WTorcui/QUpPEi6Uiqr8S+aPumpuOJaEIP8vdVVLMR3EX1lfibbT86ZgFHDJBBPT8OR9LPhvl9PjHM/jcDB1INz5rQMmwjIiIiIqJDufIcEREREdGhJWrM8+qrr+5p06Y1HUbEuLjyyivvsr1G03EsjuRsTCSDnrPJ15hIRsrXJap4njZtGrNmzWo6jIhxIan1iW4DJzkbE8mg52zyNSaSkfK1p8M2JM2QdIekPw2zXZKOkTRH0jWSXl7btrek68uydy/jjIhKcjZicCRfI5rR6zHPJ1E9nWc4OwDrlWU/4FsAklajmmh7C6rHQR8madWeRhoRkJyNGCQnkXyNGHc9LZ5t/5rqSTPD2QU4xZXLgGdJej6wHXC+7Xts3wucz8j/QETEGEjORgyO5GtEM5oe87wWMLe2fmtpG659IZL2o/pEzZQpU8YssM0+csqYHStiyJVffFfTISyu5GxMKAOes8nXmFDGK18Hfqo628fbnm57+hprDOxNzBETRnI2YnAkXyMW1nTxPA+YXFtfu7QN1x4RzUrORgyO5GtEDzRdPM8E3lXuCH4lcL/t24HzgG0lrVpuYti2tEVEs5KzEYMj+RrRAz0d8yzph8BWwOqSbqW6u3dpANvfBs4F3gjMAR4B9i3b7pF0JHBFOdQRtke6KSIixkByNmJwJF8jmtHT4tn2nqNsN7D/MNtmADN6EVdEtJecjRgcydeIZjQ9bCMiIiIiYmCkeI6IiIiI6FCK54iIiIiIDqV4joiIaJCkF0m6QNKfyvrGkj7VdFwR0V6K54iIiGZ9BzgEeBzA9jXAHo1GFBHDSvEcERHRrBVsX97S9kQjkUTEqF
I8R0RENOsuSesCBpD0NuD2ZkOKiOH0dJ7niIiIGNX+wPHABpLmATcBezUbUkQMJ8VzREREg2zfCGwjaUVgKdsPNh1TRAwvwzYiIiIaJOnZko4BfgNcJOlrkp7ddFwR0V6K54iIiGadBtwJvBV4W3l9eqMRRcSwMmwjIiKiWc+3fWRt/TOSdm8smogYUa48R0RENOsXkvaQtFRZ3g6c13RQEdFeiueIiIhmvQ/4AfBYWU4D/l3Sg5IeaDSyiFhIhm1EREQ0yPbKTccQEZ3LleeIiIgGSfqRpDdKyjk5YgB0nKiSXtrLQCIiIiaob1E9FOV6SZ+TtH7TAUXE8Lr5lPtNSZdLer+kVXoWUURExARi+5e29wJeDtwM/FLSpZL2lbR0s9FFRKuOi2fb/0L1yXgycKWkH0h6w2j9JG0vabakOZI+3mb7VyRdXZa/SLqvtm1+bdvMTmONiEWTfI1oRnkoyj7Ae4HfA1+jKqbPH6VfcjZinHV1w6Dt6yV9CpgFHANsKknAJ2yf3bq/pEnAscAbgFuBKyTNtH1d7Zj/Vdv/A8CmtUM8anuTbmKMiIqktYCp1PLc9q9H2D/5GtEAST8G1ge+B+xk+/ay6XRJs0bol5yNaEDHxbOkjYF9gTdRfRLeyfZVktYEfgssVDwDmwNzbN9YjnEasAtwXZt9AfYEDus8/IhoR9Lngd2pcm1+aTYwbPFM8jWiKd+xfW69QdKyth+zPX2EfsnZiAZ0M+b568BVwMts72/7KgDbtwGfGqbPWsDc2vqtpW0hkqYC6wAX1pqXkzRL0mWS3txFrBET3ZuB9W2/0fZOZdl5lD7J14hmfKZN22876JecjWhAR1eey1dD82x/r9324dq7tAdwlu35tbaptudJegFwoaQ/2r6hJbb9gP0ApkyZMgZhRCwRbgSWpnrgQi8sUr5CcjZiiKTnURW7y0vaFFDZ9ExghTF+u5xjI8ZIR8Wz7fmSJktaxvY/uzj+PKobDIesXdra2QPYv+V955WfN0q6iGqs1g0t+xwPHA8wffp0dxFbxJLsEeBqSRdQK6Btf3CEPj3P17I9ORtR2Y7qJsG1gaN5unh+APhEB/1zjo1oQDc3DN4EXFLuyH14qNH2l0focwWwnqR1qBJ6D+AdrTtJ2gBYldrXVJJWBR6x/Zik1YHXAF/oIt6IiWxmWbqRfI0YR7ZPBk6W9FbbPxpuP0l7l31bJWcjGtBN8XxDWZYChh4lOuKnUNtPSDoAOA+YBMywfa2kI4BZtodO7nsAp9muH+/FwHGSnizv+bn6HcQRMTzbJ0taBnhRaZpt+/FR+iRfIxowUuFcHAgsVDwnZyOa0U3xfJ3tM+sNknYbrVO5g/jclrZDW9YPb9PvUiBPNYxYBJK2ojrZ3kz1VfDkcvVqpNk2kq8R/UnDbUjORoy/bmbbOKTDtoho3tHAtra3tP2vVGMrv9JwTBGxaDLWOKKPjHrlWdIOwBuBtSQdU9v0TOCJXgUWEYtladuzh1Zs/yWP+Y0YWMNeeY6I8dfJsI3bqJ4ouDNwZa39QeC/2vaIiKbNkvRd4PtlfS+qPI6IPiJpKeBtts8YYbdLxiueiBjdqMWz7T8Af5D0g9FuOIqIvvGfVNNSDU1N9xvgm82FExHt2H5S0keBYYtn2weMY0gRMYpubhjcXNLhwNTST4Btv6AXgUXEorP9GPDlskREf/ulpIOB01lwKth7mgspIobTTfF8AtUwjSuB+aPsGxENkHSG7bdL+iNtbjKyvXEDYUXEyHYvP+sPMTGQi1MRfaib4vl+2z/rWSQRMRYOLD93bDSKiOiY7XWajiEiOtdN8fwrSV8EzmbBx/1eNeZRRcQisX17+XlL07FERGckrQB8CJhiez9J6wHr2/5pw6FFRBvdFM9blJ/Ta20Gth67cCJiLEh6kIWHbdxPNePGh23fOP5RRcQwTqQaEvnqsj4POBNI8RzRhzounm2/rpeBRMSY+ipwK/ADqpt79wDWBa4CZg
BbNRVYRCxkXdu7S9oTwPYjkjK3c0Sf6rh4lnRou3bbR4xdOBExRna2/bLa+vGSrrb9MUmfaCyqiGjnn5KWp3xbJGldasMjI6K/dPN47odry3xgB2BaD2KKiMX3iKS3S1qqLG8H/lG25VG/Ef3lMODnwGRJpwIXAB9tNqSIGE43wzaOrq9L+hJw3phHFBFjYS/ga1QPRjFwGfDOcnUrD1yI6BPlCYOrArsCr6QaZnWg7bsaDSwihtXNDYOtVgDWHqtAImLslBsCdxpm88XjGUtEDG/oCYPl8dznNB1PRIyu42Ebkv4o6ZqyXAvMpropKSL6jKQXSbpA0p/K+saSPtV0XBHR1i8lHSxpsqTVhpamg4qI9rq58lx/6MITwN9tPzHG8UTE2PgO8BHgOADb10j6AfCZRqOKiHbyhMGIAdLNmOdbJL0M+JfS9Gvgmp5EFRGLawXbl7fMdpUPuxF9pox5/rjt05uOJSI6082wjQOBU4HnlOVUSR/oVWARsVjuKtNdDU199Tbg9mZDiohWtp+k+pYoIgZEN8M23gNsYfthAEmfB34LfL0XgUXEYtkfOB7YQNI84CaqGTgiov/8UtLBwOlU08ECYPue5kKKiOF0M8+zqOZ3HjK/tI3cSdpe0mxJcyR9vM32fSTdKenqsry3tm1vSdeXZe8uYo2YsCRNAt5vextgDWAD26+1fUsHfZOvEeNvd6oPvL+mekz3lcCsTjomZyPGXzdXnk8Efifpx2X9zcAJI3UoJ/FjgTdQPSr4CkkzbV/Xsuvptg9o6bsa1cTx06m+er6y9L23i5gjJhzb8yW9trx+eLT9hyRfI5phe51F6ZecjWhGx1eebX8Z2Be4pyz72v7qKN02B+bYvtH2P4HTgF06fMvtgPNt31OS+Xxg+07jjZjgfi9ppqR/k7Tr0DJKn+RrRAMkrSDpU5KOL+vrSdpxtH4kZyMa0c0Ng68Errd9jO1jgBskbTFKt7WAubX1W0tbq7eW+aPPkjS5m76S9pM0S9KsO++8s9NfJ2JJtxxwN7A11cNSdmLB6Sbb6Xm+QnI2oo0TgX8Cry7r8+hsWsmcYyMa0M2Y528BD9XWHypti+snwDTbG1N98j25m862j7c93fb0NdZYYwzCiRh8tvdts7x7aLukQxbx0IuVryW25GzEgta1/QXgcQDbj9DBPUUdyjk2Yox1dcOgbQ+tlOl1RhszPQ+YXFtfu7Q9xfbdth8rq98FNuu0b0Qsst3atCVfI5rxT0nL8/TUkusCj43cBUjORjSim+L5RkkflLR0WQ4EbhylzxXAepLWkbQMsAcws76DpOfXVncG/lxenwdsK2lVSasC25a2iFh87a5qJV8jmnEY8HNgsqRTgQuAj3bQLzkb0YBuZtv4D+AY4FNUn44vAPYbqYPtJyQdQJWQk4AZtq+VdAQwy/ZM4IOSdqZ6+tk9wD6l7z2SjqT6xwHgiMx5GTFmvFBD8jViXEl6je1LqKao2xV4JdUH2wNt3zVa/+RsRDO6eTz3HVSfatuSdIjt/27T71zg3Ja2Q2uvDwHajr+0PQOY0WmMEdGxtuMpk68R4+oYqmEUv7X9cuCcbg+QnI0Yf91ceR7NbsBCxXNEjD9Jq7VeRZK0ju2byuqZDYQVEQt6vExPt7akY1o32v5gAzFFxCjGsngeqzuDI2Lx/UTSDrYfAJC0IXAG8BIA20c1GVxEANX0kdtQzbl8ZcOxRESHxrJ4XmgMZUQ05iiqAvpNwPrAKcBezYYUEXVlXPNpkv5s+w9NxxMRncmV54glkO1zJC0N/AJYGXiL7b80HFZEtPeopAuA59p+iaSNgZ1td/KglIgYZx0XzxlDGdH/JH2dBb8FWgW4AThAUsZQRvSn7wAfAY4DsH2NpB/Q2VMGI2KcdXPlOWMoI/rfrJb1jKOM6H8r2L5cWuAL3CeaCiYiRtZN8ZwxlBF9zvbJAJJWBP5he35ZnwQs22RsETGsu8pTBYeeMPg24PZmQ4qI4X
Qzz3PGUEYMjguo7uJ/qKwvT5W7r24soogYzv7A8cAGkuYBN5GLUxF9a9TiOWMoIwbScraHCmdsPyRphSYDioiFlW+F3m97m/KN0VK2H2w6rogYXidXnjOGMmLwPCzp5bavApC0GfBowzFFRAvb8yW9trx+uOl4ImJ0oxbPGUMZMZAOAs6UdBvVNJLPA3ZvNKKIGM7vJc2kmrXqqQLa9tnNhRQRw+nmhsGMoYwYELavkLQB1c29ALNtP95kTBExrOWAu4Gta20GUjxH9KFuiueMoYzoc5K2tn2hpF1bNr2o3KOQk3FEn7G970jbJR1i+7/HK56IGFk3xXPGUEb0vy2BC4Gd2mzLlayIwbQbkOI5ok90UzwfRMZQRvQ124eVnyNeyYqIgaLRd4mI8dLNPM8ZQxnR5yR9aKTttr88XrFExJjx6LtExHjpZJ7njKGMGBwrj7AtJ+CIwZQrzxF9pJMrzxlDGTEgbH8aQNLJwIG27yvrqwJHNxhaRAxD0mq272lpW8f2TWX1zAbCiohhLDXaDvUxlG2Wd4/WX9L2kmZLmiPp4222f0jSdZKukXSBpKm1bfMlXV2Wmd3+chET2MZDhTOA7XuBTUfrlHyNaMRPJD1zaEXShsBPhtZtHzVcx+RsxPjrZNjGIo+hLA9SORZ4A3ArcIWkmbavq+32e2C67Uck/SfwBZ6+EfFR25uMFmNELGQpSauWohlJqzFKvidfIxpzFFUB/Saq+4pOAfYarVNyNqIZnQzbWJwxlJsDc2zfCCDpNGAX4KnEtv2r2v6XAe/sIKaIGNnRwG8lDX3duxvw2VH6JF8jGmD7HElLUz14bGXgLbb/0kHX5GxEAzp5PPfijKFcC5hbW78V2GKE/d8D/Ky2vpykWcATwOds/7/WDpL2A/YDmDJlyijhREwMtk8puTP0xLJdW65GtdPzfIXkbMQQSV9nwYtQqwA3AAeUG/I/OMohco6NaEA38zwvNIZS0qhjKDsl6Z3AdKobFIdMtT1P0guACyX90fYN9X62jweOB5g+fXpmE4goSrE8WsG8SBY1X0tcydmIyqyW9St79UY5x0aMnW6K567HUALzgMm19bVL2wIkbQN8EtjS9mND7bbnlZ83SrqI6oanhU7GETEmkq8R48j2yQCSVgT+YXt+WZ8ELNvBIZKzEQ0YdbaNmqExlEdKOhK4lOrGg5FcAawnaR1JywB7AAvc0VuuXh8H7Gz7jlr7qpKWLa9XB15Dj66iRQSQfI1oygXA8rX15YFfdtAvORvRgG6eMNj1GErbT0g6ADgPmATMsH2tpCOAWbZnAl8EVqJ69DfAX23vDLwYOE7Sk1RF/uc6GLMZEYso+RrRmOVsPzS0YvshSSuM1ik5G9GMboZtLNIYStvnAue2tB1ae73NMP0uBV7azXtFxOJJvkY04mFJL7d9FYCkzYBHO+mYnI0Yf10VzxERETHmDqK6Mnwb1aO4n8fTczFHRJ9J8RwREdEg21dI2oDqASkAs20/3mRMETG8FM8RERENkLS17Qsl7dqy6UVlnuezGwksIkaU4jkiIqIZWwIXAju12WYgxXNEH0rxHBER0QDbh5Wf+zYdS0R0LsVzREREAyR9aKTttr88XrFEROdSPEdERDRj5RG25VHYEX0qxXNEREQDbH8aQNLJwIG27yvrq1I91Tci+lA3j+eOiIiIsbfxUOEMYPteYNPmwomIkaR4joiIaNZS5WozAJJWI98MR/StJGdERESzjgZ+K+nMsr4b8NkG44mIEaR4joiIaJDtUyTNArYuTbvavq7JmCJieCmeIyIiGlaK5RTMEQMgY54jIiIiIjqU4jkiIiIiokMpniMiIiIiOpTiOSIiIiKiQymeIyIiIiI61PPiWdL2kmZLmiPp4222Lyvp9LL9d5Km1bYdUtpnS9qu17FGTHTJ14jBkpyNGH89LZ4lTQKOBXYANgT2lLRhy27vAe61/ULgK8DnS98NgT2AjYDtgW+W40VEDyRfIw
ZLcjaiGb2+8rw5MMf2jbb/CZwG7NKyzy7AyeX1WcDrJam0n2b7Mds3AXPK8SKiN5KvEYMlORvRgF4/JGUtYG5t/VZgi+H2sf2EpPuBZ5f2y1r6rtX6BpL2A/Yrqw9Jmj02oUcXVgfuajqIQaAv7T2Wh5s6lgdjHPIVkrN9IjnboYmes8nXvpB87dB45evAP2HQ9vHA8U3HMZFJmmV7etNxxGBIzjYvORudSr42L/naf3o9bGMeMLm2vnZpa7uPpGcAqwB3d9g3IsZO8jVisCRnIxrQ6+L5CmA9SetIWobq5oSZLfvMBIaus78NuNC2S/se5U7hdYD1gMt7HG/ERJZ8jRgsydmIBvR02EYZX3UAcB4wCZhh+1pJRwCzbM8ETgC+J2kOcA9V8lP2OwO4DngC2N/2/F7GG4ssX+ktAZKvE0pydgmQnJ0wkq99RtUH0IiIiIiIGE2eMBgRERER0aEUzxERERERHUrxHBERERHRoRTPEREREX1C0gaSXi9ppZb27ZuKKRaU4jnGjKR9m44hIiJiUEn6IPA/wAeAP0mqP279qGaiilYpnmMsfbrpACLiafUrVZJWkXSCpGsk/UDSc5uMLSLaeh+wme03A1sB/5+kA8s2NRVULGjgH88d40vSNcNtAnIyjugvRwE/L6+PBm4HdgJ2BY4D3txMWBExjKVsPwRg+2ZJWwFnSZpKiue+keI5uvVcYDvg3pZ2AZeOfzgR0aHptjcpr78iae+Rdo6IRvxd0ia2rwaw/ZCkHYEZwEsbjSyekuI5uvVTYKWhxK6TdNG4RxMRI3mOpA9Rfbh9piT56SdjZdheRP95F9UTH59i+wngXZKOayakaJUnDEZELKEkHdbS9E3bd0p6HvAF2+9qIq6IiEGW4jkiYgkmaQNgLeB3Q2MpS/v2tn8+fM+IiGgnX9tFRCyhJH2ATHsVETGmUjzHQiR1deOfpK0k/bRX8UTEItuPTHsV0Tdyfl0y5IbBWIjtVzcdQ0SMiUx7FdFHcn5dMuTKcyxE0kPl51aSLpJ0lqT/k3SqJJVt25e2q6jmjB3qu6KkGZIul/T7oa+JJX1N0qHl9XaSfi0p//9F9NbfJW0ytFIK6R2B1cm0VxHjLufXJUOuPMdoNgU2Am4DLgFeI2kW8B1ga2AOcHpt/08CF9p+t6RnAZdL+iVwCHCFpN8AxwBvtP3k+P0aERNSpr2K6F85vw6ofDKJ0Vxu+9aSiFcD04ANgJtsX1/mjP1+bf9tgY9Luhq4CFgOmGL7EarHjp4PfMP2DeP2G0RMUCV3/zbMtkvGO56IWEDOrwMqV55jNI/VXs9n9P9nBLzV9uw2214K3A2sOUaxRUREDKqcXwdUrjzHovg/YJqkdcv6nrVt5wEfqI3d2rT8nAp8mOprqh0kbTGO8UZERAyCnF8HQIrn6Jrtf1BNgXVOuaHhjtrmI4GlgWskXQscWRL9BOBg27cB7wG+K2m5cQ49YomTqa8ilhw5vw6GPGEwImICKdPVHWx7x4ZDiYgYSLnyHBExwDL1VUTE+MoNgxERS45MfRUR0WO5khARseTI1FcRET2WK88REUuOTH0VEdFjufIcEbFky9RXERFjKMVzRMQSLFNfRUSMrUxVFxERERHRoVx5joiIiIjoUIrniIiIiIgOpXiOiIiIiOhQiueIiIiIiA6leI6IiIiI6FCK54iIiIiIDqV4joiIiIjo0P8PTSiMo7XMJNQAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:34:27.418638Z", + "start_time": "2020-11-13T15:34:27.372761Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idcount
311242862957425410
311243862957626810
39376110323720594810
39376310323723568910
5769021348506946313
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id count\n", + "311242 86295 74254 10\n", + "311243 86295 76268 10\n", + "393761 103237 205948 10\n", + "393763 103237 235689 10\n", + "576902 134850 69463 13" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_click_count[user_click_count['count']>7]" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:32:53.298575Z", + "start_time": "2020-11-13T15:32:53.285611Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1, 2, 4, 3, 6, 5, 10, 7, 13])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_click_count['count'].unique()" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 1605541\n", + "2 11621\n", + "3 422\n", + "4 77\n", + "5 26\n", + "6 12\n", + "10 4\n", + "7 3\n", + "13 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#用户点击新闻次数\n", + "user_click_count.loc[:,'count'].value_counts() " ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "###### 可以看出:有1605541(约占99.2%)的用户未重复阅读过文章,仅有极少数用户重复点击过某篇文章。 这个也可以单独制作成特征" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户点击环境变化分析" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:39:41.961797Z", + "start_time": "2020-11-13T15:39:41.949829Z" + } + }, + "outputs": [], + "source": [ + "def plot_envs(df, cols, r, c):\n", + " plt.figure()\n", + " plt.figure(figsize=(10, 5))\n", + " i = 1\n", + " for col in cols:\n", + " plt.subplot(r, c, i)\n", + " i += 1\n", + " v = df[col].value_counts().reset_index()\n", + " fig = sns.barplot(x=v['index'], y=v[col])\n", + " for item in fig.get_xticklabels():\n", + " item.set_rotation(90)\n", + " plt.title(col)\n", + " plt.tight_layout()\n", + " plt.show()" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:39:55.476626Z", + "start_time": "2020-11-13T15:39:48.764592Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# 分析用户点击环境变化是否明显,这里随机采样10个用户分析这些用户的点击环境分布\n", + "sample_user_ids = np.random.choice(tst_click['user_id'].unique(), size=10, replace=False)\n", + "sample_users = user_click_merge[user_click_merge['user_id'].isin(sample_user_ids)]\n", + "cols = ['click_environment','click_deviceGroup', 'click_os', 'click_country', 'click_region','click_referrer_type']\n", + "for _, user_df in sample_users.groupby('user_id'):\n", + " plot_envs(user_df, cols, 2, 3)" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# 分析用户点击环境变化是否明显,这里随机采样10个用户分析这些用户的点击环境分布\n", - "sample_user_ids = np.random.choice(tst_click['user_id'].unique(), size=10, replace=False)\n", - "sample_users = user_click_merge[user_click_merge['user_id'].isin(sample_user_ids)]\n", - "cols = ['click_environment','click_deviceGroup', 'click_os', 'click_country', 'click_region','click_referrer_type']\n", - "for _, user_df in sample_users.groupby('user_id'):\n", - " plot_envs(user_df, cols, 2, 3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以看出绝大多数数的用户的点击环境是比较固定的。思路:可以基于这些环境的统计特征来代表该用户本身的属性" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户点击新闻数量的分布" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:40:04.296033Z", - "start_time": "2020-11-13T15:40:03.980868Z" - } - }, - "outputs": [ + }, { - "data": { - "text/plain": [ - "[]" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以看出绝大多数数的用户的点击环境是比较固定的。思路:可以基于这些环境的统计特征来代表该用户本身的属性" ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" }, { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAASw0lEQVR4nO3da4yc1X3H8e8fr+/GVxZj1nZsgnshVCl0RYyS8iLkBm1qKpGIqCpWimSpJU1SWjXQvEjUV0nUQEMTkTghFamilIRQYVW0gQJRlRdxsgbCNYSNa8CLsZeLL/EFbHz6Yo6dsbPjZ9be2Znn+PuRrH2e85yZ55x9xr+ZOXP2TKSUkCSV64xuN0CS1FkGvSQVzqCXpMIZ9JJUOINekgrX1+0GAJx11llpxYoV3W6GJNXKpk2bXk4p9VfV64mgX7FiBUNDQ91uhiTVSkQ81049h24kqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSpcrYP+p1te5eb7nuGNQ4e73RRJ6lm1DvqHn3uNWx8c5tBhg16SWql10EuSqhn0klQ4g16SCmfQS1Lhigh6v99cklqrddBHdLsFktT7ah30kqRqBr0kFc6gl6TCGfSSVLgigt5JN5LUWq2DPnDajSRVqXXQS5KqGfSSVDiDXpIKV0TQJ9dAkKSWah30LoEgSdVqHfSSpGoGvSQVzqCXpMIZ9JJUuCKC3jk3ktRaEUEvSWrNoJekwhn0klS4toI+Iv4mIp6MiCci4jsRMSMiVkbExogYjog7I2Jarjs97w/n4ys62gNJ0glVBn1EDAAfBwZTShcCU4BrgM8Dt6SUzgdeA67LN7kOeC2X35LrSZK6pN2hmz5gZkT0AbOAbcC7gbvy8TuAq/L2mrxPPn55RGcXK3CpG0lqrTLoU0ojwD8Bz9MI+F3AJmBnSulQrrYVGMjbA8AL+baHcv1Fx99vRKyLiKGIGBodHT2pxnf4+UOSitDO0M0CGq/SVwLnArOBD5zqiVNK61NKgymlwf7+/lO9O0lSC+0M3bwH+L+U0mhK6SBwN/BOYH4eygFYCozk7RFgGUA+Pg94ZUJbLUlqWztB/zywOiJm5bH2y4GngIeAq3OdtcA9eXtD3icffzC5YLwkdU07Y/QbaXyo+jDweL7NeuBTwA0RMUxjDP72fJPbgUW5/Abgxg60W5LUpr7qKpBS+gzwmeOKNwOXjFH3APChU2/aOPh+QZJaqvVfxjrnRpKq1TroJUnVDHpJKpxBL0mFM+glqXBFBH1y2o0ktVTroHepG0mqVuuglyRVM+glqXAGvSQVzqCXpMIVEfSujSlJrdU66J10I0nVah30kqRqBr0kFc6gl6TCFRH0fhYrSa3VOujDNRAkqVKtg16SVM2gl6TCGfSSVDiDXpIKV0TQJ9dAkKSWah30TrqRpGq1DnpJUjWDXpIKZ9BLUuEMekkqXBFB75wbSWqt1kHvpBtJqlbroJckVTPoJalwBr0kFa6toI+I+RFxV0T8PCKejohLI2JhRNwfEc/mnwty3YiIWyNiOCIei4iLO9sFSdKJtPuK/kvAf6eUfgd4O/A0cCPwQEppFfBA3ge4AliV/60DbpvQFo/BpW4kqbXKoI+IecBlwO0AKaU3Uko7gTXAHbnaHcBVeXsN8K3U8GNgfkQsmeB2H2lcR+5WkkrSziv6lcAo8K8R8UhEfCMiZgOLU0rbcp2XgMV5ewB4oen2W3OZJKkL2gn6PuBi4LaU0kXAXn49TANAaqwTPK4BlIhYFxFDETE0Ojo6nptKksahnaDfCmxNKW3M+3fRCP7tR4Zk8s8d+fgIsKzp9ktz2TFSSutTSoMppcH+/v6Tbb8kqUJl0KeUXgJeiIjfzkWXA08BG4C1uWwtcE/e3gBcm2ffrAZ2NQ3xSJImWV+b9f4a+HZETAM2Ax+l8STx3Yi4DngO+HCuey9wJTAM7Mt1Oyq52o0ktdRW0KeUHgUGxzh0+Rh1E3D9qTWrPc6
5kaRq/mWsJBXOoJekwhn0klQ4g16SCldG0DvpRpJaqnXQu9SNJFWrddBLkqoZ9JJUOINekgpXRND7WawktVbroA8XQZCkSrUOeklSNYNekgpn0EtS4Qx6SSpcEUGfnHYjSS3VOuhdAkGSqtU66CVJ1Qx6SSqcQS9JhTPoJalwRQR9crUbSWqp1kHvpBtJqlbroJckVTPoJalwBr0kFc6gl6TCFRH0rnUjSa3VOuhd60aSqtU66CVJ1Qx6SSqcQS9JhTPoJalwRQS9k24kqbW2gz4ipkTEIxHxn3l/ZURsjIjhiLgzIqbl8ul5fzgfX9GhthOudiNJlcbziv4TwNNN+58HbkkpnQ+8BlyXy68DXsvlt+R6kqQuaSvoI2Ip8EfAN/J+AO8G7spV7gCuyttr8j75+OW5viSpC9p9Rf/PwN8Dh/P+ImBnSulQ3t8KDOTtAeAFgHx8V65/jIhYFxFDETE0Ojp6cq2XJFWqDPqI+GNgR0pp00SeOKW0PqU0mFIa7O/vn8i7liQ16WujzjuBP4mIK4EZwFzgS8D8iOjLr9qXAiO5/giwDNgaEX3APOCVCW95k+RiN5LUUuUr+pTSTSmlpSmlFcA1wIMppT8DHgKuztXWAvfk7Q15n3z8wdSpJHbkX5Iqnco8+k8BN0TEMI0x+Ntz+e3Aolx+A3DjqTVRknQq2hm6OSql9EPgh3l7M3DJGHUOAB+agLZJkiZAEX8ZK0lqrYig97NYSWqt1kHvZ7GSVK3WQS9JqmbQS1LhDHpJKpxBL0mFM+glqXC1DnpXP5akarUOeklSNYNekgpn0EtS4Qx6SSpcEUHvWjeS1Fqtg945N5JUrdZBL0mqZtBLUuEMekkqnEEvSYUrIugTTruRpFZqHfQudSNJ1Wod9JKkaga9JBXOoJekwhn0klS4IoLetW4kqbVaB72zbiSpWq2DXpJUzaCXpMIZ9JJUOINekgpXRNA76UaSWqt10IffMSVJlSqDPiKWRcRDEfFURDwZEZ/I5Qsj4v6IeDb/XJDLIyJujYjhiHgsIi7udCckSa2184r+EPC3KaULgNXA9RFxAXAj8EBKaRXwQN4HuAJYlf+tA26b8FZLktpWGfQppW0ppYfz9h7gaWAAWAPckavdAVyVt9cA30oNPwbmR8SSiW64JKk94xqjj4gVwEXARmBxSmlbPvQSsDhvDwAvNN1say47/r7WRcRQRAyNjo6Ot93HSK6BIEkttR30ETEH+D7wyZTS7uZjqZG040rblNL6lNJgSmmwv79/PDdtatNJ3UySTittBX1ETKUR8t9OKd2di7cfGZLJP3fk8hFgWdPNl+YySVIXtDPrJoDbgadTSjc3HdoArM3ba4F7msqvzbNvVgO7moZ4JEmTrK+NOu8E/hx4PCIezWX/AHwO+G5EXAc8B3w4H7sXuBIYBvYBH53IBkuSxqcy6FNKP4KWf5l0+Rj1E3D9KbZLkjRBav2XsUc450aSWisi6CVJrRn0klQ4g16SCmfQS1LhDHpJKlwRQe9SN5LUWq2DPlzsRpIq1TroJUnVDHpJKpxBL0mFM+glqXC1Dvq9rx8C4I1Dh7vcEknqXbUO+gWzpgJw2PmVktRSrYN+xtQpALzuK3pJaqnWQT+9rxH0Dt1IUmv1Dvqpjea//KvXu9wSSepdtQ76aVMazT/Dv5CVpJZqHfQLZ08DYMsre7vcEknqXbUO+vl51s3BNx2jl6RWah30s6Y1vtv82R2/6nJLJKl31TroARbNnsbTL+7udjMkqWfVPujPmTeDzS87Ri9JrdQ+6N+xchEAv9i+p8stkaTeVPug/+DblwDw9f/d3OWWSFJvqn3QX7R8AQDf27T16CJnkqRfq33QA/zjmrcB8MEv/6jLLZGk3lNE0F976QrOP3sOm0f3ctkXHmLPgYPdbpIk9Ywigh7g3o//IcsXzuL5V/fxe5+9jy/e94yLnUkSEKkH1nIfHBxMQ0NDE3JfN9//C2594Nmj++/53bN
5/9vO4f0XnsPcGVMn5ByS1AsiYlNKabCyXmlBD7DvjUP8y4PD3P3wVrbv/vXKlufMncFFy+dz4cA8Llo2n7cNzGPO9D6mnOGiaJLq57QO+mbbdx9gw6Mv8pMtr/LY1p3HBP8R5/XPZmD+TM6aM53zz57DnOl9rDxrNnNnTmXZgpksmjO9I22TpFNh0Ldw8M3DPDGyi0ee38nW1/azY88Bntq2m9cPHmZk5/4T3nbx3OmcM3cG0Fg5c/nCWUePLVs46+hqmkDjiePMY58gBubPPPqtWJJ0qtoN+r4OnfwDwJeAKcA3Ukqf68R5TsbUKWdw0fIFR+ffN9v/xpvsP/gmW17Zy659B9m26wDbdx8AGn95u//gmwBseXkvz726j0de2ElKsGt/+7N8zpz+m7/y/jOnc+78mWPWnzltCr+1eM4J73PZgmOfZE7kwoF59I1jqGp63xTmzfKzDanOJjzoI2IK8BXgvcBW4KcRsSGl9NREn2uizZw2hZnTprQdmkfs2neQnfvfOLo/snM/o3uOHSJ6/pV9vLbvN58Qhkd/xd7XDx19Emm2bed+dux5nYd+vqPluQ8d7vw7sgWzpk7IO5HxPCFVmTV9CqvOPnNC7msiNJ6sZ0zqOd/aP4czZ3TktZpOwYy+KZzRY5/7deJRcgkwnFLaDBAR/w6sAXo+6E/WvFlTj3nV+5ZFsyft3Dt2H2DHnva+SvGpF3dz8HD7U04PvZl4YmQXE/EFXk+M7GbX/oPjevfTyos797PHv4JWj4qA8/tP/C682ccvX8UH335uB1vUmaAfAF5o2t8KvOP4ShGxDlgHsHz58g404/Rw9twZnD23vVeSFw7M63BrJs+BMd4Bdcure9/g+Vf3Teo5n3tlLzvHeIeo7npx535Gx/kd1vNmdn5otGvv+1JK64H10PgwtlvtUD310ofa586f2fIzlk5Zfd6iST2f6q0Tfxk7Aixr2l+ayyRJXdCJoP8psCoiVkbENOAaYEMHziNJasOED92klA5FxMeAH9CYXvnNlNKTE30eSVJ7OjJGn1K6F7i3E/ctSRqfYlavlCSNzaCXpMIZ9JJUOINekgrXE6tXRsQo8NxJ3vws4OUJbE4d2OfTg30+PZxKn9+SUuqvqtQTQX8qImKonWU6S2KfTw/2+fQwGX126EaSCmfQS1LhSgj69d1uQBfY59ODfT49dLzPtR+jlySdWAmv6CVJJ2DQS1Lhah30EfGBiHgmIoYj4sZut2e8ImJLRDweEY9GxFAuWxgR90fEs/nnglweEXFr7utjEXFx0/2szfWfjYi1TeV/kO9/ON920r/IMiK+GRE7IuKJprKO97HVObrY589GxEi+1o9GxJVNx27K7X8mIt7fVD7m4zsvAb4xl9+ZlwMnIqbn/eF8fMUkdZmIWBYRD0XEUxHxZER8IpcXe61P0Ofeu9YppVr+o7EE8i+B84BpwM+AC7rdrnH2YQtw1nFlXwBuzNs3Ap/P21cC/wUEsBrYmMsXApvzzwV5e0E+9pNcN/Jtr+hCHy8DLgaemMw+tjpHF/v8WeDvxqh7QX7sTgdW5sf0lBM9voHvAtfk7a8Cf5m3/wr4at6+BrhzEvu8BLg4b58J/CL3rdhrfYI+99y1ntT/9BP8S74U+EHT/k3ATd1u1zj7sIXfDPpngCVND6Rn8vbXgI8cXw/4CPC1pvKv5bIlwM+byo+pN8n9XMGxodfxPrY6Rxf73Oo//zGPWxrf43Bpq8d3DrmXgb5cfrTekdvm7b5cL7p0ze8B3ns6XOsx+txz17rOQzdjfQn5QJfacrIScF9EbIrGl6UDLE4pbcvbLwGL83ar/p6ofOsY5b1gMvrY6hzd9LE8TPHNpuGF8fZ5EbAzpXTouPJj7isf35XrT6o8jHARsJHT5Fof12fosWtd56AvwbtSShcDVwDXR8RlzQdT4+m66Pmvk9HHHvk93ga8Ffh9YBvwxa62pkMiYg7wfeCTKaXdzcdKvdZj9LnnrnW
dg772X0KeUhrJP3cA/wFcAmyPiCUA+eeOXL1Vf09UvnSM8l4wGX1sdY6uSCltTym9mVI6DHydxrWG8ff5FWB+RPQdV37MfeXj83L9SRERU2kE3rdTSnfn4qKv9Vh97sVrXeegr/WXkEfE7Ig488g28D7gCRp9ODLTYC2NcT9y+bV5tsJqYFd+u/oD4H0RsSC/RXwfjXG8bcDuiFidZydc23Rf3TYZfWx1jq44EkTZn9K41tBo5zV5FsVKYBWNDx3HfHznV6wPAVfn2x//+zvS56uBB3P9jsu//9uBp1NKNzcdKvZat+pzT17rbnxoMYEfflxJ45PuXwKf7nZ7xtn282h8uv4z4Mkj7acxzvYA8CzwP8DCXB7AV3JfHwcGm+7rL4Dh/O+jTeWD+UH2S+DLdOGDOeA7NN6+HqQxxnjdZPSx1Tm62Od/y316LP8nXdJU/9O5/c/QNDOq1eM7P3Z+kn8X3wOm5/IZeX84Hz9vEvv8LhpDJo8Bj+Z/V5Z8rU/Q55671i6BIEmFq/PQjSSpDQa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKtz/A1/NmoIeUlAfAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户点击新闻数量的分布" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "user_click_item_count = sorted(user_click_merge.groupby('user_id')['click_article_id'].count(), reverse=True)\n", - "plt.plot(user_click_item_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以根据用户的点击文章次数看出用户的活跃度" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#点击次数在前50的用户\n", - "plt.plot(user_click_item_count[:50])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "点击次数排前50的用户的点击次数都在100次以上。思路:我们可以定义点击次数大于等于100次的用户为活跃用户,这是一种简单的处理思路, 判断用户活跃度,更加全面的是再结合上点击时间,后面我们会基于点击次数和点击时间两个方面来判断用户活跃度。" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXEAAAD4CAYAAAAaT9YAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAARV0lEQVR4nO3dfYxc1X3G8eexd7ExEDAYjEPYrkOQFZekKUxT2lKgJQHHSuWGphJIDaRYWaUBKUitKJQqRWlTNYnaSFWippvaMonASZsUGSVtg4tSXKkYYqd+WQqYlwLxSzAvcYgIBYxP/5i7u6Nl786dmTt7z5n7/UjWzt6Z3fmdnfGjM+ece65DCAIApGlB1QUAALpHiANAwghxAEgYIQ4ACSPEASBhQ/P5ZMuWLQujo6Pz+ZQAkLydO3c+H0I4fbb75jXER0dHtWPHjvl8SgBInu2n8+5jOAUAEkaIA0DCCHEASBghDgAJI8QBIGFtQ9z2RtuHbU/Mct8f2g62l/WnPADAXIr0xDdJWjPzoO2zJV0u6ZmSawIAFNR2nXgIYZvt0Vnu+oKkmyRtKbuome59+Fnt/uGRfj/Nm5yy5Dh99FdHtWCB5/25AaCIrk72sb1O0oEQwm577oCzPSZpTJJGRka6eTrdt+85fW177lr3vpjcZv2SVafrnNNPnNfnBoCiOg5x20sk/YmaQylthRDGJY1LUqPR6OoKFJ9ed54+ve68bn60a9/ec1A33PnfeuMYF80AEK9uVqecI2mlpN22n5L0Nkk/sH1mmYXFggsfAYhZxz3xEMJeSWdMfp8FeSOE8HyJdVXOYhwcQPyKLDHcLOl+Sats77e9vv9lVW9yqD+IrjiAeBVZnXJ1m/tHS6smIpP9cIZTAMSMMzZzTPXECXEAESPE22A4BUDMCPFcTGwCiB8hnoPhFAApIMRz0A8HkAJCvA164gBiRojnaLcnDADEgBDPMbVOnNUpACJGiOdgYhNACgjxHNOn3QNAvAhxAEgYIZ5jchfDwHgKgIgR4nkYTgGQAEI8B7sYAkgBIZ5jep04KQ4gXoQ4ACSMEM/BcAqAFBDiOVgnDiAFhHiO6SWGFRcCAHMgxHOw/xWAFBDibXCyD4CYEeI5WGAIIAWEeB52MQSQAEI8x9TEJn1xABEjxHMwsQkgBYR4O3TEAUSMEM/BxCaAFBDiOSY3wGJiE0DMCPEcjIkDSAEhnoOr3QNIASHeBsMpAGLWNsRtb7R92PZEy7E/t73H9i7b99h+a3/LnH/sYgggBUV64pskrZlx7PMhhHeHEN4j6duSPlVyXRFgUBxA/IbaPSCEsM326IxjL7V8e4IGsMM6tKAZ4us3fV8LIp/lPGHRQm25/iK
NnLak6lIAzLO2IZ7H9mckXSPpJ5J+Y47HjUkak6SRkZFun27erX7rW3TTmlX66f8drbqUOR088oq27DqoA0deIcSBGuo6xEMIt0q61fYtkm6Q9Gc5jxuXNC5JjUYjmR778MIF+sSl76i6jLa2P/mCtuw6yCoaoKbKWJ1yh6TfKeH3oAtTAz1kOFBLXYW47XNbvl0n6ZFyykG3yHCgntoOp9jeLOlSScts71dz2GSt7VWSjkl6WtLH+1kk8rE9AFBvRVanXD3L4Q19qAVdiHzhDIA+44zNxLE9AFBvhHjizGXkgFojxAcEGQ7UEyGevMmJTWIcqCNCPHFMbAL1RognjsvIAfVGiCfO7JkL1BohPiBYYgjUEyGeuKnhFDIcqCVCPHFMbAL1RognzmLvFKDOCPHEMa8J1BshPiA42QeoJ0J8QBDhQD0R4oljYhOoN0I8cUxsAvVGiCfOXGQTqDVCfEDQEwfqiRBPHEsMgXojxBNnMbMJ1BkhnjguzwbUGyGeOC6UDNQbIT4g6IkD9USIJ46JTaDeCPHkMbEJ1BkhnrjpiU364kAdEeKJox8O1BshPiDoiAP1RIgnbvJq9ywxBOqJEE8cwylAvbUNcdsbbR+2PdFy7PO2H7G9x/Zdtk/pa5XIxRmbQL0V6YlvkrRmxrGtks4LIbxb0j5Jt5RcFwpiP3Gg3obaPSCEsM326Ixj97R8u13Sh0uuCx26b99zOvLK61WX0bMVJy/W2netqLoMIBltQ7yA6yR9I+9O22OSxiRpZGSkhKdDq5OXDOvERUO6e/dB3b37YNXllGLvbZfrpMXDVZcBJKGnELd9q6Sjku7Ie0wIYVzSuCQ1Gg0+9Jfs5OOHteNP36dXjx6rupSe3fnAM/rsvz2io2/wNgGK6jrEbX9U0gclXRY4XbBSi4cXavHwwqrL6Nnxw80pGt5MQHFdhbjtNZJuknRJCOFn5ZaEurJZMAl0qsgSw82S7pe0yvZ+2+slfVHSSZK22t5l+8t9rhM1wD4wQOeKrE65epbDG/pQC2pu+gIXAIrijE1Eh444UBwhjniwDwzQMUIc0WBaE+gcIY5omEFxoGOEOKJDhgPFEeKIBpt5AZ0jxBGNqXXi9MWBwghxRIOJTaBzhDiiwQUugM4R4ogOGQ4UR4gjGtMTm8Q4UBQhjngwnAJ0jBBHNJjYBDpHiANAwghxRGPyohAMpwDFEeKIxvTWKaQ4UBQhjmiwThzoHCEOAAkjxBGN6b1TABRFiCManOwDdI4QRzToiQOdI8QRHTriQHGEOAAkjBBHNMxFNoGOEeKIxlSEk+FAYYQ4osHEJtA5QhzRoScOFEeIIxpmM1qgY4Q4osHV7oHOEeKIBhObQOcIcUSDXQyBzrUNcdsbbR+2PdFy7HdtP2T7mO1Gf0tE3TCcAhRXpCe+SdKaGccmJF0paVvZBaHOmNgEOjXU7gEhhG22R2cce1hqPcMO6N2C7O30e//wgIYWDv5I37lnnKg7P3Zh1WUgcW1DvFe2xySNSdLIyEi/nw4J++WVp+m6X1upV15/o+pS+m7vgSP6rydeqLoMDIC+h3gIYVzSuCQ1Gg0GO5Hr5CXD+tRvra66jHnxha37NHHgparLwAAY/M+sQISmV+LQr0FvCHGgQmQ4elVkieFmSfdLWmV7v+31tj9ke7+kX5H0Hdvf7XehwCCZuhRdxXUgfUVWp1ydc9ddJdcC1AYLu1AWhlOACkxvMUBfHL0hxIEKsHc6ykKIAxWiI45eEeJABSbPdmafGPSKEAeAhBHiQAXYdhdlIcSBCnApOpSFEAcqRE8cvSLEgQpwPVGUhRAHKsBgCspCiAMVYGITZSHEgQqwARbKQogDFWLvFPSKEAcqwN4pKAshDgAJI8SBCkztnUJXHD0ixIEKTC0xJMTRI0IcqBAn+6BXhDhQAdaJoyyEOFABzthEWQhxoALTF4UAekOIAxWYHk4hxtEbQhyoEBGOXhHiQAUmx8TpiKNXhDhQBTO
1iXIQ4kAFpnriDKigR4Q4UAFPpzjQE0IcqBAZjl4R4kAFpi4KQYqjR4Q4UAHmNVGWtiFue6Ptw7YnWo6danur7ceyr0v7WyYwWJjYRFmK9MQ3SVoz49jNku4NIZwr6d7sewAFsQEWyjLU7gEhhG22R2ccXifp0uz27ZL+Q9Ifl1kYUAf/sveQli45ruoyorRoeIHev3q5Fg0trLqUqLUN8RzLQwiHsts/krQ874G2xySNSdLIyEiXTwcMlmUnLpIk/cV3Hq64krj9/Ucu0BU/f2bVZUSt2xCfEkIItnM/FIYQxiWNS1Kj0eDDIyDpsncu1/ZbLtNrR49VXUqUnn7xZX1kw4N6lb9PW92G+LO2V4QQDtleIelwmUUBdXDmyYurLiFar73RDG92eWyv2yWGd0u6Nrt9raQt5ZQDACzB7ESRJYabJd0vaZXt/bbXS/orSe+3/Zik92XfA0ApyPDiiqxOuTrnrstKrgUAJLVc+YjRlLY4YxNAtDgZqj1CHEB0uGhGcYQ4gOgwsVkcIQ4gOuzyWBwhDiA6U3vLVFtGEghxANHiZJ/2CHEA0SLC2yPEAUSHic3iCHEA0TGD4oUR4gCiw5WPiiPEAUSLec32CHEA0WE0pThCHEB0zD6GhRHiAKLDhaSLI8QBRIeJzeIIcQDRoifeHiEOID5MbBZGiAOIDhObxRHiAKJjrgpRGCEOIDrTE5tohxAHEC064u0R4gCiM321e1K8HUIcQHSY1iyOEAcQHfZOKY4QBxAdLpRcHCEOIFpkeHuEOID4TG2ARYy3Q4gDiA7X2CyOEAcQHTK8OEIcQHSm14lXXEgCCHEA0WI/8fZ6CnHbn7Q9Yfsh2zeWVBOAmmP/q+K6DnHb50n6mKT3SvoFSR+0/Y6yCgNQX0xsFjfUw8++U9IDIYSfSZLt+yRdKelzZRQGoL4mT/b5yn8+qW/u3F9xNeX4yyvfpV8aPbX039tLiE9I+ozt0yS9ImmtpB0zH2R7TNKYJI2MjPTwdADqYvHwAn38knP0zIsvV11KaY4fXtiX3+teFtPbXi/pE5JelvSQpFdDCDfmPb7RaIQdO96U8wCAOdjeGUJozHZfTxObIYQNIYQLQggXS/qxpH29/D4AQGd6GU6R7TNCCIdtj6g5Hn5hOWUBAIroKcQlfSsbE39d0vUhhCO9lwQAKKqnEA8h/HpZhQAAOscZmwCQMEIcABJGiANAwghxAEhYTyf7dPxk9nOSnu7yx5dJer7EclJAm+uBNtdDL23+uRDC6bPdMa8h3gvbO/LOWBpUtLkeaHM99KvNDKcAQMIIcQBIWEohPl51ARWgzfVAm+uhL21OZkwcAPBmKfXEAQAzEOIAkLAkQtz2GtuP2n7c9s1V19ML20/Z3mt7l+0d2bFTbW+1/Vj2dWl23Lb/Nmv3Htvnt/yea7PHP2b72qraMxvbG20ftj3Rcqy0Ntq+IPsbPp79bOVXZMxp8222D2Sv9S7ba1vuuyWr/1HbV7Qcn/W9bnul7Qey49+wfdz8tW52ts+2/T3b/5NdLP2T2fGBfa3naHN1r3UIIep/khZKekLS2yUdJ2m3pNVV19VDe56StGzGsc9Jujm7fbOkz2a310r6VzUv/n2hmtc0laRTJT2ZfV2a3V5addta2nOxpPMlTfSjjZIezB7r7Gc/EGmbb5P0R7M8dnX2Pl4kaWX2/l4413td0j9Kuiq7/WVJfxBBm1dIOj+7fZKaF4VZPciv9Rxtruy1TqEn/l5Jj4cQngwhvCbp65LWVVxT2dZJuj27fbuk3245/tXQtF3SKbZXSLpC0tYQwoshhB9L2ippzTzXnCuEsE3SizMOl9LG7L63hBC2h+a7/Kstv6syOW3Os07S10MIr4YQ/lfS42q+z2d9r2e9z9+U9M3s51v/fpUJIRwKIfwgu/1TSQ9LOksD/FrP0eY8fX+tUwjxsyT9sOX7/Zr7jxa7IOke2zvdvIi0JC0PIRzKbv9I0vLsdl7bU/yblNX
Gs7LbM4/H6oZs6GDj5LCCOm/zaZKOhBCOzjgeDdujkn5R0gOqyWs9o81SRa91CiE+aC4KIZwv6QOSrrd9ceudWY9joNd91qGNmb+TdI6k90g6JOmvK62mT2yfKOlbkm4MIbzUet+gvtaztLmy1zqFED8g6eyW79+WHUtSCOFA9vWwpLvU/Fj1bPbRUdnXw9nD89qe4t+krDYeyG7PPB6dEMKzIYQ3QgjHJH1Fzdda6rzNL6g59DA043jlbA+rGWZ3hBD+OTs80K/1bG2u8rVOIcS/L+ncbMb2OElXSbq74pq6YvsE2ydN3pZ0uaQJNdszOSN/raQt2e27JV2TzepfKOkn2cfU70q63PbS7GPb5dmxmJXSxuy+l2xfmI0fXtPyu6IyGWSZD6n5WkvNNl9le5HtlZLOVXMCb9b3etab/Z6kD2c/3/r3q0z2998g6eEQwt+03DWwr3Vemyt9rauc6S36T81Z7X1qzubeWnU9PbTj7WrOQu+W9NBkW9QcB7tX0mOS/l3SqdlxS/pS1u69khotv+s6NSdJHpf0+1W3bUY7N6v5kfJ1Ncf01pfZRkmN7D/JE5K+qOzM4wjb/LWsTXuy/8wrWh5/a1b/o2pZcZH3Xs/eOw9mf4t/krQogjZfpOZQyR5Ju7J/awf5tZ6jzZW91px2DwAJS2E4BQCQgxAHgIQR4gCQMEIcABJGiANAwghxAEgYIQ4ACft/AbwTsfQSxAYAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#点击次数排名在[25000:50000]之间\n", - "plt.plot(user_click_item_count[25000:50000])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以看出点击次数小于等于两次的用户非常的多,这些用户可以认为是非活跃用户" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 新闻点击次数分析" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:42:14.526476Z", - "start_time": "2020-11-13T15:42:14.463642Z" - } - }, - "outputs": [], - "source": [ - "item_click_count = sorted(user_click_merge.groupby('click_article_id')['user_id'].count(), reverse=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T15:42:16.198000Z", - "start_time": "2020-11-13T15:42:16.044455Z" - } - }, - "outputs": [ + }, { - "data": { - "text/plain": [ - "[]" + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:40:04.296033Z", + "start_time": "2020-11-13T15:40:03.980868Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAASw0lEQVR4nO3da4yc1X3H8e8fr+/GVxZj1nZsgnshVCl0RYyS8iLkBm1qKpGIqCpWimSpJU1SWjXQvEjUV0nUQEMTkTghFamilIRQYVW0gQJRlRdxsgbCNYSNa8CLsZeLL/EFbHz6Yo6dsbPjZ9be2Znn+PuRrH2e85yZ55x9xr+ZOXP2TKSUkCSV64xuN0CS1FkGvSQVzqCXpMIZ9JJUOINekgrX1+0GAJx11llpxYoV3W6GJNXKpk2bXk4p9VfV64mgX7FiBUNDQ91uhiTVSkQ81049h24kqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSpcrYP+p1te5eb7nuGNQ4e73RRJ6lm1DvqHn3uNWx8c5tBhg16SWql10EuSqhn0klQ4g16SCmfQS1Lhigh6v99cklqrddBHdLsFktT7ah30kqRqBr0kFc6gl6TCGfSSVLgigt5JN5LUWq2DPnDajSRVqXXQS5KqGfSSVDiDXpIKV0TQJ9dAkKSWah30LoEgSdVqHfSSpGoGvSQVzqCXpMIZ9JJUuCKC3jk3ktRaEUEvSWrNoJekwhn0klS4toI+Iv4mIp6MiCci4jsRMSMiVkbExogYjog7I2Jarjs97w/n4ys62gNJ0glVBn1EDAAfBwZTShcCU4BrgM8Dt6SUzgdeA67LN7kOeC2X35LrSZK6pN2hmz5gZkT0AbOAbcC7gbvy8TuAq/L2mrxPPn55RGcXK3CpG0lqrTLoU0ojwD8Bz9MI+F3AJmBnSulQrrYVGMjbA8AL+baHcv1Fx99vRKyLiKGIGBodHT2pxnf4+UOSitDO0M0CGq/SVwLnArOBD5zqiVNK61NKgymlwf7+/lO9O0lSC+0M3bwH+L+U0mhK6SBwN/BOYH4eygFYCozk7RFgGUA+Pg94ZUJbLUlqWztB/zywOiJm5bH2y4GngIeAq3OdtcA9eXtD3icffzC5YLwkdU07Y/QbaXyo+jDweL7NeuBTwA0RMUxjDP72fJPbgUW5/Abgxg60W5LUpr7qKpBS+gzwmeOKNwOXjFH3APChU2/aOPh+QZJaqvVfxjrnRpKq1TroJUnVDHpJKpxBL0mFM+glqXBFBH1y2o0ktVTroHepG0mqVuuglyRVM+glqXAGvSQVzqCXpMIVEfSujSlJrdU66J10I0nVah30kqRqBr0kFc6gl6TCFRH0fhYrSa3VOujDNRAkqVKtg16SVM2gl6TCGfSSVDiDXpIKV0TQJ9dAkKSWah30TrqRpGq1DnpJUjWDXpIKZ9BLUuEMekkqXBFB75wbSWqt1kHvpBtJqlbroJckVTPoJalwBr0kFa6toI+I+RFxV0T8PCKejohLI2JhRNwfEc/mnwty3YiIWyNiOCIei4iLO9sFSdKJtPuK/kvAf6eUfgd4O/A0cCPwQEppFfBA3ge4AliV/60DbpvQFo/BpW4kqbXKoI+IecBlwO0AKaU3Uko7gTXAHbnaHcBVeXsN8K3U8GNgfkQsmeB2H2lcR+5WkkrSziv6lcAo8K8R8UhEfCMiZgOLU0rbcp2XgMV5ewB4oen2W3OZJKkL2gn6PuBi4LaU0kXAXn49TANAaqwTPK4BlIhYFxFDETE0Ojo6nptKksahnaDfCmxNKW3M+3fRCP7tR4Zk8s8d+fgIsKzp9ktz2TFSSutTSoMppcH+/v6Tbb8kqUJl0KeUXgJeiIjfzkWXA08BG4C1uWwtcE/e3gBcm2ffrAZ2NQ3xSJImWV+b9f4a+HZETAM2Ax+l8STx3Yi4DngO+HCuey9wJTAM7Mt1Oyq52o0ktdRW0KeUHgUGxzh0+Rh1E3D9qTWrPc6
5kaRq/mWsJBXOoJekwhn0klQ4g16SCldG0DvpRpJaqnXQu9SNJFWrddBLkqoZ9JJUOINekgpXRND7WawktVbroA8XQZCkSrUOeklSNYNekgpn0EtS4Qx6SSpcEUGfnHYjSS3VOuhdAkGSqtU66CVJ1Qx6SSqcQS9JhTPoJalwRQR9crUbSWqp1kHvpBtJqlbroJckVTPoJalwBr0kFc6gl6TCFRH0rnUjSa3VOuhd60aSqtU66CVJ1Qx6SSqcQS9JhTPoJalwRQS9k24kqbW2gz4ipkTEIxHxn3l/ZURsjIjhiLgzIqbl8ul5fzgfX9GhthOudiNJlcbziv4TwNNN+58HbkkpnQ+8BlyXy68DXsvlt+R6kqQuaSvoI2Ip8EfAN/J+AO8G7spV7gCuyttr8j75+OW5viSpC9p9Rf/PwN8Dh/P+ImBnSulQ3t8KDOTtAeAFgHx8V65/jIhYFxFDETE0Ojp6cq2XJFWqDPqI+GNgR0pp00SeOKW0PqU0mFIa7O/vn8i7liQ16WujzjuBP4mIK4EZwFzgS8D8iOjLr9qXAiO5/giwDNgaEX3APOCVCW95k+RiN5LUUuUr+pTSTSmlpSmlFcA1wIMppT8DHgKuztXWAvfk7Q15n3z8wdSpJHbkX5Iqnco8+k8BN0TEMI0x+Ntz+e3Aolx+A3DjqTVRknQq2hm6OSql9EPgh3l7M3DJGHUOAB+agLZJkiZAEX8ZK0lqrYig97NYSWqt1kHvZ7GSVK3WQS9JqmbQS1LhDHpJKpxBL0mFM+glqXC1DnpXP5akarUOeklSNYNekgpn0EtS4Qx6SSpcEUHvWjeS1Fqtg945N5JUrdZBL0mqZtBLUuEMekkqnEEvSYUrIugTTruRpFZqHfQudSNJ1Wod9JKkaga9JBXOoJekwhn0klS4IoLetW4kqbVaB72zbiSpWq2DXpJUzaCXpMIZ9JJUOINekgpXRNA76UaSWqt10IffMSVJlSqDPiKWRcRDEfFURDwZEZ/I5Qsj4v6IeDb/XJDLIyJujYjhiHgsIi7udCckSa2184r+EPC3KaULgNXA9RFxAXAj8EBKaRXwQN4HuAJYlf+tA26b8FZLktpWGfQppW0ppYfz9h7gaWAAWAPckavdAVyVt9cA30oNPwbmR8SSiW64JKk94xqjj4gVwEXARmBxSmlbPvQSsDhvDwAvNN1say47/r7WRcRQRAyNjo6Ot93HSK6BIEkttR30ETEH+D7wyZTS7uZjqZG040rblNL6lNJgSmmwv79/PDdtatNJ3UySTittBX1ETKUR8t9OKd2di7cfGZLJP3fk8hFgWdPNl+YySVIXtDPrJoDbgadTSjc3HdoArM3ba4F7msqvzbNvVgO7moZ4JEmTrK+NOu8E/hx4PCIezWX/AHwO+G5EXAc8B3w4H7sXuBIYBvYBH53IBkuSxqcy6FNKP4KWf5l0+Rj1E3D9KbZLkjRBav2XsUc450aSWisi6CVJrRn0klQ4g16SCmfQS1LhDHpJKlwRQe9SN5LUWq2DPlzsRpIq1TroJUnVDHpJKpxBL0mFM+glqXC1Dvq9rx8C4I1Dh7vcEknqXbUO+gWzpgJw2PmVktRSrYN+xtQpALzuK3pJaqnWQT+9rxH0Dt1IUmv1Dvqpjea//KvXu9wSSepdtQ76aVMazT/Dv5CVpJZqHfQLZ08DYMsre7vcEknqXbUO+vl51s3BNx2jl6RWah30s6Y1vtv82R2/6nJLJKl31TroARbNnsbTL+7udjMkqWfVPujPmTeDzS87Ri9JrdQ+6N+xchEAv9i+p8stkaTeVPug/+DblwDw9f/d3OWWSFJvqn3QX7R8AQDf27T16CJnkqRfq33QA/zjmrcB8MEv/6jLLZGk3lNE0F976QrOP3sOm0f3ctkXHmLPgYPdbpIk9Ywigh7g3o//IcsXzuL5V/fxe5+9jy/e94yLnUkSEKkH1nIfHBxMQ0NDE3JfN9//C2594Nmj++/53bN
5/9vO4f0XnsPcGVMn5ByS1AsiYlNKabCyXmlBD7DvjUP8y4PD3P3wVrbv/vXKlufMncFFy+dz4cA8Llo2n7cNzGPO9D6mnOGiaJLq57QO+mbbdx9gw6Mv8pMtr/LY1p3HBP8R5/XPZmD+TM6aM53zz57DnOl9rDxrNnNnTmXZgpksmjO9I22TpFNh0Ldw8M3DPDGyi0ee38nW1/azY88Bntq2m9cPHmZk5/4T3nbx3OmcM3cG0Fg5c/nCWUePLVs46+hqmkDjiePMY58gBubPPPqtWJJ0qtoN+r4OnfwDwJeAKcA3Ukqf68R5TsbUKWdw0fIFR+ffN9v/xpvsP/gmW17Zy659B9m26wDbdx8AGn95u//gmwBseXkvz726j0de2ElKsGt/+7N8zpz+m7/y/jOnc+78mWPWnzltCr+1eM4J73PZgmOfZE7kwoF59I1jqGp63xTmzfKzDanOJjzoI2IK8BXgvcBW4KcRsSGl9NREn2uizZw2hZnTprQdmkfs2neQnfvfOLo/snM/o3uOHSJ6/pV9vLbvN58Qhkd/xd7XDx19Emm2bed+dux5nYd+vqPluQ8d7vw7sgWzpk7IO5HxPCFVmTV9CqvOPnNC7msiNJ6sZ0zqOd/aP4czZ3TktZpOwYy+KZzRY5/7deJRcgkwnFLaDBAR/w6sAXo+6E/WvFlTj3nV+5ZFsyft3Dt2H2DHnva+SvGpF3dz8HD7U04PvZl4YmQXE/EFXk+M7GbX/oPjevfTyos797PHv4JWj4qA8/tP/C682ccvX8UH335uB1vUmaAfAF5o2t8KvOP4ShGxDlgHsHz58g404/Rw9twZnD23vVeSFw7M63BrJs+BMd4Bdcure9/g+Vf3Teo5n3tlLzvHeIeo7npx535Gx/kd1vNmdn5otGvv+1JK64H10PgwtlvtUD310ofa586f2fIzlk5Zfd6iST2f6q0Tfxk7Aixr2l+ayyRJXdCJoP8psCoiVkbENOAaYEMHziNJasOED92klA5FxMeAH9CYXvnNlNKTE30eSVJ7OjJGn1K6F7i3E/ctSRqfYlavlCSNzaCXpMIZ9JJUOINekgrXE6tXRsQo8NxJ3vws4OUJbE4d2OfTg30+PZxKn9+SUuqvqtQTQX8qImKonWU6S2KfTw/2+fQwGX126EaSCmfQS1LhSgj69d1uQBfY59ODfT49dLzPtR+jlySdWAmv6CVJJ2DQS1Lhah30EfGBiHgmIoYj4sZut2e8ImJLRDweEY9GxFAuWxgR90fEs/nnglweEXFr7utjEXFx0/2szfWfjYi1TeV/kO9/ON920r/IMiK+GRE7IuKJprKO97HVObrY589GxEi+1o9GxJVNx27K7X8mIt7fVD7m4zsvAb4xl9+ZlwMnIqbn/eF8fMUkdZmIWBYRD0XEUxHxZER8IpcXe61P0Ofeu9YppVr+o7EE8i+B84BpwM+AC7rdrnH2YQtw1nFlXwBuzNs3Ap/P21cC/wUEsBrYmMsXApvzzwV5e0E+9pNcN/Jtr+hCHy8DLgaemMw+tjpHF/v8WeDvxqh7QX7sTgdW5sf0lBM9voHvAtfk7a8Cf5m3/wr4at6+BrhzEvu8BLg4b58J/CL3rdhrfYI+99y1ntT/9BP8S74U+EHT/k3ATd1u1zj7sIXfDPpngCVND6Rn8vbXgI8cXw/4CPC1pvKv5bIlwM+byo+pN8n9XMGxodfxPrY6Rxf73Oo//zGPWxrf43Bpq8d3DrmXgb5cfrTekdvm7b5cL7p0ze8B3ns6XOsx+txz17rOQzdjfQn5QJfacrIScF9EbIrGl6UDLE4pbcvbLwGL83ar/p6ofOsY5b1gMvrY6hzd9LE8TPHNpuGF8fZ5EbAzpXTouPJj7isf35XrT6o8jHARsJHT5Fof12fosWtd56AvwbtSShcDVwDXR8RlzQdT4+m66Pmvk9HHHvk93ga8Ffh9YBvwxa62pkMiYg7wfeCTKaXdzcdKvdZj9LnnrnW
dg772X0KeUhrJP3cA/wFcAmyPiCUA+eeOXL1Vf09UvnSM8l4wGX1sdY6uSCltTym9mVI6DHydxrWG8ff5FWB+RPQdV37MfeXj83L9SRERU2kE3rdTSnfn4qKv9Vh97sVrXeegr/WXkEfE7Ig488g28D7gCRp9ODLTYC2NcT9y+bV5tsJqYFd+u/oD4H0RsSC/RXwfjXG8bcDuiFidZydc23Rf3TYZfWx1jq44EkTZn9K41tBo5zV5FsVKYBWNDx3HfHznV6wPAVfn2x//+zvS56uBB3P9jsu//9uBp1NKNzcdKvZat+pzT17rbnxoMYEfflxJ45PuXwKf7nZ7xtn282h8uv4z4Mkj7acxzvYA8CzwP8DCXB7AV3JfHwcGm+7rL4Dh/O+jTeWD+UH2S+DLdOGDOeA7NN6+HqQxxnjdZPSx1Tm62Od/y316LP8nXdJU/9O5/c/QNDOq1eM7P3Z+kn8X3wOm5/IZeX84Hz9vEvv8LhpDJo8Bj+Z/V5Z8rU/Q55671i6BIEmFq/PQjSSpDQa9JBXOoJekwhn0klQ4g16SCmfQS1LhDHpJKtz/A1/NmoIeUlAfAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "user_click_item_count = sorted(user_click_merge.groupby('user_id')['click_article_id'].count(), reverse=True)\n", + "plt.plot(user_click_item_count)" ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" }, { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以根据用户的点击文章次数看出用户的活跃度" ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(item_click_count)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(item_click_count[:100])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以看出点击次数最多的前100篇新闻,点击次数大于1000次" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(item_click_count[:20])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "点击次数最多的前20篇新闻,点击次数大于2500。思路:可以定义这些新闻为热门新闻, 这个也是简单的处理方式,后面我们也是根据点击次数和时间进行文章热度的一个划分。" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(item_click_count[3500:])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以发现很多新闻只被点击过一两次。思路:可以定义这些新闻是冷门新闻" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 新闻共现频次:两篇新闻连续出现的次数" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
count
count433597.000000
mean3.184139
std18.851753
min1.000000
25%1.000000
50%1.000000
75%2.000000
max2202.000000
\n", - "
" + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " count\n", - "count 433597.000000\n", - "mean 3.184139\n", - "std 18.851753\n", - "min 1.000000\n", - "25% 1.000000\n", - "50% 1.000000\n", - "75% 2.000000\n", - "max 2202.000000" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tmp = user_click_merge.sort_values('click_timestamp')\n", - "tmp['next_item'] = tmp.groupby(['user_id'])['click_article_id'].transform(lambda x:x.shift(-1))\n", - "union_item = tmp.groupby(['click_article_id','next_item'])['click_timestamp'].agg({'count'}).reset_index().sort_values('count', ascending=False)\n", - "union_item[['count']].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "由统计数据可以看出,平均共现次数3.18,最高为2202。\n", - "\n", - "说明用户看的新闻,相关性是比较强的。" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#画个图直观地看一看\n", - "x = union_item['click_article_id']\n", - "y = union_item['count']\n", - "plt.scatter(x, y)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD4CAYAAADvsV2wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAATdElEQVR4nO3df6xkZX3H8fe37Aq2EPmxN7pd9nKhmhgxuOB1hUANISHlV+CPYrqkRUTNNoopVlsrmiCamIhNlSpG3ApF1Cr4syuFWFqwahuW7OKy/BK9KgYQ3AVkkarU1W//mLMwd5hzZ+7MmTt3znm/ksmeOeeZOd89s/dzn32ec85EZiJJqr/fG3cBkqSlYeBLUkMY+JLUEAa+JDWEgS9JDbFiXDtetWpVzszMjGv3kjSRtm3b9mhmTg3y2rEF/szMDFu3bh3X7iVpIkXETwZ9rUM6ktQQBr4kNYSBL0kNYeBLUkMY+JLUEH0HfkTsExHfjYjru2zbNyKujYi5iNgSETOVVilJGtpievgXAveWbHsj8PPMfDHwEeDSYQuTJFWrr/PwI+JQ4HTgA8DbuzQ5C7ikWP4ScHlERI7g3sv3PfIL/m3HT0u3n/CSKdYffnDVu5WkidfvhVeXAe8EDijZvgZ4ACAz90TEbuAQ4NH2RhGxEdgIMD09PUC5MLfzKT52y1zXbZlw648f57q/PG6g95akOusZ+BFxBrAzM7dFxInD7CwzNwGbAGZnZwfq/Z9+1GpOP+r0rtv+/FO38vRvfjd4gZJUY/2M4R8PnBkR9wNfAE6KiM92tHkIWAsQESuAFwCPVVhn3/z+LknqrmfgZ+ZFmXloZs4AG4CbM/MvOpptBs4rls8u2pi9krSMDHzztIh4P7A1MzcDVwKfiYg54HFavxiWXBD4e0aSultU4GfmN4FvFssXt63/NfDaKguTJFWrVlfaRoy7AklavmoV+OCkrSSVqV3gS5K6q13gO2crSd3VLvAlSd3VKvAjwjF8SSpRq8CXJJWrVeB7VqYklatV4APO2kpSiVoFvhdeSVK5WgU+eOGVJJWpXeBLkrqrVeAHDuFLUplaBb4kqVytAj+ctZWkUrUKfIB02laSuqpV4Nu/l6RytQp8cNJWksrULvAlSd3VKvAj7OFLUplaBb4kqVzNAt9pW0kqU7PA9146klSmVoHvdVeSVK5n4EfEfhFxW0TcERF3R8T7urR5fUTsiojtxeNNoym3t3TWVpK6WtFHm6eBkzLzqYhYCXwnIm7MzFs72l2bmW+tvkRJUhV6Bn62usxPFU9XFo9l2Y12REeSyvU1hh8R+0TEdmAncFNmbunS7E8jYkdEfCki1pa8z8aI2BoRW3ft2jV41ZKkResr8DPzt5m5DjgUWB8RL+9o8nVgJjOPAm4CPl3yPpsyczYzZ6empoYouzsnbSWp3KLO0snMJ4BbgFM61j+WmU8XTz8FvLKS6gbgnK0kddfPWTpTEXFgsfx84GTgex1tVrc9PRO4t8Ia+xaO
4ktSqX7O0lkNfDoi9qH1C+K6zLw+It4PbM3MzcBfRcSZwB7gceD1oyq4F++HL0nd9XOWzg7g6C7rL25bvgi4qNrSJElVqt2Vto7hS1J3tQp8SVK5WgW+p2VKUrlaBT4s00uAJWkZqFXge1qmJJWrVeCDd8uUpDK1C3xJUnf1CvxwDF+SytQr8CVJpWoV+E7ZSlK5WgU+4JiOJJWoVeCHV15JUqlaBT7YwZekMrULfElSd7UK/MALrySpTK0CX5JUrlaB75ytJJWrVeCDk7aSVKZWgW8HX5LK1Srwwa84lKQytQt8SVJ3tQr8iCAdxZekrmoV+JKkcrUKfCdtJalcz8CPiP0i4raIuCMi7o6I93Vps29EXBsRcxGxJSJmRlJtH5y0laTu+unhPw2clJmvANYBp0TEsR1t3gj8PDNfDHwEuLTSKvtlF1+SSq3o1SBbN6d5qni6snh09qPPAi4plr8EXB4RkWO4sc3j//t//O0X7xj49RvWr+WVhx1cYUWStDz0DHyAiNgH2Aa8GPh4Zm7paLIGeAAgM/dExG7gEODRjvfZCGwEmJ6eHq7yLtbPHMytP3yM/557tHfjLh558tckGPiSaqmvwM/M3wLrIuJA4KsR8fLMvGuxO8vMTcAmgNnZ2cp7/xvWT7Nh/eC/SI7/4M0VViNJy8uiztLJzCeAW4BTOjY9BKwFiIgVwAuAxyqob8k56Suprvo5S2eq6NkTEc8HTga+19FsM3BesXw2cPM4xu8lSeX6GdJZDXy6GMf/PeC6zLw+It4PbM3MzcCVwGciYg54HNgwsopHzCt1JdVVP2fp7ACO7rL+4rblXwOvrbY0SVKVanWl7bD8AhVJdWbgd3JER1JNGfht7OFLqjMDv4MdfEl1ZeBLUkMY+G2CwMsHJNWVgS9JDWHgt3HSVlKdGfgdHNCRVFcGfhs7+JLqzMDv4JytpLoy8CWpIQz8NhHhGL6k2jLwJakhDPw2TtpKqjMDv4NX2kqqKwO/nV18STVm4Hewfy+prgx8SWoIA79NgF18SbVl4EtSQxj4bcLbZUqqMQO/QzqmI6mmDPw29u8l1VnPwI+ItRFxS0TcExF3R8SFXdqcGBG7I2J78bh4NOWOntddSaqrFX202QO8IzNvj4gDgG0RcVNm3tPR7tuZeUb1JUqSqtCzh5+ZD2fm7cXyL4B7gTWjLmwcIuzhS6qvRY3hR8QMcDSwpcvm4yLijoi4MSKOLHn9xojYGhFbd+3atfhqJUkD6zvwI2J/4MvA2zLzyY7NtwOHZeYrgI8BX+v2Hpm5KTNnM3N2ampqwJJHJ5y2lVRjfQV+RKykFfafy8yvdG7PzCcz86li+QZgZUSsqrTSJeJpmZLqqp+zdAK4Erg3Mz9c0uZFRTsiYn3xvo9VWehS8LorSXXWz1k6xwPnAndGxPZi3buBaYDMvAI4G3hzROwBfgVsyAm9sfxkVi1JvfUM/Mz8Dj2uScrMy4HLqypKklQ9r7TtYAdfUl0Z+JLUEAZ+G++WKanODPwOTtpKqisDv439e0l1ZuA/h118SfVk4EtSQxj4bbxbpqQ6M/AlqSEM/DYRjuBLqi8DX5IawsBv4/3wJdWZgd9hQm/yKUk9GfiS1BAGfhsnbSXVmYEvSQ1h4LcJvPBKUn0Z+JLUEAZ+O++HL6nGDPwOjuhIqisDX5IawsBv05q0tY8vqZ4MfElqCAO/jXO2kuqsZ+BHxNqIuCUi7omIuyPiwi5tIiI+GhFzEbEjIo4ZTbmSpEGt6KPNHuAdmXl7RBwAbIuImzLznrY2pwIvKR6vBj5R/DlR7OBLqrOegZ+ZDwMPF8u/iIh7gTVAe+CfBVyTrRnPWyPiwIhYXbx2otz50G7OvXLLuMt4jvOPn+Gkl75w3GVImmD99PCfEREzwNFAZyKuAR5oe/5gsW5e4EfERmAjwPT09CJLHb0zjvpDvr7jpzz19J5xlzLP3Q89ydQB+xr4kobSd+BHxP7Al4G3ZeaTg+ws
MzcBmwBmZ2eX3fmPbzjhcN5wwuHjLuM5Trj05nGXIKkG+jpLJyJW0gr7z2XmV7o0eQhY2/b80GKdqrLsfj1KmjT9nKUTwJXAvZn54ZJmm4HXFWfrHAvsnsTxe0mqs36GdI4HzgXujIjtxbp3A9MAmXkFcANwGjAH/BI4v/JKG8wvZpFUhX7O0vkOPc5YLM7OuaCqoiRJ1fNK2wkQXiEgqQIG/oTwpm6ShmXgTwDv8SOpCgb+hLB/L2lYBr4kNYSBPwFaX8wy7iokTToDX5IawsCfABHhGL6koRn4ktQQBv4E8KxMSVUw8CeEF15JGpaBL0kNYeBPAu+WKakCBr4kNYSBPwEC7OJLGpqBL0kNYeBPgPB2mZIqYOBPiHRMR9KQDHxJaggDfwJ4t0xJVTDwJakhDPwJEGEPX9LwDHxJaggDfwKE98uUVIGegR8RV0XEzoi4q2T7iRGxOyK2F4+Lqy9TnpYpaVgr+mhzNXA5cM0Cbb6dmWdUUpEkaSR69vAz81vA40tQi0o4aSupClWN4R8XEXdExI0RcWRZo4jYGBFbI2Lrrl27Ktq1JKkfVQT+7cBhmfkK4GPA18oaZuamzJzNzNmpqakKdt0cdvAlDWvowM/MJzPzqWL5BmBlRKwaujJJUqWGDvyIeFEUt3OMiPXFez427PvqWd4tU1IVep6lExGfB04EVkXEg8B7gZUAmXkFcDbw5ojYA/wK2JB+43blPKKShtUz8DPznB7bL6d12qYkaRnzStsJ0BrQsYsvaTgGviQ1hIE/AbzwSlIVDHxJaggDfwJ4VqakKhj4E8IRHUnDMvAngPfDl1QFA39CeC2bpGEZ+JLUEAb+BIhwDF/S8Ax8SWoIA38COGUrqQoG/oRwzlbSsAz8SeCVV5IqYOBPCDv4koZl4EtSQxj4EyDwwitJwzPwJakhDPwJ4JytpCoY+JLUEAb+BLCDL6kKBv6EcM5W0rAMfElqCAN/AkQE6aVXkobUM/Aj4qqI2BkRd5Vsj4j4aETMRcSOiDim+jIlScPqp4d/NXDKAttPBV5SPDYCnxi+LLVz0lZSFVb0apCZ34qImQWanAVck61LQW+NiAMjYnVmPlxVkYLbf/IEJ3/4v8ZdhqQK/Nmr1vKmPz5iyffbM/D7sAZ4oO35g8W65wR+RGyk9b8ApqenK9h1M5x73GF84+5Hxl2GpIqs2n/fsey3isDvW2ZuAjYBzM7OOgvZp7PWreGsdWvGXYakCVfFWToPAWvbnh9arJMkLSNVBP5m4HXF2TrHArsdv5ek5afnkE5EfB44EVgVEQ8C7wVWAmTmFcANwGnAHPBL4PxRFStJGlw/Z+mc02N7AhdUVpEkaSS80laSGsLAl6SGMPAlqSEMfElqiBjXl2NHxC7gJwO+fBXwaIXlVMnaFm+51gXWNojlWhfUo7bDMnNqkB2MLfCHERFbM3N23HV0Y22Lt1zrAmsbxHKtC6zNIR1JaggDX5IaYlIDf9O4C1iAtS3ecq0LrG0Qy7UuaHhtEzmGL0lavEnt4UuSFsnAl6SmyMyJetD6ft37aN2d810j3M/9wJ3AdmBrse5g4CbgB8WfBxXrA/hoUdMO4Ji29zmvaP8D4Ly29a8s3n+ueG0sUMtVwE7grrZ1I6+lbB991HYJre9E2F48TmvbdlGxn/uAP+n1uQKHA1uK9dcCzyvW71s8nyu2z3TUtRa4BbgHuBu4cLkctwVqG+txA/YDbgPuKOp63xDvVUm9fdR2NfDjtmO2bkw/B/sA3wWuXy7HrGuWjCowR/EoDuoPgSOA5xUf/stGtK/7gVUd6z6094AD7wIuLZZPA24s/pEdC2xp+4fyo+LPg4rlvQFzW9E2iteeukAtrwGOYX6ojryWsn30UdslwN90afuy4jPbt/jH+sPiMy39XIHrgA3F8hXAm4vltwBXFMsbgGs79rWa4occOAD4frH/sR+3BWob63Er/h77F8sraYXJsYt9ryrr7aO2q4Gzuxyzpf45
eDvwLzwb+GM/Zl2zZBRhOaoHcBzwjbbnFwEXjWhf9/PcwL8PWN32Q3tfsfxJ4JzOdsA5wCfb1n+yWLca+F7b+nntSuqZYX6ojryWsn30UdsldA+ueZ8X8I3iM+36uRY/eI8CKzo//72vLZZXFO0W+l/SvwInL6fj1qW2ZXPcgN8Hbgdevdj3qrLekuPVXtvVdA/8Jfs8aX3L338CJwHXD3L8R33M9j4mbQy/7AvTRyGBf4+IbcWXrwO8MJ/9Nq9HgBf2qGuh9Q92Wb8YS1FL2T768daI2BERV0XEQQPWdgjwRGbu6VLbM68ptu8u2j9HRMwAR9PqFS6r49ZRG4z5uEXEPhGxndYw3U20epeLfa8q620/VvNqy8y9x+wDxTH7SETs/Xbwpfw8LwPeCfyueD7I8R/JMes0aYG/lE7IzGOAU4ELIuI17Ruz9Ws1x1JZh6WoZZH7+ATwR8A64GHgH0ZUVk8RsT/wZeBtmflk+7ZxH7cutY39uGXmbzNzHa1e63rgpUtdQ5nO2iLi5bR6uy8FXkVrmObvRlzDvM8zIs4AdmbmtlHutyqTFvhL9oXpmflQ8edO4Ku0/vH/LCJWAxR/7uxR10LrD+2yfjGWopayfSwoM39W/HD+DvgnWsdukNoeAw6MiBUd6+e9V7H9BUX7Z0TESlqB+rnM/EqPv9OSHrdutS2X41bU8gStieXjBnivKut9jrbaTsnMh7PlaeCfGfyYDfp5Hg+cGRH3A1+gNazzjwv8fcZyzJ7Ra8xnOT1ojXn9iNakxt4JjCNHsJ8/AA5oW/4fWjPlf8/8yZsPFcunM3+C6LZi/cG0ziA4qHj8GDi42NY5QXRaj5pmmD9OPvJayvbRR22r25b/GvhCsXwk8yemfkRrUqr0cwW+yPyJqbcUyxcwf/Lruo6aArgGuKxj/diP2wK1jfW4AVPAgcXy84FvA2cs9r2qrLeP2la3HdPLgA+O8efgRJ6dtB37MetaY9VhOeoHrdn379MaW3zPiPZxRHFg954C9p5i/SG0Jmd+APxH2z+UAD5e1HQnMNv2Xm+gddrUHHB+2/pZ4K7iNZez8ITj52n9F/83tMbq3rgUtZTto4/aPlPsewewmflB9p5iP/fRdmZS2edafBa3FTV/Edi3WL9f8Xyu2H5ER10n0Pqv9w7aTnNcDsdtgdrGetyAo2idWrij+HtdPMR7VVJvH7XdXByzu4DP8uyZPEv6c1C0O5FnA3/sx6zbw1srSFJDTNoYviRpQAa+JDWEgS9JDWHgS1JDGPiS1BAGviQ1hIEvSQ3x/4tppPoWqYdUAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(union_item['count'].values[40000:])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "大概有75000个pair至少共现一次" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 新闻文章信息" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYoAAAD4CAYAAADy46FuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAgu0lEQVR4nO3de5Bc5X3m8e/T3XORNAhdZiwLSSAJCzsysQWoQJvYrAM2CLK2IEscsamgZCkrXkMltpNa43XFOBenbGcdymxsXBC0Fq4EjMEO2qxsLMskrmwiYISxkABZIyEFCSGNbgjd5vrbP/odcaZnpmc0F/WM5vlUdc3p37n0e6bFPLzve063IgIzM7O+5CrdADMzG90cFGZmVpaDwszMynJQmJlZWQ4KMzMrq1DpBgy3+vr6mDt3bqWbYWY2pmzcuPFARDT0tu6cC4q5c+fS2NhY6WaYmY0pknb1tc5DT2ZmVpaDwszMynJQmJlZWQ4KMzMry0FhZmZlOSjMzKwsB4WZmZXloEie3XmI//nkVto7OivdFDOzUcVBkTz/70f4m6eaONXuoDAzy3JQJLVVxV/FqbaOCrfEzGx06TcoJK2StF/S5kztO5KeT4+dkp5P9bmSTmbWfTOzzxWSXpDUJOleSUr1aZLWSdqWfk5NdaXtmiRtknT5sJ99Rk0hDzgozMxKDaRH8S1gabYQEb8VEYsiYhHwOPC9zOrtXesi4uOZ+n3Ax4AF6dF1zLuA9RGxAFifngPckNl2Zdp/xNSc7lF46MnMLKvfoIiInwKHeluXegUfBR4udwxJM4HJEbEhil/S/RBwU1q9DFidlleX1B+Kog3AlHScEVFb5R6FmVlvhjpH8X5gX0Rsy9TmSfqZpH+W9P5UmwXszmyzO9UAZkTE3rT8OjAjs8+rfezTjaSVkholNTY3Nw/qRLqCoqXdQWFmljXUoLiV7r2JvcCFEXEZ8Gng7yVNHujBUm8jzrQREXF/RCyOiMUNDb1+nHq/agvFX0WLh57MzLoZ9PdRSCoAvwFc0VWLiBagJS1vlLQduATYA8zO7D471QD2SZoZEXvT0NL+VN8DzOljn2F3eujJPQozs26G0qP4IPByRJweUpLUICmfludTnIjekYaWjkpakuY1bgOeSLutAVak5RUl9dvS1U9LgDcyQ1TD7q05CvcozMyyBnJ57MPAvwHvlLRb0u1p1XJ6TmJfDWxKl8s+Bnw8Iromwj8B/C3QBGwHfpDqXwI+JGkbxfD5UqqvBXak7R9I+48Y30dhZta7foeeIuLWPuq/20vtcYqXy/a2fSNwaS/1g8C1vdQDuKO/9g2Xt+6jcI/CzCzLd2Yn7lGYmfXOQZF4MtvMrHcOiqSm4Duzzcx646BIJFFTyNHioSczs
24cFBm1VXnPUZiZlXBQZNRW5Wjx91GYmXXjoMioKbhHYWZWykGRUVuV82S2mVkJB0VGbVXel8eamZVwUGTUeujJzKwHB0VGjYeezMx6cFBk+PJYM7OeHBQZtVV5Xx5rZlbCQZFRW8i5R2FmVsJBkVGco3BQmJllOSgyagseejIzK+WgyOiazC5+Z5KZmYGDopvaqhydAW0dDgozsy4Oigx/eZGZWU/9BoWkVZL2S9qcqX1B0h5Jz6fHjZl1n5XUJGmrpOsz9aWp1iTprkx9nqSnU/07kqpTvSY9b0rr5w7bWfehKl/8dbS7R2FmdtpAehTfApb2Ur8nIhalx1oASQuB5cC70z7fkJSXlAe+DtwALARuTdsCfDkd6x3AYeD2VL8dOJzq96TtRlQ+JwDaOz2hbWbWpd+giIifAocGeLxlwCMR0RIRrwBNwJXp0RQROyKiFXgEWCZJwDXAY2n/1cBNmWOtTsuPAdem7UdMoSso3KMwMzttKHMUd0ralIampqbaLODVzDa7U62v+nTgSES0l9S7HSutfyNt34OklZIaJTU2NzcP+oS6ehQdnQ4KM7Mugw2K+4CLgUXAXuCrw9WgwYiI+yNicUQsbmhoGPRxTs9ROCjMzE4bVFBExL6I6IiITuABikNLAHuAOZlNZ6daX/WDwBRJhZJ6t2Ol9een7UfMWz0Kz1GYmXUZVFBImpl5ejPQdUXUGmB5umJpHrAAeAZ4FliQrnCqpjjhvSaKd7Y9BdyS9l8BPJE51oq0fAvwkxjhO+FOz1G4R2Fmdlqhvw0kPQx8AKiXtBu4G/iApEVAADuB3weIiC2SHgVeBNqBOyKiIx3nTuBJIA+siogt6SU+Azwi6S+AnwEPpvqDwLclNVGcTF8+1JPtT96T2WZmPfQbFBFxay/lB3updW3/ReCLvdTXAmt7qe/graGrbP0U8Jv9tW84FfKezDYzK+U7szPyOU9mm5mVclBkFHx5rJlZDw6KDN+ZbWbWk4Miw3dmm5n15KDI8J3ZZmY9OSgyfGe2mVlPDooM35ltZtaTgyLDd2abmfXkoMjwHIWZWU8OioxCzt9wZ2ZWykGRkfdHeJiZ9eCgyPAchZlZTw6KDN+ZbWbWk4Miw3dmm5n15KDIKKQb7jxHYWb2FgdFhucozMx6clBk+M5sM7OeHBQZeblHYWZWqt+gkLRK0n5JmzO1v5L0sqRNkr4vaUqqz5V0UtLz6fHNzD5XSHpBUpOke6XiX2VJ0yStk7Qt/Zya6krbNaXXuXzYz75ELidy8hyFmVnWQHoU3wKWltTWAZdGxHuAXwCfzazbHhGL0uPjmfp9wMeABenRdcy7gPURsQBYn54D3JDZdmXaf8QVcjn3KMzMMvoNioj4KXCopPajiGhPTzcAs8sdQ9JMYHJEbIiIAB4CbkqrlwGr0/LqkvpDUbQBmJKOM6LyOblHYWaWMRxzFP8V+EHm+TxJP5P0z5Len2qzgN2ZbXanGsCMiNibll8HZmT2ebWPfbqRtFJSo6TG5ubmIZxK8con30dhZvaWIQWFpM8B7cDfpdJe4MKIuAz4NPD3kiYP9Hipt3HGf6Uj4v6IWBwRixsaGs50927yefnObDOzjMJgd5T0u8B/Aq5Nf+CJiBagJS1vlLQduATYQ/fhqdmpBrBP0syI2JuGlvan+h5gTh/7jBjPUZiZdTeoHoWkpcB/Bz4SEScy9QZJ+bQ8n+JE9I40tHRU0pJ0tdNtwBNptzXAirS8oqR+W7r6aQnwRmaIasQUcqLDQ09mZqf126OQ9DDwAaBe0m7gbopXOdUA69JVrhvSFU5XA38mqQ3oBD4eEV0T4Z+geAXVBIpzGl3zGl8CHpV0O7AL+GiqrwVuBJqAE8DvDeVEByqfk3sUZmYZ/QZFRNzaS/nBPrZ9HHi8j3WNwKW91A8C1/ZSD+CO/to33Ap5+c5sM7MM35ldwj0KM7PuHBQlCr6PwsysGwdFibyvejIz68ZBUcI9CjOz7hwUJ
TxHYWbWnYOiRPEjPHzVk5lZFwdFiULePQozsywHRYlCLuc5CjOzDAdFCc9RmJl156AoUbzqyXMUZmZdHBQl8v4+CjOzbhwUJYqf9eSgMDPr4qAokfdktplZNw6KEgVPZpuZdeOgKJH3R3iYmXXjoChRlRdtvjPbzOw0B0UJ9yjMzLpzUJQo+GPGzcy6cVCUcI/CzKy7AQWFpFWS9kvanKlNk7RO0rb0c2qqS9K9kpokbZJ0eWafFWn7bZJWZOpXSHoh7XOvJJV7jZFUvOrJcxRmZl0G2qP4FrC0pHYXsD4iFgDr03OAG4AF6bESuA+Kf/SBu4GrgCuBuzN/+O8DPpbZb2k/rzFi3KMwM+tuQEERET8FDpWUlwGr0/Jq4KZM/aEo2gBMkTQTuB5YFxGHIuIwsA5YmtZNjogNERHAQyXH6u01RozvozAz624ocxQzImJvWn4dmJGWZwGvZrbbnWrl6rt7qZd7jW4krZTUKKmxubl5kKdTlM/liIBOh4WZGTBMk9mpJzCif1nLvUZE3B8RiyNicUNDw5Bep5AXgHsVZmbJUIJiXxo2Iv3cn+p7gDmZ7WanWrn67F7q5V5jxBRyXUHhCW0zMxhaUKwBuq5cWgE8kanflq5+WgK8kYaPngSukzQ1TWJfBzyZ1h2VtCRd7XRbybF6e40Rk8+5R2FmllUYyEaSHgY+ANRL2k3x6qUvAY9Kuh3YBXw0bb4WuBFoAk4AvwcQEYck/TnwbNruzyKia4L8ExSvrJoA/CA9KPMaI6arR9Hh76QwMwMGGBQRcWsfq67tZdsA7ujjOKuAVb3UG4FLe6kf7O01RlI+X+xktXnoycwM8J3ZPcyvnwTA2k17+9nSzGx8cFCU+JWLp/O+d9Rzz4+38eaptko3x8ys4hwUJSTx6esu4Y2Tbax9wb0KMzMHRS8umzOF+Q2TePy5Pf1vbGZ2jnNQ9EISv3HZLJ555RD7j56qdHPMzCrKQdGHixvqADh4vLXCLTEzqywHRR9qqoq/mpZ2XyZrZuObg6IPNYU8AC1tHRVuiZlZZTko+lBTcI/CzAwcFH063aNwUJjZOOeg6EPXHEWrg8LMxjkHRR/eGnryHIWZjW8Oij546MnMrMhB0YfTPQpf9WRm45yDog++j8LMrMhB0YfqvIPCzAwcFH0q5HPkc/JktpmNew6KMmoKOVra3KMws/Ft0EEh6Z2Sns88jkr6pKQvSNqTqd+Y2eezkpokbZV0faa+NNWaJN2Vqc+T9HSqf0dS9eBP9czVFHIeejKzcW/QQRERWyNiUUQsAq4ATgDfT6vv6VoXEWsBJC0ElgPvBpYC35CUl5QHvg7cACwEbk3bAnw5HesdwGHg9sG2dzBqCnkPPZnZuDdcQ0/XAtsjYleZbZYBj0RES0S8AjQBV6ZHU0TsiIhW4BFgmSQB1wCPpf1XAzcNU3sHpKYq5zuzzWzcG66gWA48nHl+p6RNklZJmppqs4BXM9vsTrW+6tOBIxHRXlLvQdJKSY2SGpubm4d+NomHnszMhiEo0rzBR4DvptJ9wMXAImAv8NWhvkZ/IuL+iFgcEYsbGhqG7bjFoScHhZmNb4VhOMYNwHMRsQ+g6yeApAeAf0xP9wBzMvvNTjX6qB8EpkgqpF5Fdvuzotij8ByFmY1vwzH0dCuZYSdJMzPrbgY2p+U1wHJJNZLmAQuAZ4BngQXpCqdqisNYayIigKeAW9L+K4AnhqG9A1bty2PNzIbWo5A0CfgQ8PuZ8lckLQIC2Nm1LiK2SHoUeBFoB+6IiI50nDuBJ4E8sCoitqRjfQZ4RNJfAD8DHhxKe89UTSHHm6fa+9/QzOwcNqSgiIjjFCeds7XfKbP9F4Ev9lJfC6ztpb6D4lVRFeHLY83MfGd2WTVVvurJzMxBUYY/wsPMzEFRloeezMwcFGX5hjszMwdFWf4IDzMzB0VZNYU87Z1Be4fDwszGLwdFGV3fm93qoDCzccxBUUZ1Cgpf+WRm45mDooyaQ
h7w92ab2fjmoCija+jJl8ia2XjmoCijpqr46znloSczG8ccFGXU1RQ/CuvNU20VbomZWeU4KMqor6sB4MCxlgq3xMyschwUZTScVwyK5mOtFW6JmVnlOCjKmDapGoADb7pHYWbjl4OijKp8jqkTqzz0ZGbjmoOiH/V1NRz00JOZjWMOin7U19W4R2Fm45qDoh/15zkozGx8G3JQSNop6QVJz0tqTLVpktZJ2pZ+Tk11SbpXUpOkTZIuzxxnRdp+m6QVmfoV6fhNaV8Ntc1nor6umgMeejKzcWy4ehS/FhGLImJxen4XsD4iFgDr03OAG4AF6bESuA+KwQLcDVwFXAnc3RUuaZuPZfZbOkxtHpD6uhqOtbRzqs0f42Fm49NIDT0tA1an5dXATZn6Q1G0AZgiaSZwPbAuIg5FxGFgHbA0rZscERsiIoCHMsc6K+rripfINvsSWTMbp4YjKAL4kaSNklam2oyI2JuWXwdmpOVZwKuZfXenWrn67l7q3UhaKalRUmNzc/NQz6ebt02uBWDPkZPDelwzs7FiOILifRFxOcVhpTskXZ1dmXoCMQyv06eIuD8iFkfE4oaGhmE99qLZU5Dg2VcODetxzczGiiEHRUTsST/3A9+nOMewLw0bkX7uT5vvAeZkdp+dauXqs3upnzVTJ1XzrrdPZsMrB8/my5qZjRpDCgpJkySd17UMXAdsBtYAXVcurQCeSMtrgNvS1U9LgDfSENWTwHWSpqZJ7OuAJ9O6o5KWpKudbssc66xZMn8aG3cd9vdSmNm4NNQexQzgXyT9HHgG+L8R8UPgS8CHJG0DPpieA6wFdgBNwAPAJwAi4hDw58Cz6fFnqUba5m/TPtuBHwyxzWfsqnnTOdXWyeY9b5ztlzYzq7jCUHaOiB3Ae3upHwSu7aUewB19HGsVsKqXeiNw6VDaOVS/NPM8ALbvP84VF02rZFPMzM4635k9ALOmTKAqL3YcOF7pppiZnXUOigEo5HNcOG0irxw4VummmJmddQ6KAZpXX8fOAycq3Qwzs7POQTFA8xsm8crB43R2jugtIWZmo46DYoDm1U+itb2T197wHdpmNr44KAZofv0kAF7a+2aFW2JmdnY5KAZo0YVTqKsp8OMX91W6KWZmZ5WDYoBqCnmuedfbWPfSPto7OivdHDOzs8ZBcQaWXvp2Dh1v5UfuVZjZOOKgOAPX/tLbuHTWZD7z+CZ2+uY7MxsnHBRnoKaQ577fvoJ8Tvz+tzdyorW90k0yMxtxDoozNGfaRO5dfhlb973JveubKt0cM7MR56AYhKsvaeA3LpvFqv/3Ci/tPVrp5piZjSgHxSD98fXvpK6mwEf+5l/416YDlW6OmdmIcVAM0gVTJvCjT13N1InV/O9/3Vnp5piZjRgHxRDU19Xw4fdewD9t3c+RE62Vbo6Z2YhwUAzRTYtm0dYRfLdxd6WbYmY2IhwUQ3TprMlcfUkD9/z4FzTt9/dVmNm5Z9BBIWmOpKckvShpi6Q/TPUvSNoj6fn0uDGzz2clNUnaKun6TH1pqjVJuitTnyfp6VT/jqTqwbZ3pEjiL2++lHxO3PC1n3L7t57l5dd9JZSZnTuG0qNoB/4oIhYCS4A7JC1M6+6JiEXpsRYgrVsOvBtYCnxDUl5SHvg6cAOwELg1c5wvp2O9AzgM3D6E9o6Y2VMn8qNPXc1vX3URjbsO85nHX6D49eBmZmPfoIMiIvZGxHNp+U3gJWBWmV2WAY9EREtEvAI0AVemR1NE7IiIVuARYJkkAdcAj6X9VwM3Dba9I23m+RP4wkfezV03vIufv3qEf9raXOkmmZkNi2GZo5A0F7gMeDqV7pS0SdIqSVNTbRbwama33anWV306cCQi2kvqvb3+SkmNkhqbmyv7B/o/Xz6bi6ZP5E//zxZOtnZUtC1mZsNhyEEhqQ54HPhkRBwF7gMuBhYBe4GvDvU1+hMR90fE4ohY3NDQMNIvV1Z1Icdf3vzL7Dx4gv/1k20Vb
YuZ2XAYUlBIqqIYEn8XEd8DiIh9EdEREZ3AAxSHlgD2AHMyu89Otb7qB4Epkgol9VHvV99Rz02LLuDBf3mF1474q1PNbGwbylVPAh4EXoqIv87UZ2Y2uxnYnJbXAMsl1UiaBywAngGeBRakK5yqKU54r4nibPBTwC1p/xXAE4Nt79n2R9e9kwj4k3/YTEenJ7bNbOwaSo/iV4HfAa4puRT2K5JekLQJ+DXgUwARsQV4FHgR+CFwR+p5tAN3Ak9SnBB/NG0L8Bng05KaKM5ZPDiE9p5Vc6ZN5H/c+C7Wv7yfL//w5Uo3x8xs0HSuXca5ePHiaGxsrHQzAIgIPv/EFr69YRe3Xnkht79vHu94W12lm2Vm1oOkjRGxuLd1hd6KNjwkcfeHF3KspZ3HN+7m37Yf4Mef/o8U8r4h3szGDv/FGmGFfI57fmsR9956GTsPnuAfN+2tdJPMzM6Ig+IsuW7hDN719vP44tqXePXQiUo3x8xswBwUZ0kuJ+699TJOtXbw/q88xX95YAMHj7VUullmZv1yUJxFl8w4j8c/8Sv8wbUL2LjrMDd/419p2v9mpZtlZlaWg+Isu2TGeXz6Q5fwyMolnGhtZ8WqZ2lp90d9mNno5aCokMsunMpff3QRe46c9Jcemdmo5qCooPcvqOfyC6dw7/ptHD3VVunmmJn1ykFRQZL4/IffzYFjLfzBwz/jJy/vq3STzMx68A13FbZozhQ+9cFL+Nr6bfzT1mZ++6oLWTRnCr/+nplMrPbbY2aV54/wGCVa2zv57Pde4PHnivMVE6ryvGf2+Vw4bSIfu3o+F02fSE0hX+FWmtm5qtxHeDgoRpnW9k427T7Cmp+/xouvHeWlvUc53tpBTSHH7/3qPD75wQXUVjkwzGx4+bOexpDqQo7Fc6exeO40APYfPcWTL+7juV2H+eY/b+epl/dz+UVT+MA738bCmZOZeX6tPzvKzEaUexRjyA83v87X1m9jz+ETHD1V/IbYfE5cMKWWefV1/PF1l/Ce2VMq20gzG5M89HSOaevo5Lldh9l18AT/fqj42LDjIMda2vnwey5g9tQJTK+rYXpdNfV11cyrr2PapOpKN9vMRjEPPZ1jqvI5rpo/navmTz9d23/0FJ9/YgvrXtrHoeOtPfaZMbmGhvNqmHFeLdf80tv4zSvmUF3wkJWZ9c89inNQS3sHh4+3ceBYC83HWti+/xgv7j3K4eOt7Dp0gh3Nx7m4YRJfueW9XHHR1Eo318xGAQ892WkRwVNb9/Mn/7CF1944yZJ505k1dQLz6icxv34Ss6dOZFpdNdMmVjOh2ldXmY0XDgrr4VhLO1/78S9o3HWY146cZN/Rnh95XluVY/qkGqZOqmLqxGrePrmW+Q111BRyTKjOM2fqRCbW5Kkt5JlUk+eCKROo8hVYZmPSmJ6jkLQU+BqQB/42Ir5U4SadE+pqCnzu1xeefn6spZ1Xmo+z942THD7RysHjrRw+3sqh420cOt7CoRNtvLR3P9/dWP4DDPM5UciJ+Q11TJ9UzcTqPJNqCtRW5Xn75FqmTKxiQlWe2uo8tSlwusKotpCnpipHTSFPPqeR/hWY2QCN6qCQlAe+DnwI2A08K2lNRLxY2Zade+pqCvzy7PP55dnn97lNRHC8tYOOjuDoqTZ2Hz7JqbYOWto7OHqqndeOnKS9IzjZ1sH25mMcPVmcJzne2s7J1k4OnMEXNVXniyEyoSrPxOo8tennhOo8UyZWc8GUWqrzOQq5HIW8Tm8/MT0KuRz5vKjK5cjnRFVe1FYV989LSJBTMYyk4uduiWJNAgEIhMhl1ivVlCM9f6ve7Xi89RrKvIbZWDSqgwK4EmiKiB0Akh4BlgEOigqQRF1N8Z/M+ROrmDNt4hntf6qtg+Mt7Zxs6+BUWwcnWzs50dpO87EW3jzVnkKnk1NtHZxq6+RkazsnWjs42dbBydYOTrR28OapdnYePM6Tm1to6+xkLI6c9ho8KlkmBY8ygVQSP
JwOo+77dW1beryRDqoRj8ERfoGRbv/Z+P3/wbUL+PB7Lxj2Y4/2oJgFvJp5vhu4qnQjSSuBlQAXXnjh2WmZnbHaqvywf/xIZ2fQ1tlJa3snJ9s6ONFSDJT2zk7aO4P2jqC9s5O2jkgB1EFHZxABAXSmhaB7rWuZiGKts/izqx6nt4lMLR2v2/o+jhfd9zt9nChzvEyta5m0X9njnd525Ix0Xo/0XOqI///GiP/+iy9w/oSqETn+aA+KAYmI+4H7oTiZXeHm2FmUy4maXJ6aQp7zaqvgvEq3yOzcM9ovUdkDzMk8n51qZmZ2loz2oHgWWCBpnqRqYDmwpsJtMjMbV0b10FNEtEu6E3iS4uWxqyJiS4WbZWY2rozqoACIiLXA2kq3w8xsvBrtQ09mZlZhDgozMyvLQWFmZmU5KMzMrKxz7tNjJTUDuwa5ez1wYBibU2nn0vmcS+cC59b5+FxGrzM5n4sioqG3FedcUAyFpMa+PmZ3LDqXzudcOhc4t87H5zJ6Ddf5eOjJzMzKclCYmVlZDoru7q90A4bZuXQ+59K5wLl1Pj6X0WtYzsdzFGZmVpZ7FGZmVpaDwszMynJQJJKWStoqqUnSXZVuz5mStFPSC5Kel9SYatMkrZO0Lf2cWul29kXSKkn7JW3O1Hptv4ruTe/VJkmXV67lPfVxLl+QtCe9P89LujGz7rPpXLZKur4yre6dpDmSnpL0oqQtkv4w1cfqe9PX+Yy590dSraRnJP08ncufpvo8SU+nNn8nfUUDkmrS86a0fu6AX6z41Yvj+0HxI8y3A/OBauDnwMJKt+sMz2EnUF9S+wpwV1q+C/hypdtZpv1XA5cDm/trP3Aj8AOKXxO8BHi60u0fwLl8AfjjXrZdmP691QDz0r/DfKXPIdO+mcDlafk84BepzWP1venrfMbc+5N+x3VpuQp4Ov3OHwWWp/o3gf+Wlj8BfDMtLwe+M9DXco+i6EqgKSJ2REQr8AiwrMJtGg7LgNVpeTVwU+WaUl5E/BQ4VFLuq/3LgIeiaAMwRdLMs9LQAejjXPqyDHgkIloi4hWgieK/x1EhIvZGxHNp+U3gJYrfZT9W35u+zqcvo/b9Sb/jY+lpVXoEcA3wWKqXvjdd79ljwLWSNJDXclAUzQJezTzfTfl/PKNRAD+StFHSylSbERF70/LrwIzKNG3Q+mr/WH2/7kzDMasyw4Bj5lzSUMVlFP/Pdcy/NyXnA2Pw/ZGUl/Q8sB9YR7HHcyQi2tMm2faePpe0/g1g+kBex0Fx7nhfRFwO3ADcIenq7Moo9jfH7LXQY739wH3AxcAiYC/w1Yq25gxJqgMeBz4ZEUez68bie9PL+YzJ9yciOiJiETCbYk/nXSPxOg6Koj3AnMzz2ak2ZkTEnvRzP/B9iv9o9nV1+9PP/ZVr4aD01f4x935FxL70H3Un8ABvDV+M+nORVEXxj+rfRcT3UnnMvje9nc9Yfn8AIuII8BTwHygO93V9e2m2vafPJa0/Hzg4kOM7KIqeBRakqwWqKU70rKlwmwZM0iRJ53UtA9cBmymew4q02Qrgicq0cND6av8a4LZ0hc0S4I3MMMioVDJOfzPF9weK57I8XZEyD1gAPHO229eXNIb9IPBSRPx1ZtWYfG/6Op+x+P5IapA0JS1PAD5Ecc7lKeCWtFnpe9P1nt0C/CT1BvtX6Zn70fKgeLXGLyiO8X2u0u05w7bPp3hlxs+BLV3tpzj+uB7YBvwYmFbptpY5h4cpdvnbKI6r3t5X+yle7fH19F69ACyudPsHcC7fTm3dlP6DnZnZ/nPpXLYCN1S6/SXn8j6Kw0qbgOfT48Yx/N70dT5j7v0B3gP8LLV5M/D5VJ9PMcyagO8CNalem543pfXzB/pa/ggPMzMry0NPZmZWloPCzMzKclCYmVlZDgozMyvLQWFmZmU5KMzMrCwHhZmZlfX/ASSJjNc3PUeoAAAAAElFTkSuQmCC\n", - "text/plain": [ - 
"
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#不同类型的新闻出现的次数\n", - "plt.plot(user_click_merge['category_id'].value_counts().values)" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#出现次数比较少的新闻类型, 有些新闻类型,基本上就出现过几次\n", - "plt.plot(user_click_merge['category_id'].value_counts().values[150:])" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 1.630633e+06\n", - "mean 2.043012e+02\n", - "std 6.382198e+01\n", - "min 0.000000e+00\n", - "25% 1.720000e+02\n", - "50% 1.970000e+02\n", - "75% 2.290000e+02\n", - "max 6.690000e+03\n", - "Name: words_count, dtype: float64" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#新闻字数的描述性统计\n", - "user_click_merge['words_count'].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAEJCAYAAAB4yveGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAsUUlEQVR4nO3deZxcVZn/8c+TFUgCCaQJSAgJEkBAEcgAioMLyuYMYRQVNyJmJjMj+tNxG1BnUBBEGVFQCEYWg8gSEAUhJIQQlpi1Q8hCFtJZOnu60+l0p5N0p5fn90fd7lR3qrqWruV23+/79epXV506deu5Vbeee+ucc881d0dERKKhV7EDEBGRwlHSFxGJECV9EZEIUdIXEYkQJX0RkQhR0hcRiZCUSd/MTjOzt+L+as3sW2Z2tJnNMLM1wf8hQX0zs3vMrMzMlprZuXHLGhfUX2Nm4/K5YiIicijLZJy+mfUGtgAXADcAu9z9DjO7ERji7v9tZlcC3wCuDOrd7e4XmNnRQCkwBnBgEXCeu1fndI1ERCSpTJt3LgHWuns5MBaYHJRPBq4Obo8FHvGYecBgMzseuAyY4e67gkQ/A7i8qysgIiLp65Nh/WuBx4Pbw9x9W3B7OzAsuH0CsCnuOZuDsmTlSQ0dOtRHjhyZYYgiItG2aNGine5ekuixtJO+mfUDrgJu6viYu7uZ5WQ+BzObAEwAGDFiBKWlpblYrIhIZJhZebLHMmneuQJ40913BPd3BM02BP8rgvItwIlxzxselCUrb8fdJ7n7GHcfU1KScEclIiJZyiTpf56DTTsAzwGtI3DGAc/GlV8XjOK5EKgJmoGmA5ea2ZBgpM+lQZmIiBRIWs07ZjYA+ATw73HFdwBTzGw8UA58NiifSmzkThmwD7gewN13mdmtwMKg3i3uvqvLayAiImnLaMhmoY0ZM8bVpi8ikhkzW+TuYxI9pjNyRUQiRElfRCRClPRFRCJESV8i6Y01lZRX7S12GCIFl+kZuSI9wpcfXADAhjs+WeRIRApLR/oiIhGipC8iEiFK+iIiEaKkLyISIUr6IiIRoqQvIhIhSvoiIhGipC8iEiFK+iIiEaKkLyISIUr6IiIRoqQvIhIhSvoiIhGipC8iEiFK+iIiEaKkLyISIWklfTMbbGZPm9kqM1tpZh8ws6PNbIaZrQn+DwnqmpndY2ZlZrbUzM6NW864oP4aMxuXr5USEZHE0j3SvxuY5u6nA2cDK4EbgZnuPhqYGdwHuAIYHfxNACYCmNnRwM3ABcD5wM2tOwoRESmMlEnfzI4CLgYeBHD3A+6+GxgLTA6qTQauDm6PBR7xmHnAYDM7HrgMmOHuu9y9GpgBXJ7DdRERkRTSOdIfBVQCD5vZYjN7wMwGAMPcfVtQZzswLLh9ArAp7vmbg7Jk5SIiUiDpJP0+wLnARHc/B9jLwaYcANzdAc9FQGY2wcxKzay0srIyF4sUEZFAOkl/M7DZ3ecH958mthPYETTbEPyvCB7fApwY9/zhQVmy8nbcfZK7j3H3MSUlJZmsi4iIpJAy6bv7dmCTmZ0WFF0CrACeA1pH4IwDng1uPwdcF4ziuRCoCZqBpgOXmtmQoAP30qBMREQKpE+a9b4B/MnM+gHrgOuJ7TCmmNl4oBz4bFB3KnAlUAbsC+ri7rvM7FZgYVDvFnfflZO1EBGRtKSV9N39LWBMgocuSVDXgRuSLOch4KEM4hMRkRzSGbkiIhGipC8iEiFK+iIiEaKkLyISIUr6IiIRoqQvIhIhSvoiIhGipC8iEiFK+iIiEaKkLyISIUr6IiIRoqQvIhIhSvoiIhGipC8iEiFK+iIiEaKkLyISIUr6IiIRoqQvIhIhSvoiIhGipC8iEiFpJX0z22Bmy8zsLTMrDcqONrMZZrYm+D8kKDczu8fMysxsqZmdG7eccUH9NWY2Lj+rJCIiyWRypP9Rd3+/u48J7t8IzHT30cDM4D7AFcDo4G8CMBFiOwngZuAC4Hzg5tY
dhYiIFEZXmnfGApOD25OBq+PKH/GYecBgMzseuAyY4e673L0amAFc3oXXFxGRDKWb9B14ycwWmdmEoGyYu28Lbm8HhgW3TwA2xT13c1CWrFxERAqkT5r1PuTuW8zsWGCGma2Kf9Dd3cw8FwEFO5UJACNGjMjFIkVEJJDWkb67bwn+VwB/IdYmvyNotiH4XxFU3wKcGPf04UFZsvKOrzXJ3ce4+5iSkpLM1kZERDqVMumb2QAzG9R6G7gUWA48B7SOwBkHPBvcfg64LhjFcyFQEzQDTQcuNbMhQQfupUGZiIgUSDrNO8OAv5hZa/3H3H2amS0EppjZeKAc+GxQfypwJVAG7AOuB3D3XWZ2K7AwqHeLu+/K2ZqIiEhKKZO+u68Dzk5QXgVckqDcgRuSLOsh4KHMwxQRkVzQGbkiIhGipC8iEiFK+iIiEaKkLyISIUr6IiIRoqQvIhIhSvoiIhGipC8iEiFK+iIiEaKkLyISIUr6IiIRoqQvIhIhSvoiIhGipC8iEiFK+iIiEaKkLyISIUr6IiIRoqQvIhIhSvoiIhGipC8iEiFK+iIiEZJ20jez3ma22MyeD+6PMrP5ZlZmZk+aWb+gvH9wvyx4fGTcMm4Kyleb2WU5XxsREelUJkf63wRWxt3/OfArdz8FqAbGB+Xjgeqg/FdBPczsDOBa4EzgcuA+M+vdtfBFRCQTaSV9MxsOfBJ4ILhvwMeAp4Mqk4Grg9tjg/sEj18S1B8LPOHuDe6+HigDzs/BOoiISJrSPdL/NfB9oCW4fwyw292bgvubgROC2ycAmwCCx2uC+m3lCZ4jIiIFkDLpm9k/ARXuvqgA8WBmE8ys1MxKKysrC/GSIiKRkc6R/kXAVWa2AXiCWLPO3cBgM+sT1BkObAlubwFOBAgePwqoii9P8Jw27j7J3ce4+5iSkpKMV0hERJJLmfTd/SZ3H+7uI4l1xL7i7l8EZgHXBNXGAc8Gt58L7hM8/oq7e1B+bTC6ZxQwGliQszUREZGU+qSuktR/A0+Y2U+BxcCDQfmDwB/NrAzYRWxHgbu/bWZTgBVAE3CDuzd34fVFRCRDGSV9d38VeDW4vY4Eo2/cvR74TJLn3wbclmmQ3dHufQfYWXeAU44dWOxQsrazroG6+iZGDh1Q7FBEJEd0Rm6eXHn3G3z8rteKHUaXnH/by3zk/14tdhgikkNK+nmytaa+2CF0WYsXOwIRyTUlfRGRCFHSFxGJECV9EZEIUdIXEYkQJX0RkQhR0hcRiRAlfRGRCFHSFxGJECV9EZEIUdIXEYkQJX0RkQhR0hcRiRAlfRGRCFHSFxGJECV9EZEIUdIXEYkQJX0RkQhR0hcRiZCUSd/MDjOzBWa2xMzeNrOfBOWjzGy+mZWZ2ZNm1i8o7x/cLwseHxm3rJuC8tVmdlne1kpERBJK50i/AfiYu58NvB+43MwuBH4O/MrdTwGqgfFB/fFAdVD+q6AeZnYGcC1wJnA5cJ+Z9c7huuTE1x97k19MW1XsMLps0utr+fKD84sdRrfytT8t4pcvrS52GG121NZz7q0zKKvYU+xQ0vK3JVv5+F2v0aKLK4dayqTvMXXB3b7BnwMfA54OyicDVwe3xwb3CR6/xMwsKH/C3RvcfT1QBpyfi5XIpeeXbuO+V9cWO4wuu33qKt5Ys7PYYXQrU5dt5zevlBU7jDbTlm9n194DPDK3vNihpOU7U5ZQVlFHY0tLsUORTqTVpm9mvc3sLaACmAGsBXa7e1NQZTNwQnD7BGATQPB4DXBMfHmC54iISAGklfTdvdnd3w8MJ3Z0fnq+AjKzCWZWamallZWV+XoZEZFIymj0jrvvBmYBHwAGm1mf4KHhwJbg9hbgRIDg8aOAqvjyBM+Jf41J7j7G3ceUlJRkEp5Ij+RqIpccSmf0TomZDQ5uHw58AlhJLPlfE1QbBzwb3H4uuE/w+Cvu7kH5tcHonlHAaGBBjtZDpMcxK3YE0hVVdQ3MWLGj2GEcok/qKhw
PTA5G2vQCprj782a2AnjCzH4KLAYeDOo/CPzRzMqAXcRG7ODub5vZFGAF0ATc4O7NuV0dESkWRz9J4n3l4YUs21LD8p9cxsD+6aTawkgZibsvBc5JUL6OBKNv3L0e+EySZd0G3JZ5mCLR1d2SqaGfKAAbqvYC0ByyIaw6I1ckpJQ6JR+U9EVEIkRJP0SaW5wr7n6D6W9vL3YoBTmr8vEFG/nC7+fl/XWk6z73u7lMWbgpdUUJPSX9ENl7oImV22r57pQlxQ6Fhqb8n1V50zPLmLO2Ku+v092FYcjm/PW7+P6flxY7DMkBJX2RsNKYze4tBDvrRJT0RSQnwvCLJExa346w7buV9EVCrrvl0rAluWIL29uhpC8SUmFLFtIzKOmLiESIkr6ISB54SDs5lPRFQi6kuUPSZCHr5FDSl4RCtp1GUnf7DLRv6h6U9EUkp7rZvipvwroTVNIPEf2Ml8S0YXRnYdsJKumHUdi2EikKTVEs+aCkLyI5pd8lMWH95a6kLyI5od8liYWtQ15JXyTkwnrE2FE3CTPylPQlobAdnUSRPgPJByV9EZE8COu1jZX0RUTyKGyjsFImfTM70cxmmdkKM3vbzL4ZlB9tZjPMbE3wf0hQbmZ2j5mVmdlSMzs3blnjgvprzGxc/lZLpOfoLm360l5YP7d0jvSbgO+4+xnAhcANZnYGcCMw091HAzOD+wBXAKODvwnARIjtJICbgQuA84GbW3cUInKocB0fSrbC1jeTMum7+zZ3fzO4vQdYCZwAjAUmB9UmA1cHt8cCj3jMPGCwmR0PXAbMcPdd7l4NzAAuz+XKiEjxhHVWSWkvozZ9MxsJnAPMB4a5+7bgoe3AsOD2CcCmuKdtDsqSlUsrfWckgbB2CEr3lHbSN7OBwJ+Bb7l7bfxjHtvF52TLNLMJZlZqZqWVlZW5WGS3E7Jfg1IkYWsWkJ4hraRvZn2JJfw/ufszQfGOoNmG4H9FUL4FODHu6cODsmTl7bj7JHcf4+5jSkpKMlkXyaGwjTgQ6W7C+vssndE7BjwIrHT3u+Ieeg5oHYEzDng2rvy6YBTPhUBN0Aw0HbjUzIYEHbiXBmUiIj1PkPXD9outTxp1LgK+DCwzs7eCsh8AdwBTzGw8UA58NnhsKnAlUAbsA64HcPddZnYrsDCod4u778rFSoj0ZOoflVxKmfTdfTbJm5kvSVDfgRuSLOsh4KFMAhSJqu7WxKZ9U/egM3IlobD9JBWR3FDSl27t0xPn8KO/Lit2GHnV1SPoq347m5E3vkBFbT0X3j6Tia+uzUlc0j0p6Uu3tqi8mkfnbSx2GPmRo19bSzfXADBv/S6219bz82mrcrNg6VRYz69Q0u+CnXUNrKusK3YYeaHWnZ5HZ8x2zY7aejbt2pd2/da3O2x9M+mM3pEkPvTzV6hvbGHDHZ8sdigiBdXS4jS2tNC/T+9ih1IwF9w+EyDj73vY+sd0pJ+Gij317DvQdEh5fWNLTl8nrD8Hk2loaubjd73GnLKdxQ4Fd2djVfpHYZ2pqK2nvrGZO6ev4qZnit9fkOgAvbnFWZvDX5lVdQ3sqW9MWa/1Pf7uU0s47UfT2j2mHxLdQ49P+i0tzrenvMWSTbuzXsb5t83k6nv/nrugUrCwHRokUV61j7KKOm5+7u1ih8LjCzZx8Z2zWFTe9VM/zr99Jtc9uIB7Z63l8QXF6y/obCv49cvvcMkvX6OsIv3E39JJVj7vpy9z8S9mpVzGxXfO4uUVO3hm8SEn00s30eOTfmVdA8+8uYV/e6S0S8t5Z0d2R1XVew906XUlpr6xOeGvrVaLN1YDsLZiL7PX7OTvXfz1sWBDuM8bLN0QW9+K2vqcLbN6X+ojfYCV22pTV5LQ6vFJv5C++oeFXHH3G+3Kzrl1Bm8GCakQKmrrmb+uql3ZvgNNjLzxBaYs3JTkWV2zpqIurQ7tnXUNzFmbXTL+x1/M4oz/nc6BpsRNaq0/jhznSw/O54sPzG/3eFNzC9O
Wb8tLZ2Zzi9PSkvvltv7i66zZL5NXTbbqSzfvzmApqV+zuzbzvP5OJTVp7vjSEda3QUk/h15ZVZHwKGjF1sIdGf3Tb2bzuUnz2pXtqG0A4L5Xy3L6WvFf7o/98rWU9T/3u7l84ffzU9ZLpHJPbB1+OWN1wsdbR0gky70TX13Lfzz6JtPf3pHV63fm3T+YylX3zs75ctuadxKsUzYtgMnem6t+m5umy27SKpnQ7n0HuO6hBfz7o11rEUgkbG+Lkn4BFHKPXxEkxzBaW7m3y8vYUr0/YXlrwknW8bq1Jva8XXlqblu+pThNHsU4qg7Lkfyq7bU8vWhzTpZ1oDn2C7KsouvbaKuwDpFV0g+8vGIH33xicdv92vpGPnv/3IzG5SYV0g9/Y9U+FpUnbnpKpzO5GEd2yd7JQsayrWY/10ycU7D+mkSdpvHNWenKVRJK9pqdLX5nXQPXTJzTrg+iucUZ/4eFLFh/sP/kqdJN/DjNgQGX//oNvvvUkvSCTlv+munCQkk/8K+PlPLsW1vb7k9btp0FG3Zxz8w1XV52OFN+bCTGpyfOKXYYOdL5FyuX+91Jr6+jtLy6qCNYsjnhJ9u3oOPOIpv38vH5Gyktr+aRueVtZVV1DcxcVcENj73ZVva9p5fyhzkbsow0e63vZ0iPz3IqMkm/Yk8DD81e37YB/3nRZjZX52ZcdyqF2JC27N7PU6Xpd9Tu3tf5UWp8Snls/sa2NnWINZG8s2NPcb4gSV4z3YOpbA669h9obh9C25mW6dvb0NRuHHz13gM0NDUfUq+2vrFtlFI6sWbyGTyRo+GnhfzYl2zazazVFakrdlHIDsbzKjJJH+CW51cwu2wnTc0tfOepJVwzcW7Sut3tRKnP/W4u33t6aad14tfoU/elf4T/g78s42t/WtR2/4q7X+fSX72eaYh5leo725Ud1F3JOo8zSBTvv+Ul3vvjl9run3PrDL784IJD6r3vxy9x0R2vpFxeNknqzY27M39SkY299+9c//DC1BVzJJff+rBmkB6f9Dt+2ePPoq3Yk3qMcy6OAArRoVOZYQfuup2ZdVhVxbVft44GKoZkO+NUn1Pr87L5OPfUtz8/oPXzzGRZjc2Hxh3flh0v3fHyUJjEcsjm2wPbQPJ5oB+2HxGRm3vH3dPqWMnldp3uovL9XUq21iu31VJWUce2mv0YxiffdzzHHXlYyuX97rXCT9Gb7D1Kt407JzvxtmXl9+scpSaHQnl1dQX1jc1U7Gngug+MPOTxsI64yaXoJX0OJr90Pt5czJCX6XZU6C97xxPKnizdxEvfurh9pQTrEKZT8Qv5nrW16YcgKRcjSXXntPiVuKaii0eXMHLoACA/O/Cw7j96fPNOR+5xw906+VDy0ba3dPNuxv52NvWNzcxZu5PP3D+Hpub0Jm2bv66K55duTfp4LuNNNBQxLNtv8iP97J6XjicWbuJ7cUMDW7Jo3sm1Qg4DzGXrTpj6ypoSnK2Wj+jCcHAQr0cn/cbmFr7/5847N1PJZZv+T/62giWba1i+pYZvP7mEhRuqqaxLr338c5Pm8fXHFqeu2FkcOa4XJqmSYFuTTJap+qm4k4Da3p8QfJuLM4Aq81cNwVt1iPiY2n79d8eNP0Mpk76ZPWRmFWa2PK7saDObYWZrgv9DgnIzs3vMrMzMlprZuXHPGRfUX2Nm4/KzOu29saaS19+p7FAajk81nV8bGS0vy8cSSdRkEJa2zmw7cg9WzEEMWQzZzEZnO6jOpmjIt5BsCjl18PvYA1eug3SO9P8AXN6h7EZgpruPBmYG9wGuAEYHfxOAiRDbSQA3AxcA5wM3t+4oCi3WvFPgjtwOy3Kgl7XOFZP/jSzTV+iOm31hr04UNO8U8ei1sH0YudsiwpRTe8W9iWG7ulU+pUz67v460HFs2VhgcnB7MnB1XPkjHjMPGGxmxwOXATPcfZe7VwMzOHRHkjONzS389PkV1O4/dCreH/xlGdOWb097Wcm+XMu31LT
dTjaVQas1FXtiy0rw2KLyav44r5ym5hZun7oyrZjmrq3ivFtn8IXfH5xYLZ3vUlXdAR6cvT7l6yT6Ym6o2sfehiY+cVfyidUa4/onlmzazYwViSc3a2nxdtdpbU4yE9ik19ceMoFd0jb9VEM2g+fdn4OLgnd2GbyNVfu4a8Y7BTtivG3qyrT7hdKxqHwXj84rZ/66Kh5fsJHqvQc45YcvtqtzX9x7OPLGF7h3Vhkjb3yhrSz+81yxtZYf/GUZ98yMTfY3a3Ulzy1J3jeVD+uTDE+OXV+5vF1ZbX0TD/99fcav8VTpJv66eAt3vLgqLzOu5lK2o3eGufu24PZ2YFhw+wQg/rTQzUFZsvK8+NuSrTwwe33CYYfV+xr5j0cXJXhWe6naLf/pNwdnVfz0xDmdXkJtSulmfnHN2QeX7dAr2N1+84m3ACgZ2K9du3FnPh8k+zlrq1LUbK+uoYlbn1+R0XPiTXx1LWs6uWjH1GXbGPv+2Mc6NrjoTKL3ZdHGaibGJY7ZZTv58Kklh9S7feoqetkq1v0s9eXpUnbkBp9npucnJFxWJ6N3vjp5IWUVdVxz7nBGHHNEl16nsx1Z60NlFXVMf3sHn3zf8Rkte9Lraxkz8mjOHdH+B/enO5yweOf0xCemdVZn1uoKrnxvLJ4r72k/Mmzltlr+3+OLuersd2UUb7amLtvG1/70Jg9cN4aPnzGs3WOt8/Z86cKT2m1AP/nbCq6/aFRGrxN/YuTFo4fywVOGZh90nnW5I9djhzQ527WZ2QQzKzWz0srKju3x6WntlW/MyRFQ7n72te84ar/cRCMJiiXZUWpziqPXpgQnICXS8Uiosyaujm9LGN6lzk70qm+MTa1QyOaXVJ9LIrdPXZXWWdmdXbgmmUI0Wabr7a2xX+SrthduFtRsPo9Cyjbp7wiabQj+t06OsQU4Ma7e8KAsWfkh3H2Su49x9zElJYce/aUjF9+3fH5usRPEcvt6Oe3ITVLeK0+JLBeL7ZUquDz00fRKkNkL9X0v1JDNbNYnjO3jqdYjl29nyHN+1kn/OaB1BM444Nm48uuCUTwXAjVBM9B04FIzGxJ04F4alOXFwSsOZe9Hf40NVsrlNVLjvwwdE0aYjo6STmoWgi9ztuP0cxpDJy/a+isp5U4oJL78YHYXtelMLhLob2auaddPkHUspJcLcvlpdXytsE2tnLJN38weBz4CDDWzzcRG4dwBTDGz8UA58Nmg+lTgSqAM2AdcD+Duu8zsVqD1dLhb3D1vFyFtfYvTuWDGN59YzODD+6a13K2797O9tp7+fQ7dV8Z3CKVzWcKOnUsdZ3JMpmPTy/x1VZx94uCkG/W9s8qyunBIou002zxWs7+RQf0PbmpvrGl/ycRcfCkaklxGMR86G7LZ0vYrIL1lrd+5l1HBWaEdvbVpd9vt196p5JRjB1Lf2Myg/n3YujvxxWQy1fGz6Cib9/X7Ty9l+ZYa6hqSNw3915Nvtc3nVLmngW9PeYu+vQ5+r34545129V9ZtYOjDu/H0QP68cqqCq7/4Mh2jy8qr+a04wYxsH/7lNbY0tL2+P0FmjYk3U78uoYm6hub6W3G4CP6FmznkDLpu/vnkzx0SYK6DtyQZDkPAQ9lFF2WMnnv4ufQT+WDncx+2PrLAEh4Qlj8l3TmqkOnir0x7opPnYXf8UpBn5s0j8+fPyJp/XQ64jpKtslms1HuqW/k7J+8xISLT24r++2srly2MXF0qeZgz+0Z1q1DNg99Pw6erZvee/XR/3s16SCAh/++oe32uIcOnZEzrOoamtqN8EnkLx2m8Hjmzc6n9PjqH9pfxjB+QEJtfSOfnjiHD59awuSvnt+u3u9eWwfEdpqvHXLOzkG5TLjpbmtn3XywseMnV53JuA47snzpkWfkhuzXFBCb777VtprUs3smsynB5QLX7NiT9fISSXakks37WhvMUPm3Ag/T6yinQyiTHOlX7Klnb0P6c+HHa2puyfoqbTm5uls31tp
53tppW3RZbGqFuGZAqx6Z9EMrXx2hlt82yVaJOi5TaW3myFWfRRi6PtqmdOjwdpx/20z2Hshu9M4dL67iH38xK6t47py+mvKq3F3btfvK/luQ2+9P5htpNt+tbPXIpB+GxJA3CVbOzHLbfJFkYdm06R888zh5nRD+MOtU23z6RtuRfUeZdnr/PcNzLjrK9HoKPUoONv5ij94pZL+/kn4B5etzzfUGk3x+m/QmNWv/nOCxHH0m2S4ml5tES1vzjnHmzYkHoWX6mXT1Mwxjk2ahJPvllYlcjkzL7pSbwn2APXI+/TDm/Dunr87qiksdJUysBdpgsvlStR7pd9amXoiElY+L4nQWd1OLc/3DC9iyez8//uczUy6z48/7uRke+YdtWGAhFWoCvFz56+It7aZxAR3pd1kYZ8pbsH4XCzfE5ujpyvcz0ar1yvGnmGyYXqp2x+8+teSQsdWZTiz35MKNjLzxBXbUJu7szvVnW5HkdTqTTqf08i01zFpdyTs76vjCA6nHwnd8a7/ycGajdQrZJhxWuX4L7ns1NqdQsia8ZFJto9968i0emN1+fp9Cfnw9M+kXO4ACy/WRfi7b9Fufku5P3j8vig3dSzZJVvxiSjekf6pH/PPip4FI9jrJLN28u+12Lo+uu7qsB2evb5vvSGK6MvFZfWMzj86NnXtTvS+z81yyedXWc2mamlvyftDaM5N+CI/043WteefQdZtdtpMDOT45KdFbmNUp+WmM3onfaSWa1ybZl/ea++cmLE8lfsK9TPPCVb89mFg7nfqii236mT7/b0u2siTuZK4oif9OjLzxBX4VnNg1Ls1fS80tfsj36vT/mcaetuG3mX0Y2eSfhRuq2V5Tzyk/fJHHcjgLQCI9NOkXO4LccHeeKt3U7szG37+e+bSv2fjZi4dOv/yzF1clqNm5m4KTzlJ9JgeaWnh8wcaDnaRxX7Qzbp7Wdjvbzzb+i/hS3JTPXRlK2lkuyHSxap7putaDh7tnruGC219OebZxq3f/YCo/m3rotr0nOMck47mrstykNgTDbp9dnN9zWnpkR26IJqxMKN3wSsur+d7TS3lh2ba2sgM5nDu9M79/Izc7lxeDaxekOvr5zStr+M0rB8/Ujc+B9Y1dX+dkr55sLv905LJZreP7k4t1jopEm9aO2syGsP6xw7z68Qq1P259mXxfR7iHJv1wZ/1UUz+0HuXWBUcar67OborpYknU8bW3k7mFHltQztRl7S9s85kkTTevvVOZ9kRctz6/gv/48LsZdFgfdidpl11bWUfN/kbOfNeRaS0z3vqdya8t8MzizqcV6OjNjbszfn2JaT1C3p5Fp3w6/uvJt5i37mD/Uev2d98Xz6W+sfmQefrnr9/F5Wcdl3BZnU1V3TZRZJ7Tl4W5/XvMmDFeWlqaumIHf5xXzv/EzYXT3Rw9oB9v/s8nmLlyB+MnZ77+El6t8+zkYgZJifneZadlNcdUoZw2bBBjRg7hxitO5/tPL2379ZvM6ccNYtq3Lu7Sa5rZIncfk+ixHnmk390b9fcdaGLfgaaMr4wl4adkn3vbanIz42i+rN6xh9U79vCn+el10K7antu5tDrqmR25xQ6gi+obWzjjf6fz4OzCdNqKdGePzsvvaJdi+NR9+Rt+2yOTftgvTCwi0pk3N+5O2g/VVT0y6Svli0h39/5bZuRluT0z6Svri4gk1COT/hH9ehc7BBGRUOqRSf+kYxJfc1REJOp6ZNLvOG2piIjE9Mik39CU/OxPEZEoK3jSN7PLzWy1mZWZ2Y35eI1ehbwigYhIN1LQpG9mvYF7gSuAM4DPm9kZuX6dQYf1zfUiRUR6hEIf6Z8PlLn7Onc/ADwBjC1wDCIikVXopH8CsCnu/uagLKeymTFRuq+vf/SUYocgRVb6o4/ndfkXnXJMXpdfSKGbcM3MJgATAEaMGJHVMs4dMYQ3/+cTAPTtbQw6rC/fnvIWV551PPsbmxnQvzd7G5rZUVvP2ScOZsm
m3Vx+1nG8/s5OVm6rZe66Kq45bzj/fPa7OGZAP/YfaGbIgH4sKq+mvrGZYUf2Z099Ez99YSVPTLiQvr178cjcDazctodLTj+Wh+es51PnDOe04waxdfd+Tj/uSAYd1ofP/G4u/fv04oeffA+1+5uoa2ji8rOOo3Z/I4s37uaS9xzLvHVVnDx0IBffOSvhupUM6s9Prz6LX0xbxTXnncj2mv2cMORwzh91DEs376a8ah/nnTSEqroGqvfFpgx+Y81OBh/RlzUVddx+9XtZvrWGmv2NrN+5lwXrd/HlC0/io6cfy4U/m8mlZwzjhWXbGNCvDx87/ViufO/x9O1tjBo6gA1V+ygZ2J85a3dy0SlDWbalhmFHHsbwIYcDsP9AM1V7G3h3yUD21DdhBof17c26yr2cP+rotnXYU9/I715bx/S3t/PhU0sYVTKAC0YdQ/8+vTj2yP7sbWhmQ9Vehg85nJdXVPDx9xxLbX0TFbX1nPmuo9i5t4F+vXvR0NTMUYf3o2RQf/7t4pM5ol9vKvY00NTcwjED+7Nh515Wb99DyaD+HHtkfyr3NHDhycfQN3huU7Ozo7ae4486nDUVexh8eD8cZ/AR/dhZ18DKbbUMHdif2v2NNDY7v3r5Hc5615HMXFnRdlWlZD5z3nD+9R9PpqmlhfrGZo476nAG9u+De2z5VXUN9O/bm8P79uZAUwurttdyy/MrOOfEIexvbKa2vpFTjx3EuScNbltmRW0DTy7cxIINuzh/1NGs2lbLNeedyLAj+3P9RaN4a9NuNu3ax+hhA9td4Ssbf/rXC7jolKFt919esYMj+vVmUXk1HzntWO5/bS1ba/azOG5K6NOPG8SW6v3saWjiiH69OXpAP1panK999BR+9NflfOiUoazfuZd/PvtdnHbcQLburueD7z6G+sYWRg0dQMmg/uw70MTijbsp37WPy84YRv8+vRl4WB96B/1022r28+i8ck46ZgC1+xs50NxCv969GDqwP7/9wjl8/bHFDOrfh5e+fTG79h5g2+56ZpftpHcv41/OOYHRwwayYmstzy3ZyvgPjWLwEf0Y2L8Pjc0t7G9s5sjD+rJ8Sw0jjjmC8p37eHReORecfDSfOnc4ALX1jRzWpzf9+vRiT30je+qb2FFbz8klA6nd38jh/XrT1Oys3rGHl97e3jbJ2hcvGMFXPzSKgf370KeXsal6P/PXVXHqcYOYtmw7o4cN5KcvxC5cdMqxA7n2H07kCxdkl/9SKejUymb2AeDH7n5ZcP8mAHf/WaL62U6tLCISZZ1NrVzo5p2FwGgzG2Vm/YBrgecKHIOISGQVtHnH3ZvM7OvAdKA38JC7v13IGEREoqzgbfruPhWYWujXFRGRHnpGroiIJKakLyISIUr6IiIRoqQvIhIhSvoiIhFS0JOzMmVmlUB5FxYxFNiZo3ByRTGlL4xxKab0hTGuqMR0kruXJHog1Em/q8ysNNlZacWimNIXxrgUU/rCGJdiUvOOiEikKOmLiERIT0/6k4odQAKKKX1hjEsxpS+McUU+ph7dpi8iIu319CN9ERGJ0+2TfqoLrZtZfzN7Mnh8vpmNDElc3zazFWa21MxmmtlJxY4prt6nzczNLO8jCtKJycw+G7xXb5vZY/mOKZ24zGyEmc0ys8XBZ3hlnuN5yMwqzGx5ksfNzO4J4l1qZufmM54M4vpiEM8yM5tjZmcXO6a4ev9gZk1mdk0YYjKzj5jZW8F2/lregnH3bvtHbHrmtcDJQD9gCXBGhzpfA+4Pbl8LPBmSuD4KHBHc/s98x5VOTEG9QcDrwDxgTLFjAkYDi4Ehwf1jQ/L5TQL+M7h9BrAhzzFdDJwLLE/y+JXAi4ABFwLz8/0+pRnXB+M+uysKEVeqmOI+41eIzfh7TbFjAgYDK4ARwf28befd/Ug/nQutjwUmB7efBi4xMyt2XO4+y933BXfnAcOLHVPgVuDnQH2e40k3pn8D7nX3agB3rwhJXA60Xoz5KGB
rPgNy99eBXZ1UGQs84jHzgMFmdnw+Y0onLnef0/rZUZjtPJ33CuAbwJ+BQmxP6cT0BeAZd98Y1M9bXN096adzofW2Ou7eBNQA+b7KcaYXgB9P7Cgtn1LGFDQJnOjuL+Q5lrRjAk4FTjWzv5vZPDO7PCRx/Rj4kpltJna0+I0CxNWZTLe5YijEdp6SmZ0A/AswsdixxDkVGGJmr5rZIjO7Ll8vFLoLo0eNmX0JGAN8uMhx9ALuAr5SzDgS6EOsiecjxI4SXzez97r77mIGBXwe+IO7/zK49vMfzewsd28pclyhZGYfJZb0P1TsWIBfA//t7i35/9Gftj7AecAlwOHAXDOb5+7v5OOFurMtwIlx94cHZYnqbDazPsR+ileFIC7M7OPAD4EPu3tDkWMaBJwFvBp8EY4DnjOzq9w9X1enT+d92kysHbgRWG9m7xDbCSzMU0zpxjUeuBzA3eea2WHE5lApSHNBAmltc8VgZu8DHgCucPd8f/fSMQZ4ItjOhwJXmlmTu/+1iDFtBqrcfS+w18xeB84Gcp70897Rk+fOkT7AOmAUBzvczuxQ5wbad+ROCUlc5xDrLBwdlveqQ/1XyX9Hbjrv0+XA5OD2UGJNGMeEIK4Xga8Et99DrE3f8hzXSJJ3BH6S9h25CwqxXaUR1wigDPhgoeJJFVOHen+gAB25abxP7wFmBtveEcBy4Kx8xNGtj/Q9yYXWzewWoNTdnwMeJPbTu4xYR8q1IYnrTmAg8FRwxLHR3a8qckwFlWZM04FLzWwF0Ax8z/N8tJhmXN8Bfm9m/0WsU/crHnx788HMHifWxDU06Ee4GegbxHs/sX6FK4kl2H3A9fmKJcO4/pdYH9p9wXbe5HmeXCyNmAouVUzuvtLMpgFLgRbgAXfvdMhp1rHkcTsVEZGQ6e6jd0REJANK+iIiEaKkLyISIUr6IiIRoqQvIhIS6U4WF1c/48kINXpHRCQkzOxioI7YPEpnpag7GpgCfMzdq83sWE9jzh4d6YuIhIQnmJjNzN5tZtOCOXneMLPTg4eymoxQSV9EJNwmAd9w9/OA7wL3BeVZTUbYrc/IFRHpycxsILFrErSeuQ/QP/if1WSESvoiIuHVC9jt7u9P8FhWkxGqeUdEJKTcvZZYQv8MtF0Ws/WSk38ldpSPmQ0l1tyzLtUylfRFREIimJhtLnCamW02s/HAF4HxZrYEeJuDV3GbDlQFkxHOIs3JCDVkU0QkQnSkLyISIUr6IiIRoqQvIhIhSvoiIhGipC8iEiFK+iIiEaKkLyISIUr6IiIR8v8Bhwm8q0Q0/foAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(user_click_merge['words_count'].values)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户点击的新闻类型的偏好\n", - "\n", - "此特征可以用于度量用户的兴趣是否广泛。" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUlUlEQVR4nO3dfZBc1Xnn8e8zM3pBaCwkNBJCAiQbsKwEy8CYwoEihTG2wXGwY5dDditWHGrZsp3EjpNdw9q1dtXGu3YqNvFWsomJIaESyoGAMSQFwRhjezeJJY+MAAsEEuJFEnoZAXpBGAlJZ//oK2UkzfRtzfR097nz/VRNze3Tt/s+Z27rp9unT98bKSUkSfnrancBkqTmMNAlqSIMdEmqCANdkirCQJekiuhp5cZmz56dFi5c2MpNSlL2Vq5cuT2l1Fe2XksDfeHChQwMDLRyk5KUvYh4rpH1HHKRpIow0CWpIgx0SaoIA12SKsJAl6SKMNAlqSIMdEmqiCwC/a6HN/J3P25oGqYkTVhZBPo9q17g9oEN7S5DkjpaFoEuSSpnoEtSRWQT6F4pT5LqyyLQI6LdJUhSx8si0CVJ5Qx0SaqIbAI94SC6JNWTRaA7gi5J5bIIdElSOQNdkioim0B3Hrok1ZdFoDsNXZLKZRHokqRy2QS6Qy6SVF8mge6YiySVySTQJUllDHRJqohsAt0hdEmqL4tAd9qiJJXLItAlSeUMdEmqiGwCPTkRXZLqyiLQHUKXpHJZBLokqZyBLkkVYaBLUkVkEejOQ5ekclkEuiSpXEOBHhG/HxGrI+JnEfGtiJgaEYsiYnlErIuI2yJi8ngXK0kaWWmgR8R84PeA/pTSLwLdwNXAV4AbUkpnAi8D14xnoU5Dl6T6Gh1y6QFOiIgeYBqwGXgncEdx/y3AB5peXSGciS5JpUoDPaW0CfgT4HlqQb4TWAnsSCntL1bbCMwf7vERcW1EDETEwODgYHOqliQdo5Ehl5nAVcAi4FTgROC9jW4gpXRjSqk/pdTf19c36kIlSfU1MuTyLuCZlNJgSul14NvARcBJxRAMwAJg0zjVCEDyjOiSVFcjgf48cGFETIuIAC4DHgceAj5crLMMuHt8SnQeuiQ1opEx9OXUPvz8KfBY8Zgbgc8Cn4mIdcDJwE3jWKckqURP+SqQUvoC8IWjmtcDFzS9IknSqGTzTVHnoUtSfVkEumPoklQui0CXJJUz0CWpIrIJdIfQJam+LALdc7lIUrksAl2SVC6bQE/OW5SkuvIIdEdcJKlUHoEuSSploEtSRWQT6I6gS1J9WQS6Q+iSVC6LQJcklTPQJaki8gl0B9Elqa4sAj08f64klcoi0CVJ5Qx0SaqIbALdIXRJqi+LQHcEXZLKZRHokqRyBrokVUQ2ge750CWpviwC3WnoklQui0CXJJUz0CWpIrIJdEfQJam+
LALdIXRJKpdFoEuSyhnoklQR2QS609Alqb4sAt3zoUtSuYYCPSJOiog7ImJNRDwREe+IiFkR8UBErC1+zxzvYiVJI2v0CP3rwD+nlBYDS4EngOuAB1NKZwEPFrclSW1SGugRMQO4BLgJIKW0L6W0A7gKuKVY7RbgA+NTYk1yJrok1dXIEfoiYBD464h4OCK+GREnAnNTSpuLdbYAc4d7cERcGxEDETEwODg4qiIdQZekco0Eeg9wHvAXKaVzgT0cNbySaqdCHPYQOqV0Y0qpP6XU39fXN9Z6JUkjaCTQNwIbU0rLi9t3UAv4rRExD6D4vW18Sqxx2qIk1Vca6CmlLcCGiHhz0XQZ8DhwD7CsaFsG3D0uFYJjLpLUgJ4G1/td4NaImAysBz5G7T+D2yPiGuA54CPjU6IkqRENBXpKaRXQP8xdlzW1GknSqGXxTVFwDF2SymQR6OEguiSVyiLQJUnlDHRJqggDXZIqIotA9+y5klQui0CXJJUz0CWpIrIJ9OREdEmqK4tAdwhdksplEeiSpHIGuiRVRDaB7gi6JNWXRaA7D12SymUR6JKkcga6JFVENoHuNHRJqi+LQPd86JJULotAlySVM9AlqSKyCfTkTHRJqiuLQHceuiSVyyLQJUnlsgl0py1KUn1ZBLpDLpJULotAlySVM9AlqSKyCXSH0CWpvkwC3UF0SSqTSaBLkspkE+hOW5Sk+rII9NcPHGT7K3vbXYYkdbQsAv3nrx+gd2pPu8uQpI7WcKBHRHdEPBwR/1TcXhQRyyNiXUTcFhGTx6vIOb1TnOYiSSWO5wj9U8ATQ25/BbghpXQm8DJwTTMLG6o7ggMOoktSXQ0FekQsAN4HfLO4HcA7gTuKVW4BPjAO9QHQ3RUcOGigS1I9jR6h/ynwX4GDxe2TgR0ppf3F7Y3A/OEeGBHXRsRARAwMDg6Orsiu4KBH6JJUV2mgR8SvANtSSitHs4GU0o0ppf6UUn9fX99onqI25OIRuiTV1cjUkYuAX42IK4GpwBuArwMnRURPcZS+ANg0XkV2BZjnklRf6RF6Sun6lNKClNJC4Grg+yml/wg8BHy4WG0ZcPe4FdlV++r/QVNdkkY0lnnonwU+ExHrqI2p39Scko7VXZwQ3ZkukjSy4/q2TkrpB8APiuX1wAXNL+lYh47QDxxMTOpuxRYlKT9ZfFN0589fB2Dv/oMla0rSxJVFoJ86YyqAM10kqY4sAr2nu1bm/oMeoUvSSPII9GIMff8Bj9AlaSR5BPqhI3QDXZJGlEegHzpCd8hFkkaURaDvO1AL8hf37GtzJZLUubII9FNnnAD4TVFJqieLQJ86qVbmoSN1SdKxsgj0ScWHovv8YpEkjSiLQO/prn0o+tyLr7a5EknqXFkE+uzpUwCYMimLciWpLbJIyCk9DrlIUpksAn2ygS5JpfII9OJD0Uc27mhvIZLUwbII9ENf/T8020WSdKxsEnLJvDfw1NZX2l2GJHWsbAJ9z779nDjZyxVJ0kiyCfSz5/by6Kad7S5DkjpWNoH+2usHiHYXIUkdLJtAP/f0mezdf9ATdEnSCLIJ9JRqQf7Czp+3uRJJ6kzZBPpb5r0BgMHde9tciSR1pmwCfcYJkwBYs2V3myuRpM6UTaAvPqUXgE0vO+QiScPJJtB7p9aO0Fc8+1KbK5GkzpRNoE/u6WLxKb28+Ipj6JI0nGwCHeCkaZN4enAPe/bub3cpktRxsgr0S87uA+ClPfvaXIkkdZ6sAv3MvukA3PaTDW2uRJI6T1aB/stvrh2hv+KQiyQdI6tAn9LTTV/vFP7mX5/l1X2GuiQNlVWgA1z0ppMBvzEqSUcrDfSIOC0iHoqIxyNidUR8qmifFREPRMTa4vfM8S8XrjhnHgB/8t2nWrE5ScpGI0fo+4E/SCktAS4EPhkRS4DrgAdTSmcBDxa3x92Fb6wdoT+7fU8rNidJ2SgN9JTS5pTST4vl3cATwHzgKuCWYrVbgA+MU41HmHHC
JN6/9FQe27STex/b3IpNSlIWjmsMPSIWAucCy4G5KaVDiboFmDvCY66NiIGIGBgcHBxLrYf92rnzAfju6i1NeT5JqoKGAz0ipgN3Ap9OKe0ael+qnax82CtPpJRuTCn1p5T6+/r6xlTsIZcunsPiU3r5zqoXWPGM53aRJGgw0CNiErUwvzWl9O2ieWtEzCvunwdsG58Sh/f+pacCcNfDm1q5WUnqWI3McgngJuCJlNLXhtx1D7CsWF4G3N388kb2yUvPZOHJ01i+/kV+8GRL/y+RpI7UyBH6RcBvAu+MiFXFz5XAl4HLI2It8K7idktddOZsNrz8Kv/noadbvWlJ6jg9ZSuklP4fECPcfVlzyzk+X/rgOWzbvZdHNuzgtp88z0f6T6P2hkKSJp7svil6tCXz3sC23Xv57J2P8cLO19pdjiS1TfaB/vuXn82f/YdzAbhz5UbWbNlV8ghJqqbsAx3gjFknAvC1B57iD25/pM3VSFJ7VCLQz1kwg4HPv4srfvEUtu56jR89NcimHV5MWtLEUolAB5g9fQoLZ5/I9lf28dGbV/Cfbhlod0mS1FKVCXSAT112Fnd+/Je4fMlctu56jSe37Gb94CvUvsgqSdVWqUCfOqmb88+Yydlzp/Pinn28509/xDu/+kP+8VFP4iWp+krnoefo2kvexDnzZ/Da6wf59G2reGZwDzte3UcQzJg2qd3lSdK4iFYOR/T396eBgdaNbaeUOPvz9/H6gX/v4/VXLOY///KbWlaDJI1VRKxMKfWXrVfJI/RDIoIbP9p/+GIYNzzwFOsHvTCGpGqqdKADXPrmOfDm2vKty5/nH1Zu4DuramdojIAvvv8XuPqC09tYoSQ1R+UDfajPXfkWfvzMi4dv/92/PccjG3dy9QVtLEqSmmRCBfqli+dw6eI5h28/sHord6/axL+s2364racr+J+/ds7ha5dKUi4mVKAf7ROXnnlEmKeU+M6qFxh49iUDXVJ2Kj3LZTTO/tx9zJ4+mQUzpx3R3tUF/+U9izn/jJltqkzSRNXoLJdKfbGoGX7rooWccfKJdHfFET8/Xv8SP/TKSJI62IQechnOf7vyLcO2n/OF+/mnxzazfvvw0x7n9E7l8+97C11dXmBDUnsY6A1631vnseLZl3h887HnW9/92n4Gd+/lYxct5LRZ04Z5tCSNPwO9QV/+0FtHvO/exzbziVt/yg3fe4qZ0ybXfZ63LpjBVW+b3+zyJMlAb4az5/Yye/oUvrt6a9319u4/QO/USQa6pHFhoDfBmXOmM/D5d5Wu9+X71vDN/7uev/rR+oafu6sr+NWlp9LXO2UsJUqaAAz0FjprznT2H0x86d4njutxr+7dz+9edtY4VSWpKgz0FvrQ+Qu44pxTOHgcU//f/kff4+ENO7i7OP/MaJw+axrnnu78eanqDPQWmzb5+P7k82eewPfXbOP7a0Y/B35KTxdr/sd7iXBKpVRlBnqHu+sTv8S23XtH/fjbBzbwjR+u50drtzOlp3nfI+vuCpYuOInJTXxOSWNjoHe43qmT6J06+qssLT6lF4BlN69oVkmH/fdfWcJvX7yo6c8raXQM9Ip7/1tP5bSZ09h34GBTn3fZzStYu+2VwxcPaYepk7o5ZcbUtm1f6jQGesX1dHfRv3BW05931omT+daK5/nWiueb/tzH486Pv4Pzz2h+/6QcGegalZuWvZ2123a3bfvbdu3lf923hme2v8qSeTPaVsfRpk7q8sNntY2nz1WWtu1+jQu+9GC7yzjGh85bwFc/srTdZahivEi0Km1O71Ru+PWlbN01+hlAzXbHyo08tbV971okA13Z+uC5C9pdwhEef2EX//joC5zzxfvbXcqE9JnLz+ZjF03sWVcGutQk11y8iJOn1z/bpsbHXQ9vYuC5lw30sTw4It4LfB3oBr6ZUvpyU6qSMrT0tJNYetpJ7S5jQlr53Mv8YM02Lv/aD9tdyohuWvZ2Tj95fK+XMOpAj4hu4M+B
y4GNwE8i4p6U0uPNKk6SGnHNxYu4f/WWdpdRVyu+VT2WI/QLgHUppfUAEfH3wFWAgS6ppa5623yvM8DYLhI9H9gw5PbGou0IEXFtRAxExMDg4OAYNidJqmfc3wOklG5MKfWnlPr7+vrGe3OSNGGNJdA3AacNub2gaJMktcFYAv0nwFkRsSgiJgNXA/c0pyxJ0vEa9YeiKaX9EfE7wP3Upi3enFJa3bTKJEnHZUzz0FNK9wL3NqkWSdIYeLkZSaoIA12SKqKlp8+NiEHguVE+fDawvYnl5MA+Twz2ufrG2t8zUkql875bGuhjEREDjZwPuErs88Rgn6uvVf11yEWSKsJAl6SKyCnQb2x3AW1gnycG+1x9LelvNmPokqT6cjpClyTVYaBLUkVkEegR8d6IeDIi1kXEde2u53hFxLMR8VhErIqIgaJtVkQ8EBFri98zi/aIiP9d9PXRiDhvyPMsK9ZfGxHLhrSfXzz/uuKx0YY+3hwR2yLiZ0Paxr2PI22jjX3+YkRsKvb1qoi4csh91xf1PxkR7xnSPuzruzjx3fKi/bbiJHhExJTi9rri/oUt6u9pEfFQRDweEasj4lNFe2X3c50+d+Z+Til19A+1E389DbwRmAw8Aixpd13H2YdngdlHtf0xcF2xfB3wlWL5SuA+IIALgeVF+yxgffF7ZrE8s7hvRbFuFI+9og19vAQ4D/hZK/s40jba2OcvAn84zLpLitfuFGBR8Zrurvf6Bm4Hri6W/xL4eLH8CeAvi+Wrgdta1N95wHnFci/wVNGvyu7nOn3uyP3c0n/0o/yDvgO4f8jt64Hr213XcfbhWY4N9CeBeUNeNE8Wy98AfuPo9YDfAL4xpP0bRds8YM2Q9iPWa3E/F3JkuI17H0faRhv7PNI/9CNet9TOUvqOkV7fRaBtB3qK9sPrHXpssdxTrBdt2N93U7umcOX38zB97sj9nMOQS0OXuutwCfhuRKyMiGuLtrkppc3F8hZgbrE8Un/rtW8cpr0TtKKPI22jnX6nGGK4ecjQwPH2+WRgR0pp/1HtRzxXcf/OYv2WKd7+nwssZ4Ls56P6DB24n3MI9Cq4OKV0HnAF8MmIuGTonan2X3Cl54+2oo8d8nf8C+BNwNuAzcBX21rNOIiI6cCdwKdTSruG3lfV/TxMnztyP+cQ6Nlf6i6ltKn4vQ24C7gA2BoR8wCK39uK1Ufqb732BcO0d4JW9HGkbbRFSmlrSulASukg8FfU9jUcf59fBE6KiJ6j2o94ruL+GcX64y4iJlELtltTSt8umiu9n4frc6fu5xwCPetL3UXEiRHRe2gZeDfwM2p9OPTp/jJqY3MU7R8tZghcCOws3mreD7w7ImYWb+/eTW2sbTOwKyIuLGYEfHTIc7VbK/o40jba4lDoFD5IbV9Drc6ri5kLi4CzqH0AOOzruzgKfQj4cPH4o/9+h/r8YeD7xfrjqvjb3wQ8kVL62pC7KrufR+pzx+7ndnywMIoPIq6k9uny08Dn2l3Pcdb+RmqfaD8CrD5UP7WxsAeBtcD3gFlFewB/XvT1MaB/yHP9NrCu+PnYkPb+4gX1NPBntOcDsm9Re+v5OrVxwGta0ceRttHGPv9t0adHi3+Q84as/7mi/icZMhNppNd38dpZUfwt/gGYUrRPLW6vK+5/Y4v6ezG1oY5HgVXFz5VV3s91+tyR+9mv/ktSReQw5CJJaoCBLkkVYaBLUkUY6JJUEQa6JFWEgS5JFWGgS1JF/H85cMkmMcaqfgAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(sorted(user_click_merge.groupby('user_id')['category_id'].nunique(), reverse=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从上图中可以看出有一小部分用户阅读类型是极其广泛的,大部分人都处在20个新闻类型以下。" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idcategory_id
count250000.000000250000.000000
mean124999.5000004.573188
std72168.9279864.419800
min0.0000001.000000
25%62499.7500002.000000
50%124999.5000003.000000
75%187499.2500006.000000
max249999.00000095.000000
\n", - "
" + "source": [ + "#点击次数在前50的用户\n", + "plt.plot(user_click_item_count[:50])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "点击次数排前50的用户的点击次数都在100次以上。思路:我们可以定义点击次数大于等于100次的用户为活跃用户,这是一种简单的处理思路, 判断用户活跃度,更加全面的是再结合上点击时间,后面我们会基于点击次数和点击时间两个方面来判断用户活跃度。" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXEAAAD4CAYAAAAaT9YAAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAARV0lEQVR4nO3dfYxc1X3G8eexd7ExEDAYjEPYrkOQFZekKUxT2lKgJQHHSuWGphJIDaRYWaUBKUitKJQqRWlTNYnaSFWippvaMonASZsUGSVtg4tSXKkYYqd+WQqYlwLxSzAvcYgIBYxP/5i7u6Nl786dmTt7z5n7/UjWzt6Z3fmdnfGjM+ece65DCAIApGlB1QUAALpHiANAwghxAEgYIQ4ACSPEASBhQ/P5ZMuWLQujo6Pz+ZQAkLydO3c+H0I4fbb75jXER0dHtWPHjvl8SgBInu2n8+5jOAUAEkaIA0DCCHEASBghDgAJI8QBIGFtQ9z2RtuHbU/Mct8f2g62l/WnPADAXIr0xDdJWjPzoO2zJV0u6ZmSawIAFNR2nXgIYZvt0Vnu+oKkmyRtKbuome59+Fnt/uGRfj/Nm5yy5Dh99FdHtWCB5/25AaCIrk72sb1O0oEQwm577oCzPSZpTJJGRka6eTrdt+85fW177lr3vpjcZv2SVafrnNNPnNfnBoCiOg5x20sk/YmaQylthRDGJY1LUqPR6OoKFJ9ed54+ve68bn60a9/ec1A33PnfeuMYF80AEK9uVqecI2mlpN22n5L0Nkk/sH1mmYXFggsfAYhZxz3xEMJeSWdMfp8FeSOE8HyJdVXOYhwcQPyKLDHcLOl+Sats77e9vv9lVW9yqD+IrjiAeBVZnXJ1m/tHS6smIpP9cIZTAMSMMzZzTPXECXEAESPE22A4BUDMCPFcTGwCiB8hnoPhFAApIMRz0A8HkAJCvA164gBiRojnaLcnDADEgBDPMbVOnNUpACJGiOdgYhNACgjxHNOn3QNAvAhxAEgYIZ5jchfDwHgKgIgR4nkYTgGQAEI8B7sYAkgBIZ5jep04KQ4gXoQ4ACSMEM/BcAqAFBDiOVgnDiAFhHiO6SWGFRcCAHMgxHOw/xWAFBDibXCyD4CYEeI5WGAIIAWEeB52MQSQAEI8x9TEJn1xABEjxHMwsQkgBYR4O3TEAUSMEM/BxCaAFBDiOSY3wGJiE0DMCPEcjIkDSAEhnoOr3QNIASHeBsMpAGLWNsRtb7R92PZEy7E/t73H9i7b99h+a3/LnH/sYgggBUV64pskrZlx7PMhhHeHEN4j6duSPlVyXRFgUBxA/IbaPSCEsM326IxjL7V8e4IGsMM6tKAZ4us3fV8LIp/lPGHRQm25/iKNnLak6lIAzLO2IZ7H9mckXSPpJ5J+Y47HjUkak6SRkZFun27erX7rW3TTmlX66f8drbqUOR088oq27DqoA0deIcSBGuo6xEMIt0q6
1fYtkm6Q9Gc5jxuXNC5JjUYjmR778MIF+sSl76i6jLa2P/mCtuw6yCoaoKbKWJ1yh6TfKeH3oAtTAz1kOFBLXYW47XNbvl0n6ZFyykG3yHCgntoOp9jeLOlSScts71dz2GSt7VWSjkl6WtLH+1kk8rE9AFBvRVanXD3L4Q19qAVdiHzhDIA+44zNxLE9AFBvhHjizGXkgFojxAcEGQ7UEyGevMmJTWIcqCNCPHFMbAL1RognjsvIAfVGiCfO7JkL1BohPiBYYgjUEyGeuKnhFDIcqCVCPHFMbAL1RognzmLvFKDOCPHEMa8J1BshPiA42QeoJ0J8QBDhQD0R4oljYhOoN0I8cUxsAvVGiCfOXGQTqDVCfEDQEwfqiRBPHEsMgXojxBNnMbMJ1BkhnjguzwbUGyGeOC6UDNQbIT4g6IkD9USIJ46JTaDeCPHkMbEJ1BkhnrjpiU364kAdEeKJox8O1BshPiDoiAP1RIgnbvJq9ywxBOqJEE8cwylAvbUNcdsbbR+2PdFy7PO2H7G9x/Zdtk/pa5XIxRmbQL0V6YlvkrRmxrGtks4LIbxb0j5Jt5RcFwpiP3Gg3obaPSCEsM326Ixj97R8u13Sh0uuCx26b99zOvLK61WX0bMVJy/W2netqLoMIBltQ7yA6yR9I+9O22OSxiRpZGSkhKdDq5OXDOvERUO6e/dB3b37YNXllGLvbZfrpMXDVZcBJKGnELd9q6Sjku7Ie0wIYVzSuCQ1Gg0+9Jfs5OOHteNP36dXjx6rupSe3fnAM/rsvz2io2/wNgGK6jrEbX9U0gclXRY4XbBSi4cXavHwwqrL6Nnxw80pGt5MQHFdhbjtNZJuknRJCOFn5ZaEurJZMAl0qsgSw82S7pe0yvZ+2+slfVHSSZK22t5l+8t9rhM1wD4wQOeKrE65epbDG/pQC2pu+gIXAIrijE1Eh444UBwhjniwDwzQMUIc0WBaE+gcIY5omEFxoGOEOKJDhgPFEeKIBpt5AZ0jxBGNqXXi9MWBwghxRIOJTaBzhDiiwQUugM4R4ogOGQ4UR4gjGtMTm8Q4UBQhjngwnAJ0jBBHNJjYBDpHiANAwghxRGPyohAMpwDFEeKIxvTWKaQ4UBQhjmiwThzoHCEOAAkjxBGN6b1TABRFiCManOwDdI4QRzToiQOdI8QRHTriQHGEOAAkjBBHNMxFNoGOEeKIxlSEk+FAYYQ4osHEJtA5QhzRoScOFEeIIxpmM1qgY4Q4osHV7oHOEeKIBhObQOcIcUSDXQyBzrUNcdsbbR+2PdFy7HdtP2T7mO1Gf0tE3TCcAhRXpCe+SdKaGccmJF0paVvZBaHOmNgEOjXU7gEhhG22R2cce1hqPcMO6N2C7O30e//wgIYWDv5I37lnnKg7P3Zh1WUgcW1DvFe2xySNSdLIyEi/nw4J++WVp+m6X1upV15/o+pS+m7vgSP6rydeqLoMDIC+h3gIYVzSuCQ1Gg0GO5Hr5CXD+tRvra66jHnxha37NHHgparLwAAY/M+sQISmV+LQr0FvCHGgQmQ4elVkieFmSfdLWmV7v+31tj9ke7+kX5H0Hdvf7XehwCCZuhRdxXUgfUVWp1ydc9ddJdcC1AYLu1AWhlOACkxvMUBfHL0hxIEKsHc6ykKIAxWiI45eEeJABSbPdmafGPSKEAeAhBHiQAXYdhdlIcSBCnApOpSFEAcqRE8cvSLEgQpwPVGUhRAHKsBgCspCiAMVYGITZSHEgQqwARbKQogDFWLvFPSKEAcqwN4pKAshDgAJI8SBCkztnUJXHD0ixIEKTC0xJMTRI0IcqBAn+6BXhDhQAdaJoyyEOFABzthEWQhxoALTF4UAekOIAxWYHk4hxtEbQhyoEBGOXhHiQAUmx8TpiKNXhDhQBTO1iXIQ4kAFpnriDKigR4Q4UAFPpzjQE0IcqBAZjl4R4kAFpi4KQYqjR4Q4UAHmNVGWtiFue6Ptw7YnWo6danur7ceyr0v7WyYwWJjY
RFmK9MQ3SVoz49jNku4NIZwr6d7sewAFsQEWyjLU7gEhhG22R2ccXifp0uz27ZL+Q9Ifl1kYUAf/sveQli45ruoyorRoeIHev3q5Fg0trLqUqLUN8RzLQwiHsts/krQ874G2xySNSdLIyEiXTwcMlmUnLpIk/cV3Hq64krj9/Ucu0BU/f2bVZUSt2xCfEkIItnM/FIYQxiWNS1Kj0eDDIyDpsncu1/ZbLtNrR49VXUqUnn7xZX1kw4N6lb9PW92G+LO2V4QQDtleIelwmUUBdXDmyYurLiFar73RDG92eWyv2yWGd0u6Nrt9raQt5ZQDACzB7ESRJYabJd0vaZXt/bbXS/orSe+3/Zik92XfA0ApyPDiiqxOuTrnrstKrgUAJLVc+YjRlLY4YxNAtDgZqj1CHEB0uGhGcYQ4gOgwsVkcIQ4gOuzyWBwhDiA6U3vLVFtGEghxANHiZJ/2CHEA0SLC2yPEAUSHic3iCHEA0TGD4oUR4gCiw5WPiiPEAUSLec32CHEA0WE0pThCHEB0zD6GhRHiAKLDhaSLI8QBRIeJzeIIcQDRoifeHiEOID5MbBZGiAOIDhObxRHiAKJjrgpRGCEOIDrTE5tohxAHEC064u0R4gCiM321e1K8HUIcQHSY1iyOEAcQHfZOKY4QBxAdLpRcHCEOIFpkeHuEOID4TG2ARYy3Q4gDiA7X2CyOEAcQHTK8OEIcQHSm14lXXEgCCHEA0WI/8fZ6CnHbn7Q9Yfsh2zeWVBOAmmP/q+K6DnHb50n6mKT3SvoFSR+0/Y6yCgNQX0xsFjfUw8++U9IDIYSfSZLt+yRdKelzZRQGoL4mT/b5yn8+qW/u3F9xNeX4yyvfpV8aPbX039tLiE9I+ozt0yS9ImmtpB0zH2R7TNKYJI2MjPTwdADqYvHwAn38knP0zIsvV11KaY4fXtiX3+teFtPbXi/pE5JelvSQpFdDCDfmPb7RaIQdO96U8wCAOdjeGUJozHZfTxObIYQNIYQLQggXS/qxpH29/D4AQGd6GU6R7TNCCIdtj6g5Hn5hOWUBAIroKcQlfSsbE39d0vUhhCO9lwQAKKqnEA8h/HpZhQAAOscZmwCQMEIcABJGiANAwghxAEhYTyf7dPxk9nOSnu7yx5dJer7EclJAm+uBNtdDL23+uRDC6bPdMa8h3gvbO/LOWBpUtLkeaHM99KvNDKcAQMIIcQBIWEohPl51ARWgzfVAm+uhL21OZkwcAPBmKfXEAQAzEOIAkLAkQtz2GtuP2n7c9s1V19ML20/Z3mt7l+0d2bFTbW+1/Vj2dWl23Lb/Nmv3Htvnt/yea7PHP2b72qraMxvbG20ftj3Rcqy0Ntq+IPsbPp79bOVXZMxp8222D2Sv9S7ba1vuuyWr/1HbV7Qcn/W9bnul7Qey49+wfdz8tW52ts+2/T3b/5NdLP2T2fGBfa3naHN1r3UIIep/khZKekLS2yUdJ2m3pNVV19VDe56StGzGsc9Jujm7fbOkz2a310r6VzUv/n2hmtc0laRTJT2ZfV2a3V5addta2nOxpPMlTfSjjZIezB7r7Gc/EGmbb5P0R7M8dnX2Pl4kaWX2/l4413td0j9Kuiq7/WVJfxBBm1dIOj+7fZKaF4VZPciv9Rxtruy1TqEn/l5Jj4cQngwhvCbp65LWVVxT2dZJuj27fbuk3245/tXQtF3SKbZXSLpC0tYQwoshhB9L2ippzTzXnCuEsE3SizMOl9LG7L63hBC2h+a7/Kstv6syOW3Os07S10MIr4YQ/lfS42q+z2d9r2e9z9+U9M3s51v/fpUJIRwKIfwgu/1TSQ9LOksD/FrP0eY8fX+tUwjxsyT9sOX7/Zr7jxa7IOke2zvdvIi0JC0PIRzKbv9I0vLsdl7bU/yblNXGs7LbM4/H6oZs6GDj5LCCOm/zaZKOhBCOzjgeDdujkn5R0gOqyWs9o81SRa91CiE+aC4KIZwv6QOSrrd9ceudWY9joNd91qGNmb+T
dI6k90g6JOmvK62mT2yfKOlbkm4MIbzUet+gvtaztLmy1zqFED8g6eyW79+WHUtSCOFA9vWwpLvU/Fj1bPbRUdnXw9nD89qe4t+krDYeyG7PPB6dEMKzIYQ3QgjHJH1Fzdda6rzNL6g59DA043jlbA+rGWZ3hBD+OTs80K/1bG2u8rVOIcS/L+ncbMb2OElXSbq74pq6YvsE2ydN3pZ0uaQJNdszOSN/raQt2e27JV2TzepfKOkn2cfU70q63PbS7GPb5dmxmJXSxuy+l2xfmI0fXtPyu6IyGWSZD6n5WkvNNl9le5HtlZLOVXMCb9b3etab/Z6kD2c/3/r3q0z2998g6eEQwt+03DWwr3Vemyt9rauc6S36T81Z7X1qzubeWnU9PbTj7WrOQu+W9NBkW9QcB7tX0mOS/l3SqdlxS/pS1u69khotv+s6NSdJHpf0+1W3bUY7N6v5kfJ1Ncf01pfZRkmN7D/JE5K+qOzM4wjb/LWsTXuy/8wrWh5/a1b/o2pZcZH3Xs/eOw9mf4t/krQogjZfpOZQyR5Ju7J/awf5tZ6jzZW91px2DwAJS2E4BQCQgxAHgIQR4gCQMEIcABJGiANAwghxAEgYIQ4ACft/AbwTsfQSxAYAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " user_id category_id\n", - "count 250000.000000 250000.000000\n", - "mean 124999.500000 4.573188\n", - "std 72168.927986 4.419800\n", - "min 0.000000 1.000000\n", - "25% 62499.750000 2.000000\n", - "50% 124999.500000 3.000000\n", - "75% 187499.250000 6.000000\n", - "max 249999.000000 95.000000" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "user_click_merge.groupby('user_id')['category_id'].nunique().reset_index().describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户查看文章的长度的分布\n", - "\n", - "通过统计不同用户点击新闻的平均字数,这个可以反映用户是对长文更感兴趣还是对短文更感兴趣。" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从上图中可以发现有一小部分人看的文章平均词数非常高,也有一小部分人看的平均文章次数非常低。\n", - "\n", - "大多数人偏好于阅读字数在200-400字之间的新闻。" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 51, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "#挑出大多数人的区间仔细看看\n", - "plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True)[1000:45000])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "可以发现大多数人都是看250字以下的文章" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idwords_count
count250000.000000250000.000000
mean124999.500000205.830189
std72168.92798647.174030
min0.0000008.000000
25%62499.750000187.500000
50%124999.500000202.000000
75%187499.250000217.750000
max249999.0000003434.500000
\n", - "
" + "source": [ + "#点击次数排名在[25000:50000]之间\n", + "plt.plot(user_click_item_count[25000:50000])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以看出点击次数小于等于两次的用户非常的多,这些用户可以认为是非活跃用户" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 新闻点击次数分析" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:42:14.526476Z", + "start_time": "2020-11-13T15:42:14.463642Z" + } + }, + "outputs": [], + "source": [ + "item_click_count = sorted(user_click_merge.groupby('click_article_id')['user_id'].count(), reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T15:42:16.198000Z", + "start_time": "2020-11-13T15:42:16.044455Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " user_id words_count\n", - "count 250000.000000 250000.000000\n", - "mean 124999.500000 205.830189\n", - "std 72168.927986 47.174030\n", - "min 0.000000 8.000000\n", - "25% 62499.750000 187.500000\n", - "50% 124999.500000 202.000000\n", - "75% 187499.250000 217.750000\n", - "max 249999.000000 3434.500000" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#更加详细的参数\n", - "user_click_merge.groupby('user_id')['words_count'].mean().reset_index().describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 用户点击新闻的时间分析" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "#为了更好的可视化,这里把时间进行归一化操作\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "mm = MinMaxScaler()\n", - "user_click_merge['click_timestamp'] = mm.fit_transform(user_click_merge[['click_timestamp']])\n", - "user_click_merge['created_at_ts'] = mm.fit_transform(user_click_merge[['created_at_ts']])\n", - "\n", - "user_click_merge = user_click_merge.sort_values('click_timestamp')" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
182499901623000.00000043201252552810.989186193
22499981609740.00000241121132552810.989092259
302499851609740.0000034117182882810.989092259
502499791623000.00000441171252222810.989186193
252499881609740.0000044117121217172810.989092259
\n", - "
" + "source": [ + "plt.plot(item_click_count)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " user_id click_article_id click_timestamp click_environment \\\n", - "18 249990 162300 0.000000 4 \n", - "2 249998 160974 0.000002 4 \n", - "30 249985 160974 0.000003 4 \n", - "50 249979 162300 0.000004 4 \n", - "25 249988 160974 0.000004 4 \n", - "\n", - " click_deviceGroup click_os click_country click_region \\\n", - "18 3 20 1 25 \n", - "2 1 12 1 13 \n", - "30 1 17 1 8 \n", - "50 1 17 1 25 \n", - "25 1 17 1 21 \n", - "\n", - " click_referrer_type rank click_cnts category_id created_at_ts \\\n", - "18 2 5 5 281 0.989186 \n", - "2 2 5 5 281 0.989092 \n", - "30 2 8 8 281 0.989092 \n", - "50 2 2 2 281 0.989186 \n", - "25 2 17 17 281 0.989092 \n", - "\n", - " words_count \n", - "18 193 \n", - "2 259 \n", - "30 259 \n", - "50 193 \n", - "25 259 " - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "user_click_merge.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "def mean_diff_time_func(df, col):\n", - " df = pd.DataFrame(df, columns={col})\n", - " df['time_shift1'] = df[col].shift(1).fillna(0)\n", - " df['diff_time'] = abs(df[col] - df['time_shift1'])\n", - " return df['diff_time'].mean()" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [], - "source": [ - "# 点击时间差的平均值\n", - "mean_diff_click_time = user_click_merge.groupby('user_id')['click_timestamp', 'created_at_ts'].apply(lambda x: mean_diff_time_func(x, 'click_timestamp'))" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(sorted(mean_diff_click_time.values, reverse=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从上图可以发现不同用户点击文章的时间差是有差异的" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [], - "source": [ - "# 前后点击文章的创建时间差的平均值\n", - "mean_diff_created_time = user_click_merge.groupby('user_id')['click_timestamp', 'created_at_ts'].apply(lambda x: mean_diff_time_func(x, 'created_at_ts'))" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.plot(sorted(mean_diff_created_time.values, reverse=True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "从图中可以发现用户先后点击文章,文章的创建时间也是有差异的" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Defaulting to user installation because normal site-packages is not writeable\n", - "Looking in indexes: https://mirrors.aliyun.com/pypi/simple\n", - "Collecting gensim\n", - " Downloading https://mirrors.aliyun.com/pypi/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)\n", - "\u001b[K |████████████████████████████████| 24.2 MB 91.0 MB/s eta 0:00:01\n", - "\u001b[?25hRequirement already satisfied: six>=1.5.0 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.15.0)\n", - "Requirement already satisfied: numpy>=1.11.3 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.19.1)\n", - "Requirement already satisfied: scipy>=0.18.1 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.5.4)\n", - "Requirement already satisfied: numpy>=1.11.3 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.19.1)\n", - "Collecting smart-open>=1.8.1\n", - " Downloading https://mirrors.aliyun.com/pypi/packages/e3/cf/6311dfb0aff3e295d63930dea72e3029800242cdfe0790478e33eccee2ab/smart_open-4.0.1.tar.gz (117 kB)\n", - "\u001b[K |████████████████████████████████| 117 kB 96.7 MB/s eta 0:00:01\n", - "\u001b[?25hBuilding wheels for collected packages: smart-open\n", - " Building wheel for smart-open (setup.py) ... 
\u001b[?25ldone\n", - "\u001b[?25h Created wheel for smart-open: filename=smart_open-4.0.1-py3-none-any.whl size=108249 sha256=50eb67320a58790e8b173971aeb6af7b636d48259d7c9de759612e58e334215b\n", - " Stored in directory: /home/admin/.cache/pip/wheels/c3/14/fc/a0e523e5d2f13d083ce0af09d4e2861d8e2ec65fc466fb1dff\n", - "Successfully built smart-open\n", - "Installing collected packages: smart-open, gensim\n", - "Successfully installed gensim-3.8.3 smart-open-4.0.1\n" - ] - } - ], - "source": [ - "# 安装gensim\n", - "!pip install gensim" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "from gensim.models import Word2Vec\n", - "import logging, pickle\n", - "\n", - "# 需要注意这里模型只迭代了一次\n", - "def trian_item_word2vec(click_df, embed_size=16, save_name='item_w2v_emb.pkl', split_char=' '):\n", - " click_df = click_df.sort_values('click_timestamp')\n", - " # 只有转换成字符串才可以进行训练\n", - " click_df['click_article_id'] = click_df['click_article_id'].astype(str)\n", - " # 转换成句子的形式\n", - " docs = click_df.groupby(['user_id'])['click_article_id'].apply(lambda x: list(x)).reset_index()\n", - " docs = docs['click_article_id'].values.tolist()\n", - "\n", - " # 为了方便查看训练的进度,这里设定一个log信息\n", - " logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)\n", - "\n", - " # 这里的参数对训练得到的向量影响也很大,默认负采样为5\n", - " w2v = Word2Vec(docs, size=16, sg=1, window=5, seed=2020, workers=24, min_count=1, iter=10)\n", - " \n", - " # 保存成字典的形式\n", - " item_w2v_emb_dict = {k: w2v[k] for k in click_df['click_article_id']}\n", - " \n", - " return item_w2v_emb_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "item_w2v_emb_dict = trian_item_word2vec(user_click_merge)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_type
25667190841199197150704527612941171202
25668190841285298150704530292041171202
25669190841156624150704663888541171202
25670190841129029150704666888541171202
107739164226214800150713140246441171212
\n", - "
" + "source": [ + "plt.plot(item_click_count[:100])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以看出点击次数最多的前100篇新闻,点击次数大于1000次" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAApy0lEQVR4nO3deXxU5dn/8c812YBAWJKwB8Mui7INm4q7bFpR64K1D6go9Vd3q60+9lV92tqntrW2tmqLdcEWAbW1UkERcaEuLAEBWSXsYQmRsAfIdv/+mMPTMSYkZDszk+/79ZpXzlznPnOuczLJNefc95ljzjlERKRhC/idgIiI+E/FQEREVAxERETFQEREUDEQEREg3u8EqistLc1lZmb6nYaISFRZunTpV8659LLxqC0GmZmZZGVl+Z2GiEhUMbOt5cV1mkhERFQMRERExUBERKhCMTCzF8xsj5mtKhO/08zWmdlqM/tVWPwhM8s2s/VmNiosPtqLZZvZg2Hxzma2yIvPNLPE2to4ERGpmqocGbwEjA4PmNkFwDign3OuD/AbL94bGA/08ZZ5xszizCwOeBoYA/QGrvfaAjwOPOmc6wbsAybVdKNEROTUVFoMnHMLgPwy4f8H/NI5d9xrs8eLjwNmOOeOO+c2A9nAEO+R7Zzb5JwrBGYA48zMgAuB173lpwJX1GyTRETkVFW3z6AHMMI7vfORmQ324h2A7WHtcrxYRfFUYL9zrrhMXERE6lF1i0E80AoYBjwAvOp9yq9TZjbZzLLMLCsvL69arzH10y28s2oXJaX66m4RkROqe9FZDvAPF7oZwmIzKwXSgB1ARli7jl6MCuJ7gRZmFu8dHYS3/wbn3BRgCkAwGDzl/+alpY7pi7exbvchMlo15qazOnPt4AyaJkXttXciIrWiukcG/wQuADCzHkAi8BUwCxhvZklm1hnoDiwGlgDdvZFDiYQ6mWd5xeQD4GrvdScCb1Yzp0oFAsbsu0bwp+8OpE2zRvz0rTUM/8V8fjFnLTv2H62r1YqIRDyr7E5nZjYdOJ/QJ/9c4BHgr8ALQH+gELjfOfe+1/5h4GagGLjHOfe2Fx8L/A6IA15wzj3mxbsQ6lBuBXwOfPdEx/TJBINBV9Ovo1i+fT/Pf7yZOV/sAmBM37bcMqIL/TNa1Oh1RUQilZktdc4FvxGP1tte1kYxOGHH/qNM/XQL0xdt49DxYgad1pJbzunMyD5tiQvUeVeIiEi9UTGogsPHi3ktazsvfLKZ7flHyWjVmBvP6sy1wY40a5RQq+sSEfGDisEpKCl1zFuTy/Mfb2LJln00S4pn/JAMJp6VSceWTepknSIi9UHFoJrK9iuM7tuWW9WvICJRSsWghnZ6/QqvLN7GoWPFDO+Syv87vysjuqdRD5dYiIjUChWDWnL4eDEzFm/juX9vIvfgcfq0T+G287oypm9b4uP0JbAiEtlUDGrZ8eIS3vx8J39asJFNeUfo1KoJk8/twtWDOtIoIc63vERETkbFoI6UljreXZPLsx9tZMX2/aQ1TeKmszP57rDTaN5YI5BEJLKoGNQx5xwLN+Xz7EcbWfBlHk
2T4rlhWCcmnd2Z1imN/E5PRARQMahXq3Yc4M8LNjF75U7iAwG+PagDk8/tSue0ZL9TE5EGTsXAB1v3HmHKgk28tjSHopJSxvRty23ndeXMji38Tk1EGigVAx/lHTrOi59s5q8Lt3LoWDHB01oybkAHLj2jHa2SdZdPEak/KgYR4NCxIqYv3sZrWTls2HOY+IAxonsa4/p34JLebUjWV2mLSB1TMYggzjnW7T7Em8t38q8VO9mx/yiNEgJc0rst4/q159we6STG65oFEal9KgYRqrTUsXTbPt5cvoPZK3exr6CI5o0TGHtGO8b1b8+QzFYE9M2pIlJLVAyiQFFJKR9v+Io3l+/g3TW5FBSW0K55I77Vrz3j+rend7sUffWFiNSIikGUKSgs5r21e5i1fAcfrs+juNTRrXVTLu/XnvFDMmjdTNcuiMipUzGIYvuOFPL2qt28uXwHizbn07NNM+bcPUI33hGRU1ZRMVAvZRRomZzId4Z2Yub3hvPU9QNYn3uIf63Y6XdaIhJDKi0GZvaCme0xs1VhsUfNbIeZLfceY8PmPWRm2Wa23sxGhcVHe7FsM3swLN7ZzBZ58ZlmpoH3J3HZGe3o1S6FJ9/7kqKSUr/TEZEYUZUjg5eA0eXEn3TO9fcecwDMrDcwHujjLfOMmcWZWRzwNDAG6A1c77UFeNx7rW7APmBSTTYo1gUCxgOjerB1bwGvZeX4nY6IxIhKi4FzbgGQX8XXGwfMcM4dd85tBrKBId4j2zm3yTlXCMwAxlloaMyFwOve8lOBK05tExqeC3q2ZmCnFjw1fwPHikr8TkdEYkBN+gzuMLOV3mmkll6sA7A9rE2OF6songrsd84Vl4mXy8wmm1mWmWXl5eXVIPXoZmY8MOp0dh88xt8WbvU7HRGJAdUtBs8CXYH+wC7gidpK6GScc1Occ0HnXDA9Pb0+VhmxhndN5ZxuaTzz4UYOHy+ufAERkZOoVjFwzuU650qcc6XAc4ROAwHsADLCmnb0YhXF9wItzCy+TFyq4P5RPck/UsiLH2/2OxURiXLVKgZm1i7s6ZXAiZFGs4DxZpZkZp2B7sBiYAnQ3Rs5lEiok3mWC13k8AFwtbf8RODN6uTUEPXPaMElvdswZcEm9hcU+p2OiESxqgwtnQ58BvQ0sxwzmwT8ysy+MLOVwAXAvQDOudXAq8Aa4B3gdu8Iohi4A5gLrAVe9doC/Ai4z8yyCfUhPF+rWxjjfjCyB4cLi/nzgk1+pyIiUUxXIMeAu2d8ztzVu1nwwwv0NRUiclK6AjmG3XtxD4pKHM98sNHvVEQkSqkYxIDMtGSuDXZk2qKt5Owr8DsdEYlCKgYx4s4Lu2NmPDV/g9+piEgUUjGIEe1bNOa7Q0/j9aU5bMw77Hc6IhJlVAxiyPcv6EqjhDienPel36mISJRRMYghaU2TuPnszry1cherdx7wOx0RiSIqBjHm1nO7kNIonife1dGBiFSdikGMad44ge+d15X31+1h6daqftmsiDR0KgYx6KazM0lrmsiv564nWi8qFJH6pWIQg5okxnP7Bd1YuCmfT7L3+p2OiEQBFYMY9Z2hnWjfvBG/nrtORwciUikVgxiVFB/H3Rd3Z0XOAeatyfU7HRGJcCoGMezbAzvSOS2ZJ979kpJSHR2ISMVUDGJYfFyAey/pwfrcQ7y1cqff6YhIBFMxiHGXndGO09s247fzvqSopNTvdEQkQqkYxLhAwLh/ZE+27i3g9aU5fqcjIhFKxaABuKhXawZ0asFT8zdwrKjE73REJAJV5baXL5jZHjNbVc68H5iZM7M077mZ2VNmlm1mK81sYFjbiWa2wXtMDIsP8m6hme0ta7W1cRJiZjwwqie7Dhxj2qJtfqcjIhGoKkcGLwGjywbNLAMYCYT/dxkDdPcek4FnvbatgEeAocAQ4BEza+kt8yxwa9hy31iX1NxZXdM4u1sqz3yQzeHjxX6nIyIRJr6yBs65BWaWWc6sJ4EfAm+Gxc
YBL7vQVU4LzayFmbUDzgfmOefyAcxsHjDazD4EUpxzC734y8AVwNvV3SCp2P0je3LlM59y5yvL6N6mGfEBCz3iAsTHedOBE9NhsbgACQEjLmAkxAXo26E56c2S/N4cEalFlRaD8pjZOGCHc25FmbM6HYDtYc9zvNjJ4jnlxCta72RCRxx06tSpOqk3aAM6teTGszL5+9IcPtu0l+ISR3E1rj9Ia5rErDvOpn2LxnWQpYj44ZSLgZk1Af6b0CmieuWcmwJMAQgGg7qKqhoevbwPj17e5/+eO+coKQ0VheJSR3FJKUUloVhRSak37z+xvMPHueuVz5k0NYvXbxtOclK1Pk+ISISpzl9yV6AzcOKooCOwzMyGADuAjLC2Hb3YDkKnisLjH3rxjuW0l3piZqHTQXFVX+YP3xnAzS8t4Z6Zy/nzdwcRCKjPXyTanfLQUufcF8651s65TOdcJqFTOwOdc7uBWcAEb1TRMOCAc24XMBcYaWYtvY7jkcBcb95BMxvmjSKawNf7ICQCnd+zNT+5rDfz1uTy+Nx1fqcjIrWg0iMDM5tO6FN9mpnlAI84556voPkcYCyQDRQANwE45/LN7GfAEq/dT090JgPfJzRiqTGhjmN1HkeBiWdlkp13mD9/tIlu6U25JphR+UIiErEsWr/eOBgMuqysLL/TaNCKSkq56cUlLNq8l79NGsrQLql+pyQilTCzpc65YNm4rkCWakuIC/D0DQPJaNWE2/62lK17j/idkohUk4qB1Ejzxgm8MHEwDpg0NYsDR4v8TklEqkHFQGosMy2ZZ28YxJavjnDHK8so1rejikQdFQOpFcO7pvLYlX3594av+Olba/xOR0ROka4Yklpz3eBOZO85zHP/3ky31k2ZMDzT75REpIpUDKRWPTimF5vyjvA//1pDZmoy5/ZI9zslEakCnSaSWhUXMH5//QC6t27K7dOWkb3nkN8piUgVqBhIrWuaFM9fJgZJSghw80tZ5B8p9DslEamEioHUiY4tmzBlQpDdB49x29+WUlisEUYikUzFQOrMwE4t+fXVZ7J4cz4Pv/EF0Xq1u0hDoA5kqVPj+ndg457DPPV+Nt1aN+V753X1OyURKYeKgdS5ey7uwca8I/zynXV0SW/KJb3b+J2SiJSh00RS5wIB4zfX9OOMDs25e8bnrNpxwO+URKQMFQOpF40T4/jLhCApjRIY9/Qn3DI1i/fW5OqrK0QihE4TSb1pndKIN24/i5c/28prWTm8tzaX1s2SuHpQR64NZpCZlux3iiINlu5nIL4oKinlg3V7mLlkOx+s30Opg+FdUrlucAaj+7alUcIp3IdTRKqsovsZqBiI73YfOMbfl+Uwc8l2tuUXkNIonisHdOC6wZ3o3T7F7/REYoqKgUS80lLHwk17mbFkO++s3k1hcSlndGjOdYMzuLx/e1IaJfidokjUq/adzszsBTPbY2arwmI/M7OVZrbczN41s/Ze3MzsKTPL9uYPDFtmoplt8B4Tw+KDzOwLb5mnzMxqvrkSjQIB46xuaTx1/QAW//dFPPqt3hSVlPLjf65iyGPv8YNXV7B4c74uXhOpA5UeGZjZucBh4GXnXF8vluKcO+hN3wX0ds7dZmZjgTuBscBQ4PfOuaFm1grIAoKAA5YCg5xz+8xsMXAXsAiYAzzlnHu7ssR1ZNAwOOdYmXOAmVnbmbV8J4ePF3PR6a35y8Qg+twgcuqqfWTgnFsA5JeJHQx7mkzoHzzAOEJFwznnFgItzKwdMAqY55zLd87tA+YBo715Kc65hS5UlV4Grjj1zZNYZWb0y2jBL648g8UPX8QdF3Rj/ro9zF292+/URGJKta8zMLPHzGw7cAPwEy/cAdge1izHi50snlNOvKJ1TjazLDPLysvLq27qEqWaJMZzz8Xd6dGmKf/79jqOF5f4nZJIzKh2MXDOPeycywCmAXfUXkonXecU51zQORdMT9dNUxqi+LgAD1/am617C3j5061+pyMSM2
rjCuRpwLe96R1ARti8jl7sZPGO5cRFKnRej3TO65HOU+9v0L0SRGpJtYqBmXUPezoOWOdNzwImeKOKhgEHnHO7gLnASDNraWYtgZHAXG/eQTMb5o0imgC8Wd2NkYbjx5f2oqCwhN+996XfqYjEhEq/jsLMpgPnA2lmlgM8Aow1s55AKbAVuM1rPofQSKJsoAC4CcA5l29mPwOWeO1+6pw70Sn9feAloDHwtvcQOanubZpx/ZAMpi3axoThp9GtdTO/UxKJarroTKLW3sPHOf/XHxLMbMmLNw3xOx2RqFDtoaUikSq1aRJ3XNiND9bnseBLjS4TqQkVA4lqN56dSUarxjw2ey0lpdF5lCsSCVQMJKolxcfx0JherM89xMwl2ytfQETKpWIgUW9M37YMzmzJb+et59CxIr/TEYlKKgYS9cyMH1/am68OF/LMhxv9TkckKqkYSEzol9GCqwZ04PmPN7M9v8DvdESijoqBxIwHRvckYPD4O+sqbywiX6NiIDGjXfPGTD63K2+t3MXSrfv8TkckqqgYSEz53rldaN0siZ+9tYZSDTUVqTIVA4kpyUnxPDCqJ8u37+dfK3f6nY5I1FAxkJjz7YEd6dM+hcffXsexIt3zQKQqVAwk5gQCoaGmOw8c4/mPN/udjkhUUDGQmDS8ayoje7fhmQ+y2XPomN/piEQ8FQOJWQ+N7UVhSSm/fVf3PBCpjIqBxKzOaclMGJ7JzKztrNl50O90RCKaioHEtLsu7E7zxgn8fPYaovXeHSL1QcVAYlrzJgncc1F3Pt24l/lr9/idjkjEqrQYmNkLZrbHzFaFxX5tZuvMbKWZvWFmLcLmPWRm2Wa23sxGhcVHe7FsM3swLN7ZzBZ58ZlmlliL2yfCDcNOo0t6Mr+Ys5aiklK/0xGJSFU5MngJGF0mNg/o65w7E/gSeAjAzHoD44E+3jLPmFmcmcUBTwNjgN7A9V5bgMeBJ51z3YB9wKQabZFIGQlxAR4e24tNXx3hbwu3+p2OSESqtBg45xYA+WVi7zrnir2nC4GO3vQ4YIZz7rhzbjOQDQzxHtnOuU3OuUJgBjDOzAy4EHjdW34qcEXNNknkmy48vTXndEvjd+9tYH9Bod/piESc2ugzuBl425vuAITfbirHi1UUTwX2hxWWE/FymdlkM8sys6y8PN3zVqrOzHj40l4cOlbE4++sV2eySBk1KgZm9jBQDEyrnXROzjk3xTkXdM4F09PT62OVEkN6tUvhprM7M33xNm59OUtHCCJhql0MzOxG4DLgBvefj1k7gIywZh29WEXxvUALM4svExepEz++tBePfqs3H32Zx6VPfczn2/RV1yJQzWJgZqOBHwKXO+fCbys1CxhvZklm1hnoDiwGlgDdvZFDiYQ6mWd5ReQD4Gpv+YnAm9XbFJHKmRk3nt2Z1287CzO49s+f8fzHm3XaSBq8qgwtnQ58BvQ0sxwzmwT8EWgGzDOz5Wb2JwDn3GrgVWAN8A5wu3OuxOsTuAOYC6wFXvXaAvwIuM/Msgn1ITxfq1soUo5+GS2YfecIzu/Zmp+9tYbb/raUA0eL/E5LxDcWrZ+IgsGgy8rK8jsNiXLOOZ7/eDO/fHsd7Vo04pnvDOKMjs39TkukzpjZUudcsGxcVyBLg2Zm3DKiCzO/N5ySEse3n/2Ulz/botNG0uCoGIgAg05ryey7RnB2t1R+8uZq7njlcw4d02kjaThUDEQ8LZMTeX7iYB4cczrvrN7Nt/7wMat3HvA7LZF6oWIgEiYQMG47ryszJg/jaFEJVz7zKdMWbdVpI4l5KgYi5Ric2Yo5d41gaOdWPPzGKu6ZuZwjx4srX1AkSqkYiFQgtWkSU28awv0je/CvFTv51h8/Zt1u3SRHYpOKgchJBALGHRd2Z9otwzh0rJgrnv6Ef36ui+Ql9qgYiFTB8K6pzLlrBGd2bMH9r61gZc5+v1MSqVUqBiJVlN4sief+K0h6syTumbmco4UlfqckUmtUDE
ROQfMmCfzmmn5syjvCL+as9TsdkVqjYiByis7ulsYt53Tmrwu38sE63VdZYoOKgUg13D+qJ6e3bcYDr69k7+HjfqcjUmMqBiLV0Cghjiev68/Bo0U89I8vdFGaRD0VA5Fq6tUuhQdG9eTdNbm8mrW98gVEIpiKgUgNTDqnM8O7pPI//1rD1r1H/E5HpNpUDERqIBAwnri2H/EB456ZyykuKfU7JZFqUTEQqaH2LRrz8yvP4PNt+3nmw41+pyNSLSoGIrXg8n7tGde/Pb+fv4Hl2/f7nY7IKavKPZBfMLM9ZrYqLHaNma02s1IzC5Zp/5CZZZvZejMbFRYf7cWyzezBsHhnM1vkxWeaWWJtbZxIffrpuL60aZbEvTOXU1CobziV6FKVI4OXgNFlYquAq4AF4UEz6w2MB/p4yzxjZnFmFgc8DYwBegPXe20BHgeedM51A/YBk6q3KSL+at44gSeu7c+WvUf4+WxdnSzRpdJi4JxbAOSXia11zq0vp/k4YIZz7rhzbjOQDQzxHtnOuU3OuUJgBjDOzAy4EHjdW34qcEV1N0bEb8O7pjJ5RBdeWbSN+Wtz/U5HpMpqu8+gAxA+4DrHi1UUTwX2O+eKy8TLZWaTzSzLzLLy8vJqNXGR2nLfyB70apfCj/6+kq90dbJEiajqQHbOTXHOBZ1zwfT0dL/TESlXUnwcv7uuPwePFfPg31fq6mSJCrVdDHYAGWHPO3qxiuJ7gRZmFl8mLhLVerZtxo9Gn857a/cwfbGuTpbIV9vFYBYw3sySzKwz0B1YDCwBunsjhxIJdTLPcqGPTB8AV3vLTwTerOWcRHxx01mZnNMtjZ+9tYbNX+nqZIlsVRlaOh34DOhpZjlmNsnMrjSzHGA4MNvM5gI451YDrwJrgHeA251zJV6fwB3AXGAt8KrXFuBHwH1mlk2oD+H52t1EEX8EAsZvrulHYnyAe2Yup0hXJ0sEs2g9nxkMBl1WVpbfaYhUavbKXdz+yjLuvqg7917Sw+90pIEzs6XOuWDZeFR1IItEo0vPbMdVAzrwxw+yWbZtn9/piJRLxUCkHjw6rg9tUxpx78zlHDmuq5Ml8qgYiNSDlEYJPHldf7blF/DorNUabioRR8VApJ4M6dyK28/vxmtLc3j+481+pyPyNfGVNxGR2nLfJT3YmHeYx+asJaNVE0b1aet3SiKAjgxE6lUgYPz22v6c2bEF98xYzsqc/X6nJAKoGIjUu8aJcfxlQpBWyYlMmprFjv1H/U5JRMVAxA/pzZJ46abBHCsq4eYXl3DoWJHfKUkDp2Ig4pPubZrx7A2D2Jh3mO9PW6YrlMVXKgYiPjqnexqPXdmXf2/4ikc05FR8pNFEIj67bnAntuwt4NkPN5KZ2oTJ53b1OyVpgFQMRCLAAyN7sm1vAb+Ys46Mlk0Yc0Y7v1OSBkaniUQiQCBgPHFtPwZ0asE9M5fzub7DSOqZioFIhGiUEMdzE4K0Tkni1pez2J5f4HdK0oCoGIhEkLSmSbx442AKi0u5+aUlHDiqIadSP1QMRCJMt9bN+NN/DWLzV0f4/rSlGnIq9ULFQCQCndU1jf+96gw+yd7Lj99YpSGnUueqctvLF8xsj5mtCou1MrN5ZrbB+9nSi5uZPWVm2Wa20swGhi0z0Wu/wcwmhsUHmdkX3jJPmZnV9kaKRKNrghnceWE3ZmZt59mPNvqdjsS4qhwZvASMLhN7EJjvnOsOzPeeA4wBunuPycCzECoewCPAUGAI8MiJAuK1uTVsubLrEmmw7rukB5f3a8+v3lnPWyt3+p2OxLBKi4FzbgGQXyY8DpjqTU8FrgiLv+xCFgItzKwdMAqY55zLd87tA+YBo715Kc65hS50HPxy2GuJNHhmxq+uPpPgaS2579UVLN2qIadSN6rbZ9DGObfLm94NtPGmOwDbw9rleLGTxXPKiZfLzCabWZaZZeXl5VUzdZHo0ighjikTgrRr3ohbX85i214NOZXaV+MOZO8Tfb
30bjnnpjjngs65YHp6en2sUiQitEpO5MUbB1NS6rjq2U/5w/wN7D183O+0JIZUtxjkeqd48H7u8eI7gIywdh292MniHcuJi0gZXdKbMu2WofRq14wn5n3J8F++z/2vrWDVjgN+pyYxoLrFYBZwYkTQRODNsPgEb1TRMOCAdzppLjDSzFp6HccjgbnevINmNswbRTQh7LVEpIy+HZrz10lDmXfvuVwb7Mjslbu47A8fc+2fPmPOF7so1jUJUk1W2fhlM5sOnA+kAbmERgX9E3gV6ARsBa51zuV7/9D/SGhEUAFwk3Muy3udm4H/9l72Mefci148SGjEUmPgbeBOV4VB1cFg0GVlZZ3CporEngMFRbyatZ2pn20hZ99R2jdvxH8Nz2T84AxaJif6nZ5EIDNb6pwLfiMerRezqBiI/EdJqWP+2lxe/GQLn23aS1J8gCsHdODGszM5vW2K3+lJBFExEGkg1u0+yNRPt/CPZTs4XlzK8C6p3Hh2Jhf3akNcQNd0NnQqBiINzL4jhcxYsp2/fraFnQeO0bFlYyYOz+Syfu1o3jiBxglx6IL/hkfFQKSBKi4pZd6a0CmkxVv+c/2oGTRJiKNJUjzJiXE0SYynSeLXnycnhcUT40hOiic5KZ6URvGkNE4gpVECKY3jad44gaT4OB+3UqqqomKgO52JxLj4uABjzmjHmDPasXrnAZZu3UdBYQkFx4s5UlhCQWExBYUlHDkemj5wtIhd+4+GYoXFFBwvobAKo5SS4gNegQgvFGWfx5OanMhpqcmcltqEJon6FxQp9JsQaUD6tG9On/bNT3m5opLSUAEpLObwsWIOHivi4NETP4s4eKzY+/mf+P6CQrblF/xfvKjkm2chWjdLItMrDJlp3s/UZDqlNiGlUUJtbLJUkYqBiFQqIS5A88YBmjdOgFOvJTjnOFZUysFjRew5eJyt+UfYureArXuPsGVvAQs25PHa0pyvLRM6gmjiFYtQoTgttQldWzdVoagDKgYiUufMjMaJcTROjKNNSiPO6PjNilJQWMy2/AK2fPWfIrF17xEWbc7njeU7ONG9GRcwgqe15JLebbi4Vxsy05LreWtikzqQRSTiHSsqIWdfqFB8vn0f89fuYd3uQwB0a92Ui3q15pJebRjQqaWGz1ZCo4lEJKZszy/gvbW5zF+7h4Wb9lJc6miVnMiFp7fm4l5tGNE9jeQknfwoS8VARGLWwWNFfLQ+j/fW5vLBuj0cPFZMYnyAs7qmcnGvNlzUqzXtmjf2O82IoGIgIg1CUUkpWVv28d7aXN5bm8tW7/4PfTukcHGvUD9Dn/YpDfaCOxUDEWlwnHNszDvMvDV7eG9tLsu27cM56NCiMZf0bsMlvdswpHMrEuJqfGuXqKFiICIN3leHj/P+2j28uyaXf2/I43hxKSmN4rnw9NZc0rst5/VMp2mM9zOoGIiIhCkoLObfG75i3ppc5q/NZV9BEYlxAc7qlsrI3m25uFdrWqc08jvNWqdiICJSgeKSUpZu3ce7a3KZtyaXbfmhfoYBnVpwSe82jOzdlm6tm/qcZe1QMRARqQLnHOtzDzFvdS7vrsnlC++2ol3SkrmkdxtG921L/4wWUdsBrWIgIlINO/cf5b21oSOGzzaGrmfo0aYp4wd34qqBHWjRJLruKKdiICJSQweOFvH2F7uYvngbK3IOkBgfYGzftowf0omhnVtFxdFCnRQDM7sbuBUw4Dnn3O/MrBUwE8gEthC6P/I+7/7IvwfGEro/8o3OuWXe60wEfuy97M+dc1MrW7eKgYj4afXOA8xYvJ1/fr6DQ8eL6ZKezPjBGXx7YEdSmyb5nV6Far0YmFlfYAYwBCgE3gFuAyYD+c65X5rZg0BL59yPzGwscCehYjAU+L1zbqhXPLKAIOCApcAg59y+k61fxUBEIkFBYTGzV+5ixpLtLN26j4Q4Y2SftnxnSCeGd0klEGHflVQXN7fpBSxyzhV4K/gIuAoYB5zvtZkKfA
j8yIu/7ELVZ6GZtTCzdl7bec65fO915gGjgek1yE1EpF40SYznmmAG1wQz+DL3ENMXb+Mfy3Ywe+UuOrVqwvghGVw9qCOtm0X2MNWaXHa3ChhhZqlm1oTQJ/4MoI1zbpfXZjfQxpvuAGwPWz7Hi1UU/wYzm2xmWWaWlZeXV4PURURqX482zXjkW31Y9N8X8fvx/WnXvBG/emc9Z/3v+3zvr1l8uH4PJaWR2U9b7SMD59xaM3sceBc4AiwHSsq0cWZWa1vunJsCTIHQaaLael0RkdrUKCGOcf07MK5/BzblHWbmku28tjSHuatz6ZyWzO+u60+/jBZ+p/k1NfpCDufc8865Qc65c4F9wJdArnf6B+/nHq/5DkJHDid09GIVxUVEol6X9KY8NLYXCx+6iD9+ZwCFxaVc/adPeemTzUTSaM4aFQMza+397ESov+AVYBYw0WsyEXjTm54FTLCQYcAB73TSXGCkmbU0s5bASC8mIhIzEuMDXHZme2bfdQ7ndk/n0X+t4fZXlnHwWJHfqQE1v+3l380sFSgCbnfO7TezXwKvmtkkYCtwrdd2DqF+hWxCQ0tvAnDO5ZvZz4AlXrufnuhMFhGJNS2aJPLchCDP/XsTv5q7ntU7P+bp7wykb4dq3Fy6FumiMxERn2RtyeeOVz4nv6CQn1zWmxuGdqrzC9cqGlracL7EW0QkwgQzWzH7rnMY3iWVH/9zFXfNWM7h48W+5KJiICLio9SmSbx442AeGNWT2St3cvkfPmbtroP1noeKgYiIzwIB4/YLuvHKrcM4fLyYK57+hJlLttXraCMVAxGRCDGsSyqz7xrB4MxW/OjvX/CDV1dQUFg/p41UDEREIkh6sySm3jyEey/uwRvLd3D5Hz/hy9xDdb5eFQMRkQgTFzDuvrg7f5s0lP0FhYz74ye8vjSnTtepYiAiEqHO7pbGnLtG0C+jOfe/toIfvr6Co4UllS9YDSoGIiIRrHVKI6bdMoy7LuzGa0tzuOLpT8g9eKzW11PTK5BFRKSOxQWM+0b2JJjZimmLttIqufZvtaliICISJc7tkc65PdLr5LV1mkhERFQMRERExUBERFAxEBERVAxERAQVAxERQcVARERQMRAREaL4tpdmlkfoHsvVkQZ8VYvp1DblVzPKr2aUX81Een6nOee+ceVa1BaDmjCzrPLuARoplF/NKL+aUX41E+n5VUSniURERMVAREQabjGY4ncClVB+NaP8akb51Uyk51euBtlnICIiX9dQjwxERCSMioGIiMR2MTCz0Wa23syyzezBcuYnmdlMb/4iM8usx9wyzOwDM1tjZqvN7O5y2pxvZgfMbLn3+El95eetf4uZfeGtO6uc+WZmT3n7b6WZDazH3HqG7ZflZnbQzO4p06Ze95+ZvWBme8xsVVislZnNM7MN3s+WFSw70Wuzwcwm1mN+vzazdd7v7w0za1HBsid9L9Rhfo+a2Y6w3+HYCpY96d96HeY3Myy3LWa2vIJl63z/1ZhzLiYfQBywEegCJAIrgN5l2nwf+JM3PR6YWY/5tQMGetPNgC/Lye984C0f9+EWIO0k88cCbwMGDAMW+fi73k3oYhrf9h9wLjAQWBUW+xXwoDf9IPB4Ocu1AjZ5P1t60y3rKb+RQLw3/Xh5+VXlvVCH+T0K3F+F3/9J/9brKr8y858AfuLX/qvpI5aPDIYA2c65Tc65QmAGMK5Mm3HAVG/6deAiM7P6SM45t8s5t8ybPgSsBTrUx7pr0TjgZReyEGhhZu18yOMiYKNzrrpXpNcK59wCIL9MOPw9NhW4opxFRwHznHP5zrl9wDxgdH3k55x71zlX7D1dCHSs7fVWVQX7ryqq8rdeYyfLz/u/cS0wvbbXW19iuRh0ALaHPc/hm/9s/6+N9wdxAEitl+zCeKenBgCLypk93MxWmNnbZtanfjPDAe+a2VIzm1zO/Krs4/ownor/CP3cfwBtnHO7vOndQJty2kTKfryZ0JFeeSp7L9
SlO7zTWC9UcJotEvbfCCDXObehgvl+7r8qieViEBXMrCnwd+Ae59zBMrOXETr10Q/4A/DPek7vHOfcQGAMcLuZnVvP66+UmSUClwOvlTPb7/33NS50viAix3Kb2cNAMTCtgiZ+vReeBboC/YFdhE7FRKLrOflRQcT/LcVyMdgBZIQ97+jFym1jZvFAc2BvvWQXWmcCoUIwzTn3j7LznXMHnXOHvek5QIKZpdVXfs65Hd7PPcAbhA7Hw1VlH9e1McAy51xu2Rl+7z9P7olTZ97PPeW08XU/mtmNwGXADV7B+oYqvBfqhHMu1zlX4pwrBZ6rYL1+77944CpgZkVt/Np/pyKWi8ESoLuZdfY+PY4HZpVpMws4MXLjauD9iv4Yapt3jvF5YK1z7rcVtGl7og/DzIYQ+n3VS7Eys2Qza3ZimlBH46oyzWYBE7xRRcOAA2GnROpLhZ/I/Nx/YcLfYxOBN8tpMxcYaWYtvdMgI71YnTOz0cAPgcudcwUVtKnKe6Gu8gvvg7qygvVW5W+9Ll0MrHPO5ZQ308/9d0r87sGuyweh0S5fEhpp8LAX+ymhNz5AI0KnF7KBxUCXesztHEKnDFYCy73HWOA24DavzR3AakKjIxYCZ9Vjfl289a7wcjix/8LzM+Bpb/9+AQTr+febTOife/OwmG/7j1BR2gUUETpvPYlQH9R8YAPwHtDKaxsE/hK27M3e+zAbuKke88smdL79xHvwxOi69sCck70X6im/v3rvrZWE/sG3K5uf9/wbf+v1kZ8Xf+nEey6sbb3vv5o+9HUUIiIS06eJRESkilQMRERExUBERFQMREQEFQMREUHFQEREUDEQERHg/wMY38td1sX9QQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(item_click_count[:20])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "点击次数最多的前20篇新闻,点击次数大于2500。思路:可以定义这些新闻为热门新闻, 这个也是简单的处理方式,后面我们也是根据点击次数和时间进行文章热度的一个划分。" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(item_click_count[3500:])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以发现很多新闻只被点击过一两次。思路:可以定义这些新闻是冷门新闻" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 新闻共现频次:两篇新闻连续出现的次数" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count
count433597.000000
mean3.184139
std18.851753
min1.000000
25%1.000000
50%1.000000
75%2.000000
max2202.000000
\n", + "
" + ], + "text/plain": [ + " count\n", + "count 433597.000000\n", + "mean 3.184139\n", + "std 18.851753\n", + "min 1.000000\n", + "25% 1.000000\n", + "50% 1.000000\n", + "75% 2.000000\n", + "max 2202.000000" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = user_click_merge.sort_values('click_timestamp')\n", + "tmp['next_item'] = tmp.groupby(['user_id'])['click_article_id'].transform(lambda x:x.shift(-1))\n", + "union_item = tmp.groupby(['click_article_id','next_item'])['click_timestamp'].agg({'count'}).reset_index().sort_values('count', ascending=False)\n", + "union_item[['count']].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "由统计数据可以看出,平均共现次数3.18,最高为2202。\n", + "\n", + "说明用户看的新闻,相关性是比较强的。" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#画个图直观地看一看\n", + "x = union_item['click_article_id']\n", + "y = union_item['count']\n", + "plt.scatter(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAD4CAYAAADvsV2wAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAATdElEQVR4nO3df6xkZX3H8fe37Aq2EPmxN7pd9nKhmhgxuOB1hUANISHlV+CPYrqkRUTNNoopVlsrmiCamIhNlSpG3ApF1Cr4syuFWFqwahuW7OKy/BK9KgYQ3AVkkarU1W//mLMwd5hzZ+7MmTt3znm/ksmeOeeZOd89s/dzn32ec85EZiJJqr/fG3cBkqSlYeBLUkMY+JLUEAa+JDWEgS9JDbFiXDtetWpVzszMjGv3kjSRtm3b9mhmTg3y2rEF/szMDFu3bh3X7iVpIkXETwZ9rUM6ktQQBr4kNYSBL0kNYeBLUkMY+JLUEH0HfkTsExHfjYjru2zbNyKujYi5iNgSETOVVilJGtpievgXAveWbHsj8PPMfDHwEeDSYQuTJFWrr/PwI+JQ4HTgA8DbuzQ5C7ikWP4ScHlERI7g3sv3PfIL/m3HT0u3n/CSKdYffnDVu5WkidfvhVeXAe8EDijZvgZ4ACAz90TEbuAQ4NH2RhGxEdgIMD09PUC5MLfzKT52y1zXbZlw648f57q/PG6g95akOusZ+BFxBrAzM7dFxInD7CwzNwGbAGZnZwfq/Z9+1GpOP+r0rtv+/FO38vRvfjd4gZJUY/2M4R8PnBkR9wNfAE6KiM92tHkIWAsQESuAFwCPVVhn3/z+LknqrmfgZ+ZFmXloZs4AG4CbM/MvOpptBs4rls8u2pi9krSMDHzztIh4P7A1MzcDVwKfiYg54HFavxiWXBD4e0aSultU4GfmN4FvFssXt63/NfDaKguTJFWrVlfaRoy7AklavmoV+OCkrSSVqV3gS5K6q13gO2crSd3VLvAlSd3VKvAjwjF8SSpRq8CXJJWrVeB7VqYklatV4APO2kpSiVoFvhdeSVK5WgU+eOGVJJWpXeBLkrqrVeAHDuFLUplaBb4kqVytAj+ctZWkUrUKfIB02laSuqpV4Nu/l6RytQp8cNJWksrULvAlSd3VKvAj7OFLUplaBb4kqVzNAt9pW0kqU7PA9146klSmVoHvdVeSVK5n4EfEfhFxW0TcERF3R8T7urR5fUTsiojtxeNNoym3t3TWVpK6WtFHm6eBkzLzqYhYCXwnIm7MzFs72l2bmW+tvkRJUhV6Bn62usxPFU9XFo9l2Y12REeSyvU1hh8R+0TEdmAncFNmbunS7E8jYkdEfCki1pa8z8aI2BoRW3ft2jV41ZKkResr8DPzt5m5DjgUWB8RL+9o8nVgJjOPAm4CPl3yPpsyczYzZ6empoYouzsnbSWp3KLO0snMJ4BbgFM61j+WmU8XTz8FvLKS6gbgnK0kddfPWTpTEXFgsfx84GTgex1tVrc9PRO4t8Ia+xaO
4ktSqX7O0lkNfDoi9qH1C+K6zLw+It4PbM3MzcBfRcSZwB7gceD1oyq4F++HL0nd9XOWzg7g6C7rL25bvgi4qNrSJElVqt2Vto7hS1J3tQp8SVK5WgW+p2VKUrlaBT4s00uAJWkZqFXge1qmJJWrVeCDd8uUpDK1C3xJUnf1CvxwDF+SytQr8CVJpWoV+E7ZSlK5WgU+4JiOJJWoVeCHV15JUqlaBT7YwZekMrULfElSd7UK/MALrySpTK0CX5JUrlaB75ytJJWrVeCDk7aSVKZWgW8HX5LK1Srwwa84lKQytQt8SVJ3tQr8iCAdxZekrmoV+JKkcrUKfCdtJalcz8CPiP0i4raIuCMi7o6I93Vps29EXBsRcxGxJSJmRlJtH5y0laTu+unhPw2clJmvANYBp0TEsR1t3gj8PDNfDHwEuLTSKvtlF1+SSq3o1SBbN6d5qni6snh09qPPAi4plr8EXB4RkWO4sc3j//t//O0X7xj49RvWr+WVhx1cYUWStDz0DHyAiNgH2Aa8GPh4Zm7paLIGeAAgM/dExG7gEODRjvfZCGwEmJ6eHq7yLtbPHMytP3yM/557tHfjLh558tckGPiSaqmvwM/M3wLrIuJA4KsR8fLMvGuxO8vMTcAmgNnZ2cp7/xvWT7Nh/eC/SI7/4M0VViNJy8uiztLJzCeAW4BTOjY9BKwFiIgVwAuAxyqob8k56Suprvo5S2eq6NkTEc8HTga+19FsM3BesXw2cPM4xu8lSeX6GdJZDXy6GMf/PeC6zLw+It4PbM3MzcCVwGciYg54HNgwsopHzCt1JdVVP2fp7ACO7rL+4rblXwOvrbY0SVKVanWl7bD8AhVJdWbgd3JER1JNGfht7OFLqjMDv4MdfEl1ZeBLUkMY+G2CwMsHJNWVgS9JDWHgt3HSVlKdGfgdHNCRVFcGfhs7+JLqzMDv4JytpLoy8CWpIQz8NhHhGL6k2jLwJakhDPw2TtpKqjMDv4NX2kqqKwO/nV18STVm4Hewfy+prgx8SWoIA79NgF18SbVl4EtSQxj4bcLbZUqqMQO/QzqmI6mmDPw29u8l1VnPwI+ItRFxS0TcExF3R8SFXdqcGBG7I2J78bh4NOWOntddSaqrFX202QO8IzNvj4gDgG0RcVNm3tPR7tuZeUb1JUqSqtCzh5+ZD2fm7cXyL4B7gTWjLmwcIuzhS6qvRY3hR8QMcDSwpcvm4yLijoi4MSKOLHn9xojYGhFbd+3atfhqJUkD6zvwI2J/4MvA2zLzyY7NtwOHZeYrgI8BX+v2Hpm5KTNnM3N2ampqwJJHJ5y2lVRjfQV+RKykFfafy8yvdG7PzCcz86li+QZgZUSsqrTSJeJpmZLqqp+zdAK4Erg3Mz9c0uZFRTsiYn3xvo9VWehS8LorSXXWz1k6xwPnAndGxPZi3buBaYDMvAI4G3hzROwBfgVsyAm9sfxkVi1JvfUM/Mz8Dj2uScrMy4HLqypKklQ9r7TtYAdfUl0Z+JLUEAZ+G++WKanODPwOTtpKqisDv439e0l1ZuA/h118SfVk4EtSQxj4bbxbpqQ6M/AlqSEM/DYRjuBLqi8DX5IawsBv4/3wJdWZgd9hQm/yKUk9GfiS1BAGfhsnbSXVmYEvSQ1h4LcJvPBKUn0Z+JLUEAZ+O++HL6nGDPwOjuhIqisDX5IawsBv05q0tY8vqZ4MfElqCAO/jXO2kuqsZ+BHxNqIuCUi7omIuyPiwi5tIiI+GhFzEbEjIo4ZTbmSpEGt6KPNHuAdmXl7RBwAbIuImzLznrY2pwIvKR6vBj5R/DlR7OBLqrOegZ+ZDwMPF8u/iIh7gTVAe+CfBVyTrRnPWyPiwIhYXbx2otz50G7OvXLLuMt4jvOPn+Gkl75w3GVImmD99PCfEREzwNFAZyKuAR5oe/5gsW5e4EfERmAjwPT09CJLHb0zjvpDvr7jpzz19J5xlzLP3Q89ydQB+xr4kobSd+BHxP7Al4G3ZeaTg+ws
MzcBmwBmZ2eX3fmPbzjhcN5wwuHjLuM5Trj05nGXIKkG+jpLJyJW0gr7z2XmV7o0eQhY2/b80GKdqrLsfj1KmjT9nKUTwJXAvZn54ZJmm4HXFWfrHAvsnsTxe0mqs36GdI4HzgXujIjtxbp3A9MAmXkFcANwGjAH/BI4v/JKG8wvZpFUhX7O0vkOPc5YLM7OuaCqoiRJ1fNK2wkQXiEgqQIG/oTwpm6ShmXgTwDv8SOpCgb+hLB/L2lYBr4kNYSBPwFaX8wy7iokTToDX5IawsCfABHhGL6koRn4ktQQBv4E8KxMSVUw8CeEF15JGpaBL0kNYeBPAu+WKakCBr4kNYSBPwEC7OJLGpqBL0kNYeBPgPB2mZIqYOBPiHRMR9KQDHxJaggDfwJ4t0xJVTDwJakhDPwJEGEPX9LwDHxJaggDfwKE98uUVIGegR8RV0XEzoi4q2T7iRGxOyK2F4+Lqy9TnpYpaVgr+mhzNXA5cM0Cbb6dmWdUUpEkaSR69vAz81vA40tQi0o4aSupClWN4R8XEXdExI0RcWRZo4jYGBFbI2Lrrl27Ktq1JKkfVQT+7cBhmfkK4GPA18oaZuamzJzNzNmpqakKdt0cdvAlDWvowM/MJzPzqWL5BmBlRKwaujJJUqWGDvyIeFEUt3OMiPXFez427PvqWd4tU1IVep6lExGfB04EVkXEg8B7gZUAmXkFcDbw5ojYA/wK2JB+43blPKKShtUz8DPznB7bL6d12qYkaRnzStsJ0BrQsYsvaTgGviQ1hIE/AbzwSlIVDHxJaggDfwJ4VqakKhj4E8IRHUnDMvAngPfDl1QFA39CeC2bpGEZ+JLUEAb+BIhwDF/S8Ax8SWoIA38COGUrqQoG/oRwzlbSsAz8SeCVV5IqYOBPCDv4koZl4EtSQxj4EyDwwitJwzPwJakhDPwJ4JytpCoY+JLUEAb+BLCDL6kKBv6EcM5W0rAMfElqCAN/AkQE6aVXkobUM/Aj4qqI2BkRd5Vsj4j4aETMRcSOiDim+jIlScPqp4d/NXDKAttPBV5SPDYCnxi+LLVz0lZSFVb0apCZ34qImQWanAVck61LQW+NiAMjYnVmPlxVkYLbf/IEJ3/4v8ZdhqQK/Nmr1vKmPz5iyffbM/D7sAZ4oO35g8W65wR+RGyk9b8ApqenK9h1M5x73GF84+5Hxl2GpIqs2n/fsey3isDvW2ZuAjYBzM7OOgvZp7PWreGsdWvGXYakCVfFWToPAWvbnh9arJMkLSNVBP5m4HXF2TrHArsdv5ek5afnkE5EfB44EVgVEQ8C7wVWAmTmFcANwGnAHPBL4PxRFStJGlw/Z+mc02N7AhdUVpEkaSS80laSGsLAl6SGMPAlqSEMfElqiBjXl2NHxC7gJwO+fBXwaIXlVMnaFm+51gXWNojlWhfUo7bDMnNqkB2MLfCHERFbM3N23HV0Y22Lt1zrAmsbxHKtC6zNIR1JaggDX5IaYlIDf9O4C1iAtS3ecq0LrG0Qy7UuaHhtEzmGL0lavEnt4UuSFsnAl6SmyMyJetD6ft37aN2d810j3M/9wJ3AdmBrse5g4CbgB8WfBxXrA/hoUdMO4Ji29zmvaP8D4Ly29a8s3n+ueG0sUMtVwE7grrZ1I6+lbB991HYJre9E2F48TmvbdlGxn/uAP+n1uQKHA1uK9dcCzyvW71s8nyu2z3TUtRa4BbgHuBu4cLkctwVqG+txA/YDbgPuKOp63xDvVUm9fdR2NfDjtmO2bkw/B/sA3wWuXy7HrGuWjCowR/EoDuoPgSOA5xUf/stGtK/7gVUd6z6094AD7wIuLZZPA24s/pEdC2xp+4fyo+LPg4rlvQFzW9E2iteeukAtrwGOYX6ojryWsn30UdslwN90afuy4jPbt/jH+sPiMy39XIHrgA3F8hXAm4vltwBXFMsbgGs79rWa4occOAD4frH/sR+3BWob63Er/h77F8sraYXJsYt9ryrr7aO2q4Gzuxyzpf45
eDvwLzwb+GM/Zl2zZBRhOaoHcBzwjbbnFwEXjWhf9/PcwL8PWN32Q3tfsfxJ4JzOdsA5wCfb1n+yWLca+F7b+nntSuqZYX6ojryWsn30UdsldA+ueZ8X8I3iM+36uRY/eI8CKzo//72vLZZXFO0W+l/SvwInL6fj1qW2ZXPcgN8Hbgdevdj3qrLekuPVXtvVdA/8Jfs8aX3L338CJwHXD3L8R33M9j4mbQy/7AvTRyGBf4+IbcWXrwO8MJ/9Nq9HgBf2qGuh9Q92Wb8YS1FL2T768daI2BERV0XEQQPWdgjwRGbu6VLbM68ptu8u2j9HRMwAR9PqFS6r49ZRG4z5uEXEPhGxndYw3U20epeLfa8q620/VvNqy8y9x+wDxTH7SETs/Xbwpfw8LwPeCfyueD7I8R/JMes0aYG/lE7IzGOAU4ELIuI17Ruz9Ws1x1JZh6WoZZH7+ATwR8A64GHgH0ZUVk8RsT/wZeBtmflk+7ZxH7cutY39uGXmbzNzHa1e63rgpUtdQ5nO2iLi5bR6uy8FXkVrmObvRlzDvM8zIs4AdmbmtlHutyqTFvhL9oXpmflQ8edO4Ku0/vH/LCJWAxR/7uxR10LrD+2yfjGWopayfSwoM39W/HD+DvgnWsdukNoeAw6MiBUd6+e9V7H9BUX7Z0TESlqB+rnM/EqPv9OSHrdutS2X41bU8gStieXjBnivKut9jrbaTsnMh7PlaeCfGfyYDfp5Hg+cGRH3A1+gNazzjwv8fcZyzJ7Ra8xnOT1ojXn9iNakxt4JjCNHsJ8/AA5oW/4fWjPlf8/8yZsPFcunM3+C6LZi/cG0ziA4qHj8GDi42NY5QXRaj5pmmD9OPvJayvbRR22r25b/GvhCsXwk8yemfkRrUqr0cwW+yPyJqbcUyxcwf/Lruo6aArgGuKxj/diP2wK1jfW4AVPAgcXy84FvA2cs9r2qrLeP2la3HdPLgA+O8efgRJ6dtB37MetaY9VhOeoHrdn379MaW3zPiPZxRHFg954C9p5i/SG0Jmd+APxH2z+UAD5e1HQnMNv2Xm+gddrUHHB+2/pZ4K7iNZez8ITj52n9F/83tMbq3rgUtZTto4/aPlPsewewmflB9p5iP/fRdmZS2edafBa3FTV/Edi3WL9f8Xyu2H5ER10n0Pqv9w7aTnNcDsdtgdrGetyAo2idWrij+HtdPMR7VVJvH7XdXByzu4DP8uyZPEv6c1C0O5FnA3/sx6zbw1srSFJDTNoYviRpQAa+JDWEgS9JDWHgS1JDGPiS1BAGviQ1hIEvSQ3x/4tppPoWqYdUAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(union_item['count'].values[40000:])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "大概有75000个pair至少共现一次" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 新闻文章信息" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#不同类型的新闻出现的次数\n", + "plt.plot(user_click_merge['category_id'].value_counts().values)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#出现次数比较少的新闻类型, 有些新闻类型,基本上就出现过几次\n", + "plt.plot(user_click_merge['category_id'].value_counts().values[150:])" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 1.630633e+06\n", + "mean 2.043012e+02\n", + "std 6.382198e+01\n", + "min 0.000000e+00\n", + "25% 1.720000e+02\n", + "50% 1.970000e+02\n", + "75% 2.290000e+02\n", + "max 6.690000e+03\n", + "Name: words_count, dtype: float64" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#新闻字数的描述性统计\n", + "user_click_merge['words_count'].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(user_click_merge['words_count'].values)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户点击的新闻类型的偏好\n", + "\n", + "此特征可以用于度量用户的兴趣是否广泛。" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAAUlUlEQVR4nO3dfZBc1Xnn8e8zM3pBaCwkNBJCAiQbsKwEy8CYwoEihTG2wXGwY5dDditWHGrZsp3EjpNdw9q1dtXGu3YqNvFWsomJIaESyoGAMSQFwRhjezeJJY+MAAsEEuJFEnoZAXpBGAlJZ//oK2UkzfRtzfR097nz/VRNze3Tt/s+Z27rp9unT98bKSUkSfnrancBkqTmMNAlqSIMdEmqCANdkirCQJekiuhp5cZmz56dFi5c2MpNSlL2Vq5cuT2l1Fe2XksDfeHChQwMDLRyk5KUvYh4rpH1HHKRpIow0CWpIgx0SaoIA12SKsJAl6SKMNAlqSIMdEmqiCwC/a6HN/J3P25oGqYkTVhZBPo9q17g9oEN7S5DkjpaFoEuSSpnoEtSRWQT6F4pT5LqyyLQI6LdJUhSx8si0CVJ5Qx0SaqIbAI94SC6JNWTRaA7gi5J5bIIdElSOQNdkioim0B3Hrok1ZdFoDsNXZLKZRHokqRy2QS6Qy6SVF8mge6YiySVySTQJUllDHRJqohsAt0hdEmqL4tAd9qiJJXLItAlSeUMdEmqiGwCPTkRXZLqyiLQHUKXpHJZBLokqZyBLkkVYaBLUkVkEejOQ5ekclkEuiSpXEOBHhG/HxGrI+JnEfGtiJgaEYsiYnlErIuI2yJi8ngXK0kaWWmgR8R84PeA/pTSLwLdwNXAV4AbUkpnAi8D14xnoU5Dl6T6Gh1y6QFOiIgeYBqwGXgncEdx/y3AB5peXSGciS5JpUoDPaW0CfgT4HlqQb4TWAnsSCntL1bbCMwf7vERcW1EDETEwODgYHOqliQdo5Ehl5nAVcAi4FTgROC9jW4gpXRjSqk/pdTf19c36kIlSfU1MuTyLuCZlNJgSul14NvARcBJxRAMwAJg0zjVCEDyjOiSVFcjgf48cGFETIuIAC4DHgceAj5crLMMuHt8SnQeuiQ1opEx9OXUPvz8KfBY8Zgbgc8Cn4mIdcDJwE3jWKckqURP+SqQUvoC8IWjmtcDFzS9IknSqGTzTVHnoUtSfVkEumPoklQui0CXJJUz0CWpIrIJdIfQJam+LALdc7lIUrksAl2SVC6bQE/OW5SkuvIIdEdcJKlUHoEuSSploEtSRWQT6I6gS1J9WQS6Q+iSVC6LQJcklTPQJaki8gl0B9Elqa4sAj08f64klcoi0CVJ5Qx0SaqIbALdIXRJqi+LQHcEXZLKZRHokqRyBrokVUQ2ge750CWpviwC3WnoklQui0CXJJUz0CWpIrIJdEfQJam+
LALdIXRJKpdFoEuSyhnoklQR2QS609Alqb4sAt3zoUtSuYYCPSJOiog7ImJNRDwREe+IiFkR8UBErC1+zxzvYiVJI2v0CP3rwD+nlBYDS4EngOuAB1NKZwEPFrclSW1SGugRMQO4BLgJIKW0L6W0A7gKuKVY7RbgA+NTYk1yJrok1dXIEfoiYBD464h4OCK+GREnAnNTSpuLdbYAc4d7cERcGxEDETEwODg4qiIdQZekco0Eeg9wHvAXKaVzgT0cNbySaqdCHPYQOqV0Y0qpP6XU39fXN9Z6JUkjaCTQNwIbU0rLi9t3UAv4rRExD6D4vW18Sqxx2qIk1Vca6CmlLcCGiHhz0XQZ8DhwD7CsaFsG3D0uFYJjLpLUgJ4G1/td4NaImAysBz5G7T+D2yPiGuA54CPjU6IkqRENBXpKaRXQP8xdlzW1GknSqGXxTVFwDF2SymQR6OEguiSVyiLQJUnlDHRJqggDXZIqIotA9+y5klQui0CXJJUz0CWpIrIJ9OREdEmqK4tAdwhdksplEeiSpHIGuiRVRDaB7gi6JNWXRaA7D12SymUR6JKkcga6JFVENoHuNHRJqi+LQPd86JJULotAlySVM9AlqSKyCfTkTHRJqiuLQHceuiSVyyLQJUnlsgl0py1KUn1ZBLpDLpJULotAlySVM9AlqSKyCXSH0CWpvkwC3UF0SSqTSaBLkspkE+hOW5Sk+rII9NcPHGT7K3vbXYYkdbQsAv3nrx+gd2pPu8uQpI7WcKBHRHdEPBwR/1TcXhQRyyNiXUTcFhGTx6vIOb1TnOYiSSWO5wj9U8ATQ25/BbghpXQm8DJwTTMLG6o7ggMOoktSXQ0FekQsAN4HfLO4HcA7gTuKVW4BPjAO9QHQ3RUcOGigS1I9jR6h/ynwX4GDxe2TgR0ppf3F7Y3A/OEeGBHXRsRARAwMDg6Orsiu4KBH6JJUV2mgR8SvANtSSitHs4GU0o0ppf6UUn9fX99onqI25OIRuiTV1cjUkYuAX42IK4GpwBuArwMnRURPcZS+ANg0XkV2BZjnklRf6RF6Sun6lNKClNJC4Grg+yml/wg8BHy4WG0ZcPe4FdlV++r/QVNdkkY0lnnonwU+ExHrqI2p39Scko7VXZwQ3ZkukjSy4/q2TkrpB8APiuX1wAXNL+lYh47QDxxMTOpuxRYlKT9ZfFN0589fB2Dv/oMla0rSxJVFoJ86YyqAM10kqY4sAr2nu1bm/oMeoUvSSPII9GIMff8Bj9AlaSR5BPqhI3QDXZJGlEegHzpCd8hFkkaURaDvO1AL8hf37GtzJZLUubII9FNnnAD4TVFJqieLQJ86qVbmoSN1SdKxsgj0ScWHovv8YpEkjSiLQO/prn0o+tyLr7a5EknqXFkE+uzpUwCYMimLciWpLbJIyCk9DrlIUpksAn2ygS5JpfII9OJD0Uc27mhvIZLUwbII9ENf/T8020WSdKxsEnLJvDfw1NZX2l2GJHWsbAJ9z779nDjZyxVJ0kiyCfSz5/by6Kad7S5DkjpWNoH+2usHiHYXIUkdLJtAP/f0mezdf9ATdEnSCLIJ9JRqQf7Czp+3uRJJ6kzZBPpb5r0BgMHde9tciSR1pmwCfcYJkwBYs2V3myuRpM6UTaAvPqUXgE0vO+QiScPJJtB7p9aO0Fc8+1KbK5GkzpRNoE/u6WLxKb28+Ipj6JI0nGwCHeCkaZN4enAPe/bub3cpktRxsgr0S87uA+ClPfvaXIkkdZ6sAv3MvukA3PaTDW2uRJI6T1aB/stvrh2hv+KQiyQdI6tAn9LTTV/vFP7mX5/l1X2GuiQNlVWgA1z0ppMBvzEqSUcrDfSIOC0iHoqIxyNidUR8qmifFREPRMTa4vfM8S8XrjhnHgB/8t2nWrE5ScpGI0fo+4E/SCktAS4EPhkRS4DrgAdTSmcBDxa3x92Fb6wdoT+7fU8rNidJ2SgN9JTS5pTST4vl3cATwHzgKuCWYrVbgA+MU41HmHHC
JN6/9FQe27STex/b3IpNSlIWjmsMPSIWAucCy4G5KaVDiboFmDvCY66NiIGIGBgcHBxLrYf92rnzAfju6i1NeT5JqoKGAz0ipgN3Ap9OKe0ael+qnax82CtPpJRuTCn1p5T6+/r6xlTsIZcunsPiU3r5zqoXWPGM53aRJGgw0CNiErUwvzWl9O2ieWtEzCvunwdsG58Sh/f+pacCcNfDm1q5WUnqWI3McgngJuCJlNLXhtx1D7CsWF4G3N388kb2yUvPZOHJ01i+/kV+8GRL/y+RpI7UyBH6RcBvAu+MiFXFz5XAl4HLI2It8K7idktddOZsNrz8Kv/noadbvWlJ6jg9ZSuklP4fECPcfVlzyzk+X/rgOWzbvZdHNuzgtp88z0f6T6P2hkKSJp7svil6tCXz3sC23Xv57J2P8cLO19pdjiS1TfaB/vuXn82f/YdzAbhz5UbWbNlV8ghJqqbsAx3gjFknAvC1B57iD25/pM3VSFJ7VCLQz1kwg4HPv4srfvEUtu56jR89NcimHV5MWtLEUolAB5g9fQoLZ5/I9lf28dGbV/Cfbhlod0mS1FKVCXSAT112Fnd+/Je4fMlctu56jSe37Gb94CvUvsgqSdVWqUCfOqmb88+Yydlzp/Pinn28509/xDu/+kP+8VFP4iWp+krnoefo2kvexDnzZ/Da6wf59G2reGZwDzte3UcQzJg2qd3lSdK4iFYOR/T396eBgdaNbaeUOPvz9/H6gX/v4/VXLOY///KbWlaDJI1VRKxMKfWXrVfJI/RDIoIbP9p/+GIYNzzwFOsHvTCGpGqqdKADXPrmOfDm2vKty5/nH1Zu4DuramdojIAvvv8XuPqC09tYoSQ1R+UDfajPXfkWfvzMi4dv/92/PccjG3dy9QVtLEqSmmRCBfqli+dw6eI5h28/sHord6/axL+s2364racr+J+/ds7ha5dKUi4mVKAf7ROXnnlEmKeU+M6qFxh49iUDXVJ2Kj3LZTTO/tx9zJ4+mQUzpx3R3tUF/+U9izn/jJltqkzSRNXoLJdKfbGoGX7rooWccfKJdHfFET8/Xv8SP/TKSJI62IQechnOf7vyLcO2n/OF+/mnxzazfvvw0x7n9E7l8+97C11dXmBDUnsY6A1631vnseLZl3h887HnW9/92n4Gd+/lYxct5LRZ04Z5tCSNPwO9QV/+0FtHvO/exzbziVt/yg3fe4qZ0ybXfZ63LpjBVW+b3+zyJMlAb4az5/Yye/oUvrt6a9319u4/QO/USQa6pHFhoDfBmXOmM/D5d5Wu9+X71vDN/7uev/rR+oafu6sr+NWlp9LXO2UsJUqaAAz0FjprznT2H0x86d4njutxr+7dz+9edtY4VSWpKgz0FvrQ+Qu44pxTOHgcU//f/kff4+ENO7i7OP/MaJw+axrnnu78eanqDPQWmzb5+P7k82eewPfXbOP7a0Y/B35KTxdr/sd7iXBKpVRlBnqHu+sTv8S23XtH/fjbBzbwjR+u50drtzOlp3nfI+vuCpYuOInJTXxOSWNjoHe43qmT6J06+qssLT6lF4BlN69oVkmH/fdfWcJvX7yo6c8raXQM9Ip7/1tP5bSZ09h34GBTn3fZzStYu+2VwxcPaYepk7o5ZcbUtm1f6jQGesX1dHfRv3BW05931omT+daK5/nWiueb/tzH486Pv4Pzz2h+/6QcGegalZuWvZ2123a3bfvbdu3lf923hme2v8qSeTPaVsfRpk7q8sNntY2nz1WWtu1+jQu+9GC7yzjGh85bwFc/srTdZahivEi0Km1O71Ru+PWlbN01+hlAzXbHyo08tbV971okA13Z+uC5C9pdwhEef2EX//joC5zzxfvbXcqE9JnLz+ZjF03sWVcGutQk11y8iJOn1z/bpsbHXQ9vYuC5lw30sTw4It4LfB3oBr6ZUvpyU6qSMrT0tJNYetpJ7S5jQlr53Mv8YM02Lv/aD9tdyohuWvZ2Tj95fK+XMOpAj4hu4M+B
y4GNwE8i4p6U0uPNKk6SGnHNxYu4f/WWdpdRVyu+VT2WI/QLgHUppfUAEfH3wFWAgS6ppa5623yvM8DYLhI9H9gw5PbGou0IEXFtRAxExMDg4OAYNidJqmfc3wOklG5MKfWnlPr7+vrGe3OSNGGNJdA3AacNub2gaJMktcFYAv0nwFkRsSgiJgNXA/c0pyxJ0vEa9YeiKaX9EfE7wP3Upi3enFJa3bTKJEnHZUzz0FNK9wL3NqkWSdIYeLkZSaoIA12SKqKlp8+NiEHguVE+fDawvYnl5MA+Twz2ufrG2t8zUkql875bGuhjEREDjZwPuErs88Rgn6uvVf11yEWSKsJAl6SKyCnQb2x3AW1gnycG+1x9LelvNmPokqT6cjpClyTVYaBLUkVkEegR8d6IeDIi1kXEde2u53hFxLMR8VhErIqIgaJtVkQ8EBFri98zi/aIiP9d9PXRiDhvyPMsK9ZfGxHLhrSfXzz/uuKx0YY+3hwR2yLiZ0Paxr2PI22jjX3+YkRsKvb1qoi4csh91xf1PxkR7xnSPuzruzjx3fKi/bbiJHhExJTi9rri/oUt6u9pEfFQRDweEasj4lNFe2X3c50+d+Z+Til19A+1E389DbwRmAw8Aixpd13H2YdngdlHtf0xcF2xfB3wlWL5SuA+IIALgeVF+yxgffF7ZrE8s7hvRbFuFI+9og19vAQ4D/hZK/s40jba2OcvAn84zLpLitfuFGBR8Zrurvf6Bm4Hri6W/xL4eLH8CeAvi+Wrgdta1N95wHnFci/wVNGvyu7nOn3uyP3c0n/0o/yDvgO4f8jt64Hr213XcfbhWY4N9CeBeUNeNE8Wy98AfuPo9YDfAL4xpP0bRds8YM2Q9iPWa3E/F3JkuI17H0faRhv7PNI/9CNet9TOUvqOkV7fRaBtB3qK9sPrHXpssdxTrBdt2N93U7umcOX38zB97sj9nMOQS0OXuutwCfhuRKyMiGuLtrkppc3F8hZgbrE8Un/rtW8cpr0TtKKPI22jnX6nGGK4ecjQwPH2+WRgR0pp/1HtRzxXcf/OYv2WKd7+nwssZ4Ls56P6DB24n3MI9Cq4OKV0HnAF8MmIuGTonan2X3Cl54+2oo8d8nf8C+BNwNuAzcBX21rNOIiI6cCdwKdTSruG3lfV/TxMnztyP+cQ6Nlf6i6ltKn4vQ24C7gA2BoR8wCK39uK1Ufqb732BcO0d4JW9HGkbbRFSmlrSulASukg8FfU9jUcf59fBE6KiJ6j2o94ruL+GcX64y4iJlELtltTSt8umiu9n4frc6fu5xwCPetL3UXEiRHRe2gZeDfwM2p9OPTp/jJqY3MU7R8tZghcCOws3mreD7w7ImYWb+/eTW2sbTOwKyIuLGYEfHTIc7VbK/o40jba4lDoFD5IbV9Drc6ri5kLi4CzqH0AOOzruzgKfQj4cPH4o/9+h/r8YeD7xfrjqvjb3wQ8kVL62pC7KrufR+pzx+7ndnywMIoPIq6k9uny08Dn2l3Pcdb+RmqfaD8CrD5UP7WxsAeBtcD3gFlFewB/XvT1MaB/yHP9NrCu+PnYkPb+4gX1NPBntOcDsm9Re+v5OrVxwGta0ceRttHGPv9t0adHi3+Q84as/7mi/icZMhNppNd38dpZUfwt/gGYUrRPLW6vK+5/Y4v6ezG1oY5HgVXFz5VV3s91+tyR+9mv/ktSReQw5CJJaoCBLkkVYaBLUkUY6JJUEQa6JFWEgS5JFWGgS1JF/H85cMkmMcaqfgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(sorted(user_click_merge.groupby('user_id')['category_id'].nunique(), reverse=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从上图中可以看出有一小部分用户阅读类型是极其广泛的,大部分人都处在20个新闻类型以下。" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idcategory_id
count250000.000000250000.000000
mean124999.5000004.573188
std72168.9279864.419800
min0.0000001.000000
25%62499.7500002.000000
50%124999.5000003.000000
75%187499.2500006.000000
max249999.00000095.000000
\n", + "
" + ], + "text/plain": [ + " user_id category_id\n", + "count 250000.000000 250000.000000\n", + "mean 124999.500000 4.573188\n", + "std 72168.927986 4.419800\n", + "min 0.000000 1.000000\n", + "25% 62499.750000 2.000000\n", + "50% 124999.500000 3.000000\n", + "75% 187499.250000 6.000000\n", + "max 249999.000000 95.000000" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_click_merge.groupby('user_id')['category_id'].nunique().reset_index().describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户查看文章的长度的分布\n", + "\n", + "通过统计不同用户点击新闻的平均字数,这个可以反映用户是对长文更感兴趣还是对短文更感兴趣。" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从上图中可以发现有一小部分人看的文章平均词数非常高,也有一小部分人看的文章平均词数非常低。\n", + "\n", + "大多数人偏好于阅读字数在200-400字之间的新闻。" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#挑出大多数人的区间仔细看看\n", + "plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True)[1000:45000])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "可以发现大多数人都是看250字以下的文章" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idwords_count
count250000.000000250000.000000
mean124999.500000205.830189
std72168.92798647.174030
min0.0000008.000000
25%62499.750000187.500000
50%124999.500000202.000000
75%187499.250000217.750000
max249999.0000003434.500000
\n", + "
" + ], + "text/plain": [ + " user_id words_count\n", + "count 250000.000000 250000.000000\n", + "mean 124999.500000 205.830189\n", + "std 72168.927986 47.174030\n", + "min 0.000000 8.000000\n", + "25% 62499.750000 187.500000\n", + "50% 124999.500000 202.000000\n", + "75% 187499.250000 217.750000\n", + "max 249999.000000 3434.500000" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#更加详细的参数\n", + "user_click_merge.groupby('user_id')['words_count'].mean().reset_index().describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 用户点击新闻的时间分析" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "#为了更好的可视化,这里把时间进行归一化操作\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "mm = MinMaxScaler()\n", + "user_click_merge['click_timestamp'] = mm.fit_transform(user_click_merge[['click_timestamp']])\n", + "user_click_merge['created_at_ts'] = mm.fit_transform(user_click_merge[['created_at_ts']])\n", + "\n", + "user_click_merge = user_click_merge.sort_values('click_timestamp')" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_typerankclick_cntscategory_idcreated_at_tswords_count
182499901623000.00000043201252552810.989186193
22499981609740.00000241121132552810.989092259
302499851609740.0000034117182882810.989092259
502499791623000.00000441171252222810.989186193
252499881609740.0000044117121217172810.989092259
\n", + "
" + ], + "text/plain": [ + " user_id click_article_id click_timestamp click_environment \\\n", + "18 249990 162300 0.000000 4 \n", + "2 249998 160974 0.000002 4 \n", + "30 249985 160974 0.000003 4 \n", + "50 249979 162300 0.000004 4 \n", + "25 249988 160974 0.000004 4 \n", + "\n", + " click_deviceGroup click_os click_country click_region \\\n", + "18 3 20 1 25 \n", + "2 1 12 1 13 \n", + "30 1 17 1 8 \n", + "50 1 17 1 25 \n", + "25 1 17 1 21 \n", + "\n", + " click_referrer_type rank click_cnts category_id created_at_ts \\\n", + "18 2 5 5 281 0.989186 \n", + "2 2 5 5 281 0.989092 \n", + "30 2 8 8 281 0.989092 \n", + "50 2 2 2 281 0.989186 \n", + "25 2 17 17 281 0.989092 \n", + "\n", + " words_count \n", + "18 193 \n", + "2 259 \n", + "30 259 \n", + "50 193 \n", + "25 259 " + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_click_merge.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "def mean_diff_time_func(df, col):\n", + " df = pd.DataFrame(df, columns={col})\n", + " df['time_shift1'] = df[col].shift(1).fillna(0)\n", + " df['diff_time'] = abs(df[col] - df['time_shift1'])\n", + " return df['diff_time'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "# 点击时间差的平均值\n", + "mean_diff_click_time = user_click_merge.groupby('user_id')['click_timestamp', 'created_at_ts'].apply(lambda x: mean_diff_time_func(x, 'click_timestamp'))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(sorted(mean_diff_click_time.values, reverse=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从上图可以发现不同用户点击文章的时间差是有差异的" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "# 前后点击文章的创建时间差的平均值\n", + "mean_diff_created_time = user_click_merge.groupby('user_id')['click_timestamp', 'created_at_ts'].apply(lambda x: mean_diff_time_func(x, 'created_at_ts'))" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(sorted(mean_diff_created_time.values, reverse=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "从图中可以发现用户先后点击文章,文章的创建时间也是有差异的" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Looking in indexes: https://mirrors.aliyun.com/pypi/simple\n", + "Collecting gensim\n", + " Downloading https://mirrors.aliyun.com/pypi/packages/2b/e0/fa6326251692056dc880a64eb22117e03269906ba55a6864864d24ec8b4e/gensim-3.8.3-cp36-cp36m-manylinux1_x86_64.whl (24.2 MB)\n", + "\u001b[K |████████████████████████████████| 24.2 MB 91.0 MB/s eta 0:00:01\n", + "\u001b[?25hRequirement already satisfied: six>=1.5.0 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.15.0)\n", + "Requirement already satisfied: numpy>=1.11.3 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.19.1)\n", + "Requirement already satisfied: scipy>=0.18.1 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.5.4)\n", + "Requirement already satisfied: numpy>=1.11.3 in /opt/conda/lib/python3.6/site-packages (from gensim) (1.19.1)\n", + "Collecting smart-open>=1.8.1\n", + " Downloading https://mirrors.aliyun.com/pypi/packages/e3/cf/6311dfb0aff3e295d63930dea72e3029800242cdfe0790478e33eccee2ab/smart_open-4.0.1.tar.gz (117 kB)\n", + "\u001b[K |████████████████████████████████| 117 kB 96.7 MB/s eta 0:00:01\n", + "\u001b[?25hBuilding wheels for collected packages: smart-open\n", + " Building wheel for smart-open (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25h Created wheel for smart-open: filename=smart_open-4.0.1-py3-none-any.whl size=108249 sha256=50eb67320a58790e8b173971aeb6af7b636d48259d7c9de759612e58e334215b\n", + " Stored in directory: /home/admin/.cache/pip/wheels/c3/14/fc/a0e523e5d2f13d083ce0af09d4e2861d8e2ec65fc466fb1dff\n", + "Successfully built smart-open\n", + "Installing collected packages: smart-open, gensim\n", + "Successfully installed gensim-3.8.3 smart-open-4.0.1\n" + ] + } + ], + "source": [ + "# 安装gensim\n", + "!pip install gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "from gensim.models import Word2Vec\n", + "import logging, pickle\n", + "\n", + "# 需要注意这里模型迭代了10次(iter=10)\n", + "def trian_item_word2vec(click_df, embed_size=16, save_name='item_w2v_emb.pkl', split_char=' '):\n", + " click_df = click_df.sort_values('click_timestamp')\n", + " # 只有转换成字符串才可以进行训练\n", + " click_df['click_article_id'] = click_df['click_article_id'].astype(str)\n", + " # 转换成句子的形式\n", + " docs = click_df.groupby(['user_id'])['click_article_id'].apply(lambda x: list(x)).reset_index()\n", + " docs = docs['click_article_id'].values.tolist()\n", + "\n", + " # 为了方便查看训练的进度,这里设定一个log信息\n", + " logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)\n", + "\n", + " # 这里的参数对训练得到的向量影响也很大,默认负采样为5\n", + " w2v = Word2Vec(docs, size=16, sg=1, window=5, seed=2020, workers=24, min_count=1, iter=10)\n", + " \n", + " # 保存成字典的形式\n", + " item_w2v_emb_dict = {k: w2v[k] for k in click_df['click_article_id']}\n", + " \n", + " return item_w2v_emb_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "item_w2v_emb_dict = trian_item_word2vec(user_click_merge)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idclick_article_idclick_timestampclick_environmentclick_deviceGroupclick_osclick_countryclick_regionclick_referrer_type
25667190841199197150704527612941171202
25668190841285298150704530292041171202
25669190841156624150704663888541171202
25670190841129029150704666888541171202
107739164226214800150713140246441171212
\n", + "
" + ], + "text/plain": [ + " user_id ... click_referrer_type\n", + "25667 190841 ... 2\n", + "25668 190841 ... 2\n", + "25669 190841 ... 2\n", + "25670 190841 ... 2\n", + "107739 164226 ... 2\n", + "\n", + "[5 rows x 9 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 随机选择15个用户,查看这些用户前后查看文章的相似性\n", + "sub_user_ids = np.random.choice(user_click_merge.user_id.unique(), size=15, replace=False)\n", + "sub_user_info = user_click_merge[user_click_merge['user_id'].isin(sub_user_ids)]\n", + "\n", + "sub_user_info.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "# 上一个版本,这个函数使用的是赛题提供的词向量,但是由于给出的embedding并不是所有的数据的embedding,所以运行下面画图函数的时候会报keyerror的错误\n", + "# 为了防止出现这个错误,这里修改为使用word2vec训练得到的词向量进行可视化\n", + "def get_item_sim_list(df):\n", + " sim_list = []\n", + " item_list = df['click_article_id'].values\n", + " for i in range(0, len(item_list)-1):\n", + " emb1 = item_w2v_emb_dict[str(item_list[i])] # 需要注意的是word2vec训练时候使用的是str类型的数据\n", + " emb2 = item_w2v_emb_dict[str(item_list[i+1])]\n", + " sim_list.append(np.dot(emb1,emb2)/(np.linalg.norm(emb1)*(np.linalg.norm(emb2))))\n", + " sim_list.append(0)\n", + " return sim_list" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXQAAAD4CAYAAAD8Zh1EAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAACJRklEQVR4nOydd3hb5d3+P8/RXtawPOVty1kkhCwSNoSEpMwWymrpotCWskrnr5PSvm/X25ZCoZQOoItVyigbwiaMBBIy7STeey/Z2jq/P47lxImT2ImOEiv6XJcv29LROUeyfOs53+d+7q+QZZk0adKkSTP9kY70CaRJkyZNmsSQFvQ0adKkSRHSgp4mTZo0KUJa0NOkSZMmRUgLepo0adKkCNojdWC32y2XlJQcqcOnSZMmzbTkgw8+6JZlOWui+46YoJeUlLB+/fojdfg0adKkmZYIIRr2d1+65JImTZo0KUJa0NOkSZMmRUgLepo0adKkCGlBT5MmTZoUIS3oadKkSZMiHFTQhRB/FUJ0CiG27Od+IYS4QwixSwixSQixIPGnmSZNmjRpDsZkRuj3A6sOcP9qwDv6dS3wh8M/rTRp0qRJM1UO6kOXZfkNIUTJATa5EPibrOTwviuEcAgh8mRZbkvUSe7JyIcfMvzOO2gzM9G4XOO+SzYbQojDPsZgt5+aDV1IksBk02Gy6Ue/dJisOiRNulKVJk2ao49ELCzyAE17/N48ets+gi6EuBZlFE9RUdEhHcy/YQPdd/5+4jt1OrQuF5pMF1pXJtpMF5pMt/I9/vvYdxeSwTD20Eg4Su3GLra/3UZzVd8Bz8Fo0Y0TerNNhyljD9G36TGP/qw3aff7ITMwsJHevrcRSAihQQgNiNGf0SBGf0ZIo78rtzH6fZ/b9tjP3rcp+xCAACFGf5ZGz035EkLa4+fJ3CaNPTchtGg0psn/IY9WIiGofgaiEdCZRr/Mu7/rzbt/1xohAQOINGkSRVJXisqyfC9wL8CiRYsOqbNG5tVX4/rMZ4j09RHt7SXS07P7e08vkd7R7z09hGprifT0IAeDE+5LsloZzptJq3sJrQYvYfSYdSHmFgWoqNRjKSsi4vYQDAr8QyH8QyFGhsJjP/uHwvS2+mgeChEcjkx8DK0YFXf9PqP9Id3Xicj1h/IyTJpYVIuvdR6SJoykH0GjH0ajG0HSjyBpJj7nQ6Ws9GuUll6f0H0mFVmGZ74GG/4xyQeIPUTfMsEHgAn0ln1v01lo7XGQMe8UrIc4sEmTZiISIegtQOEevxeM3qYaQqdDl52NLjv7oNvKsow8MkKkt5doTw+R3l6G23upq4lQ02lhIGxBkqPkjOwkr+Nd7E0fImJRBoABAElCX1yMweslo7ISg9eL4UQv+uIKhEYzdpxoNEbAF2ZkcLfY7/MhMBiir22EkaEQkqEdw9kNdOxYgbX6IkwZWlx5Rlx5Zhx5Bhw5RhzZRnQGgSxHR79iyERBju2+jRjE75OjRCMhOup30bztI1p3bCfoW0g0ePaEr41GK6MzyeiNynedMYbOKKM3xUZ/jqE1RNGbZHSGKFpTFL0xis4QRdLFFAFERiZGf9/71NbdjsOxGKfzxIT8nZPOe39UxPzkm+GEqyA8DGE/hEdGv+/xc+gA94VHINAPQ23j7w8NgxylIbiAZ/q+S8WHr7Lyh5890s86TQqRCEF/CrheCPEQcCIwoFb9PE406p/05b0QAmGxoDOZ6RiysL1WS+1GiWgkRlaRjdNOysO7OAejZQVwHXIsRnRggEhnF6G6WoI7dhDcuZNgdTVDL700KmIg9Hr0FeUYvV4McaH3ejEX5B60ji/LMjW1f+IL7xgYtL/LDavOpKhvHr0tPra9OUgkFBvbNsNtxJVvxZVvIdNjITPfiiPHjEa7u44fi0Vp2b6VqrVvsOO9tQSGBtGbzFQsXk1b/fGYM7UMdT6Dr3+YisWnUzxvCaFAjOBIhOBIeNz34W7l51AgesDnoNFKGMz
a0S8dBstC9J4QW7fdwolLnkWns0/q73PUUPMqvPBdmHEuLP8RSOrMk/Q29fPirzciE6Oly4ksywmZ90mTBkAcrKeoEOJB4AzADXQAPwJ0ALIs3yOUd+PvUZwwI8DnZVk+aOrWokWL5EMJ52ps/Cu1dXdw6invotEYD7r9YI+fqnfaqVrbxlBvAINZS+WJucw6KY+sQtuUjh3z+wnW1CoCHxf6nTuJdHSMbSPZbGPibvB6MVQq37VO57h9/eW187i9oYEsUxZ9wT7uOPMOTi04FTkmM9jjp6dlmN7WYXpaffS2DtPfPkIspvytJElgzzFhsvQQ9G2np2kjAV8/WoOB8oUnMvOk0yg5fgGtO4f4750fsfLqORQfZ+OV++9l62svk1tRycdu+AbO3Pz9P9dojJA/SmAvwR/7eXj3bYGRMH0NnYRjWgrP+gkFFfM4bs4d00eoemrgT2eBLQ+++BIxnR4h9Ak//4AvzKO/WE84GGV25od8UHccn/rxUhw55oQeJ01qI4T4QJblRRPed6SaRB+qoP+u+l1+2yKzdu4I+VnLJ9wmEo5S91E3299upWl0grNwppNZJ+dTerwbrU4z4eMOlejAwJi4B3fuJLBjB8Gdu4gNDIxto8lyK6N5byWaGR6uC/+M9qiVJz7xPF9+6cvU9Nfw++W/Z1n+somPEYnR1z5M3cbt1Kx/m47aD4gE+wENkq4UjX4Geks5mflOXB4rmfkWajZ00dc+zGf/9yR0BuVirPqdt3jpT3cSi8Y46/NfYs7pyw9fuGpeYej+a/h3z6/AbCD/rO8x94Tvk593yeHtNxkEBuEvK8DXAde8Qkd4G9u2fxNZltHrM0e/3Oh1buX72Ffm2M86nXN0onj/RKMx/nvHRtpqBvj4LQswbrqXfz6/kDOu8DLn9MIDPvZwkKPRcaXBNNOflBL0f9ft4hdVO/nfgvWsmPuDcfd1Nw+x7e02drzXTnAkgs1lZOZJecxclktGZnIdGLIsE+ns2mc0H9y1i7rFw3z7FD0XbY5xXcblGL51A1e/eDVNg0384ew/sCh3/N+qp7mJqrVvUL32DfraWpA0GornncCMZadSNHcRIwNCGcm37B7RjwyGAIhKfvqzNnLDjTfgzMwAYLC7i+fu+jXN27ZQuexUVnzxqxit1kN+rsM/XUn3mkZCFWW8or0Bs6sHz1m/Yemy/2A2lxzyflUnFoOHroSdL8JV/2EoK5v1H3wSq3UGTucyQqHuvb56kOXwPrsRQoNO55pA7LPGbvvwv1p2vDPCWZ+dwaxlHuQP/sb9fzHjmVvEyq8sSfhTk8NhWr/9bQJbt1H29H8ROl3Cj5HmyJBSgt744v/iWfsr6rJyKDv3z4TcS9i5vpPta9voahxCo5Uom+9m1sn5FMxwIqSj67Jfjkb57nOrebanjT+/XIGzzUfFyy/R4+/h8y98no7hDv644o+UyLlUj4p4V2M9CEHRnLnMOOk0vEtOwmTLOOBx3nmihg+fb0CUtNAZqOH8FZew8OTjxu6PxaKse/Ix1j76TywOF6uvv4XC2XOn9FxCzS10/uT7DL3+LkEdGMLgX3g879i+iNOzkYpz3mbRokeQpKNUTNb8BN78P1j9S0InfJJ16y9ClqMsXvwkBr17n81lWSYSGZxA6Ee/wj3jfo/FFHdV384z6NjwKVwznif7+MfR6ZxkDWlpevEcWrWn8tlfnZnQ8o4cDtNyyy0MvfQyAIV//jPWU05O2P7THFkOJOhHrMHFoSLk43j9o3ksrKhGuv9cBiMVtPnOB/dKTr2sksolORgtR6mAAMGYn9f62ljkzKV44en0/OnPyOEwmaZMbl/8S6555Ut88enPseLdLNwDBvIrZ3Hm575E5dKTsTpdkzpGLCaz8/0O8isdbPe/C0BnW9e4bSRJw4kfv5TiufN55s5f8cht3+XEiz7JskuuRKM98NsiNjxM95/+RO9f7yMqh3nsVMGzS/VcsSbCyg8+Ys7xj7OVi2l
4q5XMzN9RXv6NQ3ux1GTLY4qYn3AVsUVfYMumzxMKdbFwwcMTijkoE+w6nR2dzo7FUn7A3cuyTDTqo35LE9UftZM3I8bii+cRjuQTCnXTP/IE+fot7Bw8lYFOf8Lq6HIoRMvXv87QSy+T/Y2v0/2Hexh87tm0oB8jTDtB3/Hkyzhr+miuctLkzadwdoiVjt+C6SHQXgPi88DkhO9I8OT2e/HF4JPei9CHcwkK+OCxh9m1fTMtVVs53WjixVNGePWUAX5/8u0sKj9pysdo2tbLUG+AspMsfLReKb309Ey8WCq3opKrfnEHr95/L+89/ggNmzbwsRu/OeGEqRyLMfj003T+36+JdHbSsayMHy1ooCwzk0fP+wefsV4F2TIrnn+FoflZNO44j81r7sPleu/osjK2boQnvgqFJ8K5v2ZX7S/p63uH2bN+RUbGvIQcQgiBr0fDa3/rxplr4byvLERv3P3vtnGkiXzD+wC07OhLiKDvKeY53/0urs9cRWDHDoZeXoP8ox8h9PrDPkaao5tpt4b9pVMNXP8VmdfPOAGp3k/rUzFaOy4kpCmDNbfBb2bDf2+Grh1H+lQn5NFdT5GtlVnu/RybmutYM7uY1x9/iOCwj5Mvu4obfvFXHrr8cWxmO7es+w47+3ZO+Rhb32zBZNPRF2tCp9OhkQ0MDPbvd3u90cQ5X76J87/2Hfrb2/j7t25ky6svsWc5zr9pEw1XXEnrt76NJiuL5759Kjec0cgp0gj3nvMXCjMK+dFJt/LnE3rZ9uXllG15jMzBbbR+eBXvv3wH4fDAfo+fVHyd8NCnwOyCy/5BW9ezNDX9lcKCz5GX94mEHSY4EuaZuzchhODc6+aNE3MAo6UEo7kTs95Py47+wz6eHArRPFpmyfne93B95ioAMlavJjYwwPA77xz2MdIc/Uw7Qb/5vFsYMmv405lWOm4LYb18NYNrN1Hzp0ZaBz5NKP9c2PgvuGsx/OMSqHllzDt+pKnqqaJ6qIcV2SUIWc9HH75Lps/Px5efz2f/7y6WfuIynLn5FNgK+MvKv6CVtFzz4jXUDdRN+hi+viD1m3uYuSyXHTurqaiowKS1MewfOuhjK5eewlW/vJPcci8v3PM7nr79F/jq6mj99neov/QyQq0t2G/7Pj/+gpn7pHe4YdDPT7NOQ5dZAcAZhWdwftn5/NT1FuIX32FO9d8x+ztpeOVyNqz9H47UfM0YkRA8fBWM9MDl/2Iw1k5V9XdxOpZSUfGdhB0mFo3xwp+3MtjtZ/WXjyPDve+EvNlURMAkyLfU0rqz/7BeGzkUovlrt+B7eY0i5ld9euw+y8knI9lsDD73/CHvP830YdoJeq4tD5d2IWH/OvotGkKfyqX8pRdxfupKBte8Rc1v3qdt+LOE5t4IbR/B3z8Ody+DDx5QVusdQf659V50QuYi78U0bfmIUCBAaZ8P65Bvn22LMor48zl/Rkbmiy98kabBpgn2uC/b17Yix2TcXi1DQ0PMnDkTqzmDQHR4Uo/PcGdxyQ9+yqmf/BSx/z5D/XnnMfDMM2Recw26h//INdp/saV3K7/KO5tre7oQJ9847vHfXvJtXEYX348+Rsl993JCzf0QirHt4fnU7/r3pM5BFWQZnv06NL0LF91F0F3Aps1fQa9zc9xxdyR04vbtx3bRtK2X06+cQb7XOeE2JlMhfqNEvuYDhvuDDHQd2ntzTMzXrCHn+98fJ+YAkl6P7eyzGVqzhlgodEjHSDN9mHaCvu2NjeSFlwJRXh/Op7PrJXTZ2eR+97uUv/QSzssvZ+CZF6j50RO0DV5G+JSfg6SF/94Iv50Dr/wUhtqTft6+kI/nG17lBFOU/L4KOn/7W07b0Uz2kJ/Bl15mZN065Mj4bJUyexl/XvlnQrEQV794Na2+1gMeIxaT2fZWKwUznTR11iKEwOv14rA7iIkQI76Di4Ysy/heXoPrnr9S2drNQKaT17we/pszzKdfvwZf2MdfVvyRVVtfguJ
TwDM+/t5usHPrSbeys28n90ffYNaD97Gg+e/4wy7eub2Hwf6aqb94ieD9P8GHf4NTbiE2+wK2bLmBcLifefPuQa/PTNhhtr7ZwqZXmjn+rEJmn7z/hVtGUxF+o4YClDp66yGUXfYR809/asLtMlavIjY0xPBbb0/5GGmmF9NO0AM9wxiGYgTNi3lzYIBe3y6Gh2sB0OVkk/v97ykj9ks/ycATT7Hra3+grWMF4Y/9XZkEe+P/4LfHweNfhrZNSTlnORrllf/+nsteDHLjbzW0XHEt1vc/RONyobHZCDc00HDVZ9h58im0fvvbDD7/AlGfMqL2Or3cu+JefGEfX3jhC7QP7//DqHFrD76+IHNO9VBVVUVxcTFms5lMtzJJ3Fzfsd/HAgSqq2n83OdpufEmJJOJovv+yuIXX6L3YxXcEX4E3VCUuxf9mvld9TDQBCfdMOF+Tis4jQvKL+Avm//CTtMg8x+8l3nd/6YvWsma7z1GNJrkkWLt6/D8d6ByFZz1A3bu+h/6+99n1syfYbPNSdhhWnb08caDOyia7eKkiw/sgjEZCwgYNTg0LZisEi07DpzwuTdyKETzzV9TxPwH+xdzAMvSpUh2O4PPPzelY6SZfkw7QZ81exYLh+34M84jRIS1Pi1dXS+O20aXm0vuD39I+Ysv4Lj4E/Q/9hg1V3+f9p2zCV/+Aiz6Amx7Cv54Ktx3LlQ9A7EDZ5dMFTkUwvfmW7T98EfsPO10vN97gJUfyhi9hRiuv441c0qw/s9t2Fadg7DZ8Pzud1jPOB3fa6/TcvPN7Fy2jMZrrqXvwQepiLj449l/pD/Yzxdf/CJdI10THnPrm62YMvTYCyW6urqYOXMmADn5WQC0t3RO+LhIXx9tt95K3cc/QbCqipwf/oDSx/+DaemJ3LPtTzykf5051plcsK6QNT/+Gf4XforsrgTvyv0+/28v+TaZxky+//b3idmtLPvH75gx+Byt0UWsuf6nyaun99bBo5+FzAr4xJ9obX+M5ua/U1T0RXJzL0jYYQa7/Tz/xy1kZJlY+cU5B83M12qthK0OhABPfpiWHZOvo4+J+SuvKGL+qf2LOSi5Q7YVZ+Nb8wqx/SSPpkkNpp2ga11GzvblE9GXYooV85rPSGvnxBM+urw88m69lYrnn8N+0UX0PfIINZdeS/sGB+Gr3oCVP4X+BmW14J0L4d17IHjwycP9EfP7GXzpJVq+9S12nHwKTddcw+DTTxOcV8FvL5J47kdR8n//G+qtemSTiZL5C9EXFCIPDmJZeiL5v/gF3rffovjvf8P5qU8Ramig/ce3sev0M7B++cfc27wCQ20b17zwRXoDveOOPdQboGFzN7NOymPHzmoAZsyYAYCnOAeArs6ecY+Rw2F6//Y3as5ZRf+j/8Z55ZWUv/A8riuvJCSifPuNb/OnzX/iE95P8MDF/+Kan/+BueUWTL46PvKV4R/Zf10+Q5/Bj076Ebv6d3HPR/cgWSycdd9PKAi8y075DN655hvI4X1XXSaU4JDyt5VluOJBBoK1VFX/EJfzFMrLvpmww4T8EZ65exOyLHPudfMwmCdZj3eWAuDJ7GG4P8hg9yRKYqEQzTfdPGkxj5OxajWx4WGG33xzcueWZloy/QTdaSQjKrDHQK85jcFojFfatxEI7r8UofN4yPvJbZQ//xwZF5xP37/+Rc35l9Dx+jCRK1+GT94Plix4/tvwmznwwvegr2FS5xMdHGTgqadovuEGdiw7iZYbbmT49TewrVhBwR/uxvvOWv5+eTYfHadjSV42FssMdr7/DsXHL0BvNKErLAAg1NwMgNBqMS9eTM53vk35C89T9vR/ybrlFiUy+L7H+OmfRvj6z3byxHXn0fHai8ijE13b325FBuackk91dTW5ubk4RwPBnFkZCFlDX+/uy3rfm29Se+FFdPzvzzAddxxlTz5B7ve/h8bhoMffw9UvXM3z9c9zy8JbuHXZregkHbZMN2eUDxPWZvDG5mH
+9s3radyy/7LVaQWncVHFRfx1y1/Z2r0VyWDknN9fS2a0io3SSjZ8/gZiw5ObrJ0ysZhSVuuqgk/eR9BqY/Pm6zAYcjjuuN8hSYlZghGLybz01630tY9wzrXHTclPrrOXE5Uk8s27AA5qX4zFxfzVV8n54Q8mLeYAlqUnonE40m6XFGfaLSwSVY+j0eopjGQQtMzF1WfjlaEYn2p/jtLizx/wsfqCAvJ/+lPc115L9x/uofcf/6Dv4YdxXnEFmV98CK2/Dt69G979A7xzFxgydneo0ZuVJgZ6M5GgjqFdfoa29TG8qxdiMlqHGccpM7EtnYN57gyEKQN0Wrpb3ualhhc53RSi0H4ynTu24Ovp4pTLFJ+wvlAJZgo3NWOaM76eK4TAUFGBoaIC97XXEOnuxvfaa4SefYwF72+kd+1N9FktWE49na3yxyj02pGMURobGzn99NPH9iNJEnphZmh4gGBdHZ0//wW+119HV1xEwd13Yz3zjLGl57X9tVy35jp6/D389ozfcnbxHlnqHdsQu15Gd+b3uezKj/PMHb/i0Z9+j8UXXMzJl34KjXbfkek3F3+Tta1r+d5b3+OR8x/BaHWz/EcLePYnu1hn+Bj6T32RmX/5PdrMxE1MAvDaz6DqaTjnZ8RKT2Hzhk8RjgyyaNG/0ekcCTvMu0/UUL+5h9Mur6Rw5tQWtJnMRfiNAkd4Cybb6bTs6NvvRGosFKJlTzG/8sopHUtotdhWrmTg6aeJ+f1IphToLpVmH6adoBMYRBv1UTCSzyanmxPXu3lr5hD/2fQAXz+IoMfRFxWR/7P/xf3lL9F99x/ofeAB+h56CNenrsR19a/RrrgNPnpIWYQSHobQCOGufoY+7GKwqh1/izIq1tlkMmeFsOUPYnSFEWIX7HoWdu0+1uP2DCIuBzft6KB06z3APXxtJoi3NsD7FvQzlDpuuPngtkSt243jkktwXHIJb+x6mQceuIXljUaKtvczUiIof/K31L8dxGvQ471gfH04Q+gpfO8Vah/8A5JeT/Y3v4nzqk8j7bF68N22d7nl1VvQa/Tct+o+jnMfN/4E3rkLtCZYfDU5ZhdX/fx3vPq3P7HuyX/TuHkjF37z+9hc45fNZ+gzuHXZrVy35jru3ng3Ny+8mayCpSz5wjreus/K+xkXIF32KSr+8kf0xcWT+vsdlK2Pwxu/hPmfgqVfYUf1DxgY+JDjjrsTm3VmYo4BVL3bxoYXGznuNA9zzyiY8uNNxiL8RglzXw2eSieto3X0vXNdYqEQLTfehO+118j90Q9xXnHFIZ1vxupV9D/yCL433iTjnP3Pf6SZvkw7QX+338EMsZO8gQhrnAZK6mJsm6VnzVAXKza9w7x5E8fPToS+uJj8X/yczC99ie4//IGev/yV3n89iOtTn8L1hauJ9vYy9NLLDL30EoGtVQAYZszAff0KbCtWYKj0Kv98sry7U01oePT7CNHQEI++9wPmEyJYbidWfAsfPvEwFouBWQuXQPcOpA//TIa3kFBT85Reh9MqziZ8zW/5+mtf55O7TiPPJ6g8fyGtTz7Ogq5uhq78FLWVlVjPOhOtK5NTn7gfrX8E+8UXk/21m9G6xwvvYzse46fv/pQSewl3Lb+LfOteI8Whdtj0MCz8nLLKEtAZjay89gZK5y/k6dt/wcbnn+bUKz+3z7meWnAqH6/4OPdtvY/lRcuZmzWXmUuuo7vtBrY8fyEbsj+OdPkVFN/7R0xzpxYQtg9tm+CJ66BgCZz3W5pbH6Sl9UGKi79CTvbHDm/fex6mZoBX/1GFZ4aTUy7zHtI+TKZCfEYNdDWRf7KDXR90Mtjtx561u2yTKDEHMC9ejCYzk8Hnn0sLeooy7QS9SeQRJki+TyYsg2X2fJb2wbOOBh5971fk5t5D9iRa0+2JoawUz69+qYzY77qbnj//mZ777oNRX7hp/nyyv/lNbCvORj9RD0ghlJKM3gyW3UL5ZtNrtIX6OS/HSNjzMXod5/N
6/fOc/cXrYMXHlJWL95xC9txG2psnvxo0zvKi5fzk+J9Tv1ZL3YzNmL/0RZ4NjHBiURELojF8r7xCz71/gliMcGEpr86q4PLrvobWvbu8EZNj3P7h7dy35T5O9pzM/532f1j1E0TpvvdHiEVg2XX73OVdchKOnDz62vbvk4+XXr7/9vd55PxHMGgMLF71fYa6/x916z/L9oLzkK+6isI77sB62mlTfi0AGO5WlvUbHXDZP+j3bWbHjtvIzDyd8rKvHdo+J2CoN8Bz92zC6jSy6trj0BzE0bI/TKZCuowapPAwniJlHy07+scEPRYK0XLDjfhef53cW3+E8/LLD+u8lbLLCgaeeJLYyAiSOd1YI9WYdpOi73Qb+Fz0DBw+RWyNJ5yI690YNo1gs6mBhx56iEAgcEj7NpSX4/nNryl76kmcV15Bzg9/QMXrr1Hy0INkXv2FicX8ADxc/TBuo5OZul7c7uXsfH8tCEH5oqXKBlo9nPcbdHo/FvH+IZ1zdkMlEoI1lsf46otfJRgJUn7KKWR+/nMU//1veN9+i5JHHyHwzdvoc7lobdztRfdH/Hz9ta9z35b7uGzGZfz+rN9PLOZBH6z/C8w6H1xlE56HIy+fvvb9C7pNb+PHJ/2Y2oFa7t54NwBGYz4LL7iMvMrHac1cSnPpmTR95Sv0P/afqb8QkRA88hkY7oTL/0lAL7N5y1cxGj3MmX07QiSmyUMooDhaouEY514377CSPQ2GXAImpeTl1LdisunGFhglWszjZKxajez343v99YTsL83RxbQT9HPn5RNFom1YqWPL5TPRxiSWm/OpiYao9+/i8ccfJxaLHWRP+8fg9ZL73e/iuvJKdDk5h7SPpqEm3m55m+XZxWiEhNt9Bjvff4f8ylnjY3BLTsGvPR5nfity25YpHSMWjbH97VaK5rj5xpk3sXFgI+vy1pFfsLtconU6Mc2dS15RLgAdozG63f5uvvD8F1jTuIZvLf4W3zvxe2j35/zY8A8IDMBJN058P+DMzaO/ve2AXuqTPSdzsfdi7t96P5u6FHdMTvbHmHGOicz8t9mRcz79lYtp+9736P7DH6bmVX/uW9DwNlzwe6K5c9i8+TqiUT/z5t2DTnfg7PjJIsdk1jywnd4WHyuvOQ5XnuWw9ieEhOzwKD/3N5LvddKyo49oMLiHmN+aMDEHMC9aiCbLzeCz6UVGqci0E/QzKrNw4mNzOIwG6DFaceUXMLs5F6OQ6a1ooLq6mjePsN/20R2PIgmJRfouHPaFDPcG6aqvxbtk3xp/sOyzxMIC+cmbphQkVr+5h+GBEHNOzefCsgtZPLCYFmML3137XSKx8TECnuJskKGnu5cdfTu48pkrqRmo4Xdn/o6rZl+1/wYL0Qi8excULoXCxfs9F0euh0goiK+vZ7/bAHxj0TfINmfz/be/TzCqLHKZMeOHFJz1OnZnDR/lXE5k3iy6fncH7T/+MXJ0Egu+1v0ZPrgPTr4Zee4lVO/4EYODHzF79q+wWg6tvj0R7z9dR+2GLk6+xEvxnMS4coRz9IqnvwFPpQNfX5AdN/4/Rcx//GOcl1+WkOOMHU+jIWPlOfjeeGNsNXKa1GHaCbpGIzFT1846wuSGoDEQomLJMtre7eFUm4YPfbV45nh49dVX2blz6tGziSAYDfL4zsc5zbMUXXAn7qzl7Hp/LcCEgq4rnU3nRxlI7evhowcnfZytb7ZgcRgomZtJY2MjRb1FfKbwM7zU8BLffeu7RPdY/Wow6dHIRjb7N/CZ5z5DNBbl/lX3c2bRmQc+yPanoL8RTt7/6BzAkZsHQH972wG3s+qt/HjZj6kbqOOuDXcByqrJufN+Tfbpf8BgGeb9zKswnOSl/6GHab7xJmL+Ayy4qX8Lnvs2eM+B5T+kueUftLU9SmnJDWRnnXPg5zYFdq7rYP2z9cw6OY95Z03d0bI/DLZywjoJua+BvBJlxN9a3aeI+WWXJuw4e5LxsdXIwSC+115TZf9
pjhzTTtABZtgjBBDY2kdo8AepWLwUOSpztm0mGqAlr4mcnBwee+wxent7D7q/RPNi/Yv0B/tZkaX842e5z2bn+++QXVKOPTt3n+11hYX015qJmMrhxe/DyMHPebDbT+O2XmadnIekkaiqqkKj0XDjyTdy84Kbea7uOX609kfE5N2lpwZnA8/an6DQVsg/z/0nszNnH/ggsgxr7wRXOVSuPuCm8YYYB5oYjXOS5yQu9l7MA9se4KOujwCwZxxP5cwvknvar4npTLxrugTnuTPwvfIKjZ//ApG+CbJO+uqVOFxXGVz8J/oG1rNz509xu5dTWnrgD6Cp0FE/yJq/bSevws7pV8xIaLs4k6kQv0Ei2rWT4Z99B11oiOAZn1RNzAFMJ5yANjubwefSZZdUY9oJ+vBAkGJzPlY5QqTDT4M/RG6ZF6srk1idncWWME/V/ZezL1QWxDz88MOEkhwb+nD1w5RklOCJ7sRsLiUWtNO6Y/uEo3NQsmfQaBmQVoC/H16+9aDH2PZ2KwKYfXI+sixTXV1NWVkZBoOBq+dezXXzr+PJmif56bs/JRKL8Mt1v+Q911vkjuTxwKoHyLXs+8GyDw1rofVDOOl6kA78VrG53Wi0WvoPMDG6J99Y9A1yzDl8/63vE4gok9jFxV8iu7CY/FPvYUDO593wGeRetYDAtm00XPkpQs0tu3cQ9MGDVyoZPJc/SIBhNm+5HpOpmDmzf40QiXlr+/qCPPuHTZhtelZ/aS4abWL/ZUymIkZ0Ei0P1jLy+hvkevR0heyqZt0IScK26hyG33iDqG/f6OY005dpJ+hbXm9hZFMOn/XpcHSG6QmEGInJVCxeSu3bXSy3a4nEIjzb/iwXX3wxHR0d/Pe//01aGFRVbxUfdX3EJd4L6e9/D7d7ObvWKX09vSdO3E5OaLXo8vMJtIVh6Vfgwweg8b39HiMajbH97TaKjsvE5jLS0dFBf3//WBgXwJfnfZkvzv0ij+54lPMfP5+/b/s7iyLLWNa5FBGe5J997Z1gzoTjD+59liQN9uzcg5Zc4lj1Vn580o+pH6znro1K6UUIDXNm/xprTgMlp75CS2geG7qKKLzuNCI9PdRfcTkj69crcQdPfBm6tsMn/0rUWcCmzV8mFgsxb+49aLW2yT2/gxAJRXnunk2EA1HO/eo8TLbEt3AzSrkMPG9npAlyf3wrpcvn4usNMtRzaE6tyZKxejVyOIzvlVdUPU6a5DLtBH3BqmIKludhk6Os9un56jMDvP1SAyXHLyXsD1NoOI75Fg0PVz1MXnEeZ555Jps3b+b99w/NFjhVHq5+GKPGyMlOJ7Icxu0+m53vr8WZX4DLU7jfx+kLC5TFRWf8P8jwwNNfg+jE4VX1m7oZGQwx51TFIVFVpSx6iodxgRIbcOMJN/K5OZ+jbbiN7574XS53XoVA0HKQGF1AaeG34zlYci3oJrdM/GDWxb1Zlr+MT1Z+kge2PsDGzo2AYmWcOeOn6HMepnxZG9v9Z1O1y0fJN89FaHU0fPoqqhcuoO5379DafCa977ZT9Z+v4OvcxnFzfovFMrGtcqrIsswrf9tOZ+MQK74wm0zPBHbOwyQWDNL77duJ1enIXdyPc9XJ5HsdAFOO050qpuOPR5uXl3a7pBjTTtB1eg1LVpawzPEXnjIHGZZkdj5Zzyv/GMKYcSZ9O3M40zrMUHiIR3c8yqmnnsqMGTN44YUXaGiYXODWoTIUGuKZ2mdYVbqK4MA7aLUODJoKmrZuwrtk2QFrr7qCQsLNzWCwwupfQudWJVNmAra+2YrVaaB4jmJ/rKqqorCwEKt1vOgIIfj6oq+z9oq1XDHzCnLylUVP7S0Tx++O453fg9YIi784yWc/Oevi3nx90dfJs+Txg7d/MFZ6yck5l7zci9EW3ErxPA3v+q6i6YO1lP74cvJvugxn+QAaVza+Le10/PR/kH7wLnlf19F/xc9pvuEGuu66i6E1awi3tBzyldkHz9Wzc30nyy4
qp/T4rEPax4GIBYM0X38DI2+tJfoJgbN8BPoacOVZMFp1h9TwYioISSLjnHPwvf020cFBVY+VJnlMO0GPxCK4LTr6tWGsBsE/7SE0nymlcKYTNPOpX3ceuo2fZal0HH/f9ncicoSPf/zjOBwOHnnkEQZVfPP+t+a/+CN+Lq28hJ6e13C7z6Duww+RYzG8SyYut8TRFRYQ7etTapozz1UmIV/7OfSPz3gZ6PLTtK2XWSfnI2kk+vv7aW9vH1du2RuzTlkR6ClW6uZdHd0HfiK+TiXLZv6V41a+HozJWhf3xKKz8OOTldLLnRvuHLu9svKHmM0F2I+7lZwSMy8P3kL3C3/G3nMPOedWUvT4q2Q+/Us6fhYl8v/m4b75JozHzSG4cxfdv7+L5q9ez67lZ7PjxKU0XPUZ2v/3f+l/7D8Etm07aCu2mg2dvPdUHZUn5nDCyqktJosTCwYJt7Tg37SJoVdepe/RR+m+54+0/8//0nLLLdRfcgnDb75J3k9/QmzlaIZNXz1CEni8joQ0jj4YGR9bDeEwQy+vUf1YaZLDtFv6/0ztM9zz0T3MdcGZ7TE+GJHZoony5S/N5aM1b/Pa399lqGkx8+tPwmXfzr+dz3HF2Rdw2WWX8ec//5lHHnmEz33uc2i1iX3qsizzSPUjzMmcQ4EuRGe4D7d7Oe88sxZbZhY5ZRX7POaf7zUQjcl8ZlnJ7tTF5mY0M2fCx34Jd52odNq5/J9jj9n2VitCwOyTFZtgdfX47PMDkZljV2J0J3KM7Mn790I0BEu/OtmnD4y3Lu4d0nUgluYt5dLKS/n7tr9zdvHZnJB9AlqtlTmzf8sHH15K6fJ/43/i4zw78AMutv0K00V/Y7i/nQ0f/AitZSHZC35HTDYiLY+iC8fA5yfQ2EKguY1ASwehji6Cb/UQe/1topr3iWn0CEcmOFxgc4DFhmy0EENDJBSlp3WYnNIMzvz0zLGrKlmWiQ0PE+3pIdLTQ6S7W/m5u4dIbw/R7tHbe7qJdvfsNxZYslrRZmaicbvJ/9WvsJ9/Hi2b30BmLaJfuYLMr3RSs6GLwW7/hA2mE4Vx7lx0Hg+Dzz+H4xMfV+04iaDvwQeRQyGcV16J0CWu/2uqMe0EPceSQ6Ypk+ecH/HT3j4IOdi4qxNOrGD2KQt57f7fUH5yPX29ENyxkr7HLDz83vuccHYR5593Af95/DFeeOEFzj333ISe1wcdH1AzUMNtJ91Gd/cahNBhMy2i/qM/cfzZq/cptwTCUX7+XBUui57PLCtBV6AIeqipCePMmeAogtO/pTheqp+DGauJRmJsX9tK8Vw3VqcRUMotbrcbt/vgAipJEjphZtB3gKuU0LCyUGfmueDe90PoQOxpXSycPbWQrVsW3cLbrW/zg7d/wKPnP4pJa8Jun09p6U3U1v6GEy8/hdf/6uQf9bfCD6pHH/U9ALayvxW2GcqX2QujA21JkpGIoYmGEMNBpIEhpFgfUjSERiuhNRsosOqYM/gmbTf+lUhPD9HubiI9Pcj76fajcTjQuDPRZroxzTkOTWYm2sxMtO5M5We3WxHxzEwkg2GfxxstpQQNEoa+OgTgqXQASq6LmoIuhCBj9Sp67n+AaH8/GodDtWMdDsGaGtp/8lOIxeh//AnyfvITTHOPO/gDj0GmnaAvzVvK0ryl/N8//xerth2zNpOm+j6+99b3+Mzsz1By/Ak0rttC6fnrCC7U89RbA5wzcAUv378di8PAzIKTWf/eu3g8HubPn5+w83q4+mFsehurSlexcf3dOJ1LadpSTTQcnrDc8lp1J0OBCMPBCMFIFP1oo4vwnqmLy66Hjx6GZ78FpadRt3kY/1CYOacqwun3+6mvr+fkk0+e9Hma9VZGDtSVaeO/wN+3336hB2LMutgxOafLnlh0Fm476TaufvFq7txwJ99a/C0ASoq/TG/vW7T2/pDVX/03bdU6unr+y4h/M0XFV2J3zkKrk9CMfmm1Elq9Bo129HfdHt+
1EkIa/8Ea6e0lWFVFYHsVgeoqgturCDU2Es6wIWcqQqwvKUab6d4t0PGfXZloXc7DHjGaTIX4jRK63p1oQKmjW3S07uhj1kl5h7Xvg2FbtZqeP/+FoZdfxnHJJaoe61DpuvP3SEYjOd/7Hl233079ZZfhuuoqsm68AclyePELqca0E/Q4lZ4zWLTtQUqdC9jaPcJzu97lqZqnWGmaR36rD5N+JpWaHfjLY7yi+zM/K7mTjS830bIliFtaxpp/bMGqd1Ix+/AzuLv93bzc+DJXzLwCOdTOyEgtBQVX8dG/12LKsJM/c9Y+j3lig+IGicnQ2DOCN8eOlJExPhddo4PzfgP3rYbXf8nWrRdhdRkoGl12vmPHDmRZPmD9fG9s1gwGejqIxWJIe3vLY1FlMrRgsdJQe4qMWRcnsbhoIpbkLeGyGZfxj23/YHnRchbmLByzMr73/rm09f8/suYsZ7D2d8wu/xYlxQdZ5ToJtC4X2pNOwnLSgec41ETJRdeQ0dcIgJAE+ZUOWnb2q35s45zZ6AoLGXzu+aNS0APbtjH0/PO4r/sKjos/gW3lCjp/8xt6H3iAoZdeIvfHt2I99dQjfZpHDZOaFBVCrBJCVAshdgkhvjPB/UVCiFeFEBuEEJuEEIkLnt4PWZ5yEB0sGO2S8+mSO7h5wc1stbcTEzIfVPcy7NvGZd7z2NK7hQ53DRd97QQu/e5iyuZnYfDl8fwdu3j2jx/R2XB4E6X/2fkfIrEIl1ZeSle3MsHkzDiV2g3rqVi8FEkan/Q3MBLmlapOFhUrLeJqupTFHfqCgn1z0YtPghM+Tf+b/6a5SuloI42OMqurq7FareTnT9zlZiKcTheyiNHT1b/vnVVPK6svT7pBiQQ+BKZqXdybWxbeQr41nx++/UP8EWXJf9zKODi4kZraX5Od/TGKi6495GMcbZhMhQSMGjTDPRBWnD6eSgdDPYFJ9Rk9HJSyy2qG332XyBFYVX0wun53B5LdjuvzSvMajc1G3o9+RPG//okwmWi65lpavvFNIj2Tn4hPZQ4q6ELJHb0LWA3MBq4QQuy9Zvz7wCOyLJ8AXA7cnegT3ZvSbDs9BDguLIiZtbxePcDVc6/mv1c8j7WsgMAWZXHJxtp7MWlN/HHTHwHIKrKx+trjOefGcgLWVuo2dfHoz9bz+K8/pH5TN3Jsaja3aCzKozseZWneUkrsJXR3r8FqnUXHzi7CAf+E5ZbntrQRisa4ZUUlADVdygSarrCQcNMEnYvOvo1twdUIYsxepjhVwuEwO3fuZMaMGfuOtA9AVo4yum+t7xx/hyzD23eAswRmnjfp/e3NoVgX98SsM/OTk39C41Ajd3x4x9jtOTnnUljwORyOJcye9YuELr8/0hgMOWMxugwof39PpfJh35qEUXrG6lUQjTL00suqH2sqjHy4Ad/rr5N59dVobOMXi5kXLKD08f/gvv56hl54gdqPnUv/fx5P2gLCo5XJKMESYJcsy7WyLIeAh4AL99pGRpmBArADhz5EmyQ5GQZaJCjwy8SyjWxp6GcwEEan0bH09PORWvWgyWOJVYM/4mdd+zpuefUW6gfqAfDOLuGsK+fQnfkOWfNiDHb7eebuTTx423tsfbOFSHgSKX/AG81v0D7czmUzLiMc7qO/fz1u91nsfH8tepOZouPm7fOYJza2UOq2sKw8k5wMA7Wjgq4vLFC803slDEYNTqqC51BieB9Lw2MA1NXVEQ6Hp1RuAcgtUDzVHa17edGb3oOW9UrdXjr07PBDsS7uzeLcxVwx8wr+uf2ffNDxwdjtlZU/YOGCB9FoUqsxgxASMfvoVdZoc/J4HV3tBUYAhpkz0RcXM/j80bPISJZlum6/HY3bjevTEzfDlvR6sq7/KqVPPI6+vJy2736Xxi98gZDK602OZiYj6B5gz2Fj8+hte3Ir8GkhRDPwLDD1GbUpIoSg22Anzx8klm0kGpN5rVoRqYrRBhK64VlkiQH+tuJutJKWlxt
f5oInLuCGV25gXfs6TjjhBBYsns+2zrdYeKWDFV+YjUYn8do/q/nbd9fSsOXgovTwjofJNmVzRuEZdPe8DsTIdJ5Jzfr3KF+4ZJ/Gya39ft6r6+XC+fkIISjPso6VXHQFhcjhMJHO8aPn2o1d+AMa5hQ3wYs/gOEeqqqq0Ov1lJaWTul1KyhRRvjd3Xs9t7V3gsmpeM8Pg8mmLh6MmxfcjMfq4Qdv/4CR8Mhh7Ws6IJyjf8f+euV3SZCfJD+6EALbx1Yz8t77RLoPskYhSYy88w4j77+P+9prD9pZyVBeTvE//k7urbcS2LyF2gsupPvePyGHJ15pncokamHRFcD9siwXAB8D/i4mSEcSQlwrhFgvhFjf1TWJ1YoHIWgvxkwbbqsOvVHDS9uUJe22TDe55V7aPooCMXJoH8v8vmzGZWzs3MgXXvgCVzxzBWKmID8/n6f++yTOUg2XfncxF37tBMx2A8/es4mGrfsX9aZBpYnFJZWXoJW0dHevQa/PZqAZAr6hCcstT33UiizDRfOVz8SyLAu1XT5kWUY36nQJ7VV22fpmK7ZMI0VXXA/BQeSXfkh1dTVer3fKfnqT2YAmZmBgsH/3jd27oOoZZVWo/vBcA1NJXTwQZp2Z206+jaahJu7YcMfBHzDN0Tq9xCSQ+3aPLvPjdfQedevooHQyIhZj6KWXVD/WwZBlmc7bf4c2Lw/HJPPghSThvPwyyp55Butpp9H1m99Q98lL8W/erPLZHl1MRtBbgD1DSApGb9uTq4FHAGRZfgcwAvsYo2VZvleW5UWyLC/Kyjr85dQisxytaKfQH8OaZ+HVqk6CEaVcUbF4Gc0bO9Dr8ujseoGrZl2FRmiIyTFevORFfrD0BwyHh/l/a/8fT9meIiIiPPjQgwSDQQpmOLnoayeQmW/luT9spnE/ov7ojkfRCA2f8H6CWCxET88buN1nsmvdu2j1BkqOX7DPY57Y0ML8QgclbkU4y9xWBgMRun2h3YuL9pgY7e8YoaW6j9mn5CPy5sCyryI2/oPM4R1TLrfEMWgt+Eb2mAh+9y7FUbPk8CcaD8e6uDeLcxdz5cwr+ef2f7Kufd1h7+9oxmQqxm/QIPfuzvBPZh3dUOlFX17O4HPPq36sg+F79VUCmzaR9dXrkPRTC0TT5WRTcOcdFPz+TqK9vdRfdjkdP/vZfhd6pRqTEfR1gFcIUSqE0KNMej611zaNwHIAIcQsFEE//CH4QbDkVqIVbeT7YoSyjPiCEd6tVWbqK5YsAwQiUElv79s49SYuKL+AJ3Y9wXB4mEtnXMqTFz3JnWfdSZYzi1edr9Ld082v7vsVrUOtGC06LrhpPs48M8/+YTON28aLejAa5PFdj3NW0VnkWHLo63+faNSH23UWu95/h5LjF6AzGsc9prp9iKr2IS6av9uVUp6t5K/UdvnQ5eWBJBHaw7q49a1WJEns9iOf/m38BjfnsYaK0kOzXFpNNgKR0Tf4cLfiPT/+crBOrbn2RByudXFvblpwE4W2Qn749g9TuvQS96LLvTVjt2XmWzBYtEkru2SsWsXIunWE9yr5JRM5FqPrd3egLy7GftFFh7wf29lnU/bM0zguu5TeB/5GzfnnHxN9VA8q6LIsR4DrgReA7Shulq1CiNuEEBeMbvZ14BohxEfAg8Dn5CRMN7sLKxCiA8+wTHeGFrNew0vb2gHI9BTiyi+ga5sGWQ7R0/M6nz/u84RjYf65XVlKLwmJMwrP4L5V9/H7S35PpCJCtCPKTffdxLde/xYhnZ8LbzoBR64i6k3bd9u64k0sLp2hNCLo7n4ZSTIS7MvC19c7YVTuExtb0EiC847fLehloyP12u5hhE6HLi9vbIQeDceoeqeNkuPdWOyjKwz1Fl7WryKbHkwb/3pIr1tGhoOoCBIMhJRVoZEALEvctIcjN2/SuegHw6wzc9tJt9Hsa+b2D29PyD6PRkymIgJGDWJg98WvkuvipDUJE6Mw6naRZYZePHJll8H
nniNYXY37hhsQhxnPsafFUTKbafrSl2m55etHzTyBGkyqhi7L8rOyLFfKslwuy/L/jN72Q1mWnxr9eZssyyfLsny8LMvzZVl+Uc2TjlOabaePETz+GGgEC8tcvLStg9io9bBiyTLq3mtHq3XS1fUixRnFnF18Ng9XPYwvND7Yf457Dv/zqf+hfEY5c3rn8MH2D3hsx2MYrTouvHk+jmwzz9y9iaYqRdTjTSxOzD0RWZbp7lqDy3UKNes+QNJoKFswvv9mLCbz5IYWTvW6cVt3L//2OEwYtBI1naMTo3tYF2s2dhLw7V4ZCtDV1cUHQ1n05pyshHf1TX1G3+1WUhpbaxqU3JbK1ZBVOeX97A9nXj59HYduXdybRbmL+PSsT/Ng1YMpW3qJj9Cl4LDSkHuUfK+Dwe4AQ73q5qMDGCoqMHi9R6yTkRyJ0H3HnRgqK5XgsARhXrCA0v/8B/cN1zP00kvUnHse/Y/9JyUtjtMubXFPsm0GWgXk+5U2a5VlTjoGg2xuUf4hvIuXIUdldNE5dPe8RiwW5OrjrmYoPMS/d/x7n/0JIbjs4svIzs5maddSdrXtAsBk1XPh1+bjyDbx7F2bWLvuIz7q+ohLZ1yKEALfcDWBYCvuTKXcUnTc8Rgt46Ns19X30joQGJsMjSNJglK3hdru3dbFULMyQt/6RisZbiOFM11j28ezz7Xn/Z+y+Oe5b0/9dctT5i+iG/4JIz2HtMz/QDhyPUSCh2dd3JsbF9xIka2IX677ZcL2eTSh0ZiJWB3KL3t8SHtmKLcla5RuW70K/wcfEO6YRGZ+ghl44glCDQ1k3XQjYgprKyaDpNeT9VXF4mioqKDte9+j8fOpZ3Gc1oIuhKDbaMEzonS4d+Vb0UiCF0fLLjllFVhdmfTuNBKN+ujte4c57jmcmHcif9v2N0LRfWNU9Xo9l112GRpZQ3BLcOxT3GTVc+HNJ5CRZeLD+zspGZrNBeVKxam7S1mQIQIV9He0TehueWJjKyadhhWzc/a5rzzLSu0e1sVoTw89td207uxXJkP3yB+prq4mPz+fjMLZSjOMHc8pDpUp4CnKQSCT2/AQ5C9QVqMmkERZF/fEpDWxqnQVO/p2TPh3SwVkx6j3oK9+7LbMfCsGc3Lq6DDqdgGGXnghKceLEwuF6Lr7bozz5mE96yzVjmMoL6f4739TLI5bRi2Of7w3ZSyO01rQAQL2IrLDnRhj0EmME0tdvLhVGV0ISVJa063tRCOZ6epSKkFfOO4LdPm7eLr26Qn3mZmZia5Sh3XAOhZPC2Cy6Vn+VS8Dhm7O2f5FfA3KlUF39xoyMuZT/2EVCEH5ovE5KMFIlGc3t3HOnBwshn3rguVZFhp7R8aFdG15uXZ0MnR3uWVoaIjm5ubdUblLvwLZc5TwruDke0Nm5TmplOuwhtsOa5n//kiUdXFvKhwVxOQYdQN1Cd3v0YJweZUf+nePGnf70ZMzQjeUlWKYOTPpnYz6H36ESGubMjpXeRXwOIvj6afT9dvfUnfJJ/Fv2qTqcZPBtBd0kVmGTrThCUZp9IdYMTuHnZ0+6kZLGBWLlxH2hzFIc+nqehlZjrIsbxmzXLO4b8t9RGMTrwitPL6SAd0ATz/79Lgm0y93Ps+Ts+7E4tLz9O8/on5rHYNDm8hyL2fn++/gmTEbi8M5bl+vVXcx4A9z4Ql7r8dSKMuyjoV06QoLiUpadmwZonR+FuaM3bat+IfLmF1Ro4PzfguDzfD6zyf9mkkaiVPEhwzghFkXHPwBUySR1sU9KXeUA1DTX3OQLacnensFYY1A7qsdd7un0pm0OjpAxqpV+DduJNyq+oJvAGIjI3T/8Y+YlyxJakiaLiebgjt+R8Fdvyfa30/9ZZfT+ZvfJu34ajDtBd2ct9u62OAPjpU04m6XglnHYbRYGWywEw73MDCwASEEX5j7BeoH63m16dUJ91vhqmBj5kZ8gz7eeustQFnw8HD1w5TnFXPpN5Zicxl5/p4aRro
q0MvH0d1YP2G55cmNLWRa9JxaMXFmeVmW4nSp6RpGV1BAl/sEQiHBnNPGh25VVVXhdDrJzt7DXlh0Iiz4LLxzN7TvLxd8L5rep5Am3mcJaBIfuJlo62KckowSNEJDzUBqCrrZVETAKBHr2Tnu9vzRfPRk+NFh1O0CDL6QFG8Dvf/8J9HubrJuvvmIZPTYli+n7Jmnsa06h5577z0i8weJYtoLepanAol2PCMyjYEQHoeJ2XkZY2UXjVZL2cIl1L7VjRC6sbLLiqIVFNoK+cvmv0w4211uL6fb1I2lyMLbb79NT08P6zvWUztQy6UzLsViN3Dh105Ab/HR/ObNbHtdmQD0Llk2bj+DgTAvb+/kvHl5aDUTv9xlWcoEak2XD43DQWvhaVg0fgoqd4/0g8EgdXV1zJw5c983/dm3gskBz9wCsdjBX7S1dxIUZtbJM4hNZvtDIJHWxTh6jZ5CW2HKjtCNJiVGV96jhg7g9sTr6Mkpu+iLizHOnp0Ut0t0aIieP/8Fy+mnYV5wgurH2x8aqxXnFVcAENy564idx+Ey7QW9JMdJvxjBMxLDF43RG46yck4OHzT20e1TOsxULFnGSL8fk+44OrteRJZlNJKGz835HFt6tkxohXMYHbiMLoZKhtBqtTz77LM8XPUwGfoMVpUqIxijNUbB6T/HaItS/a6FzKLFZGSNX5zz/JZ2QpEYF+2n3AJgNWjHQrr62kbot5VRFNkxbjJ0165dRKPRiVeHml2w8qdKwNaGvx/4Beuthe3/pc6+gpDQ0N+jTo/VRFsX41Q4KlJW0E2mQvwmDZrBdiX9cpRk5rrEsa1eRWDTJkLNey8KTyy9991PbGCA7JtuUvU4k8HgVeYwgjt3HmTLo5dpL+hZNgNtIqZ40YGGQJCVs3ORZVizXRmll8w7Aa3ewEhrFoFAEz6fYv27sOJCMo2Z/GXLXybcd7mjnBp/DWeeeSZb67fycuPLXFhxISat0hast/dtNIYuTrlSIhYdwj9yMu21A+P28cSGFoozzcwvdBzweZS5lZCurW+1IIiR2/L2uPurqqowm80UFhZOvIPjr4DiU+ClHyqrP/fHO3eDpKW/TAnhamlQ5/LSkZNPJBhkuC+xGdtljjIahxpT0uli0GcTNOkR0TD4xv9dPJVOBrv8yaujrx51u6iYwBjp7aX3/vuxnXMOxtl7J3InH63TicbtJrgrLehHDCEE3SYTnhFlRNPoDzErz4bHYRoL69IZjJQcfwJ1a/sBMVZ2MWgMfHr2p1nbupbtPdv32Xe5vZyagRoWL15MV24XUTnKx0t3N9Pt7l6DRmNlqCVIaOhRzHYD/71jI+11iqi3DwR4p7aHC+d7DlobLM+20NDpo/qddjyWfqTGncij5ZBoNMqOHTuorKzcf/a5EEp3o9Cwksg4ESO9sOEfMO8yMsuUkX57izoJDY68UadLgssuqex0EUIimqGkYe69YCzf6wCSV0fXFxRgnDtX1WyXnj/9mVggQNaNqoezThpDRUW65HKk8Wd4KPArNewGfwghBCvn5PDGzm6Gg4pHvWLxMgbahjAb5tDVvXuy59IZl2LRWbhvy3377LfcUc5weJh2fzt1tjqy/dk0bFT+0WQ5RnfPK2Rmns6u99fhyndy8TcXY7Tp+e/vNtJRN8h/x5IVD95RqMxtJW9IJuiPMKMc5FCIyGgiZX19PcFg8OBhXFkzFBviR/+C+rf2vX/dXyDih5OuJ79YmTzeJ0Y3QahlXSyzlwFQO1B7kC2nKY7RfJ7+8YKeWaDU0ZO1wAiUUXpg61ZCjY0J33e4o5O+f/0L+/nnYygvT/j+DxWD10tw166xwdR0IyUEXWSWY5NbyQzHaAwodfMVs3MIRWK8uVMRxbKFSxCSRLA7H5+vCr9feZNm6DO4dMalvNDwAk2D42Nr4za5p3Y9RVewi7NcZ7F27Vq6uroYHNxEKNRNhmUZzdu24F1yElankYu+dgJGq46n7tjIq+80cXyBfWz
S80CUZ1s5PqjB4DTgOU4R23gEQFVVFVqtlrKysoO/GKd9ExxF8PQtENmjLBEOwPt/hIoVkD0Li82EFNMz0N9/8H0eAmpZF0vtpWiEhl3903cUdSA0mcoag70nRiVJkFeR3Dp6xjkrAVQZpXff8wfkWAz3DdcnfN+Hg8FbgTwyQrg1se/bZJESgh5PXfSMxGjwKyK2pMSF3aTjxdGyi8lqo3D2XBrfUxL7Ort2j9Lj0boPbHtg3H7jo8Fn654l25zNV1d9FZ1Ox7PPPktX9xqE0NBfZ0SWY2PuFpvLyEW3LEBj1LCwPsL5hRNbFffGHRUURDVIFVYMRUqdPNTUjCzLVFdXU15ejn4yUaJ6M3zs19BdDe/cufv2TQ/DcNe4Zf5GrQWff2hS5zdV1LIuprzTxVZOUC8h9+zY5z5PpYOBLj++vuTU0XUeD6bjj2fw+cQKeqi5mf5H/43jkovRFxQkdN+Hi6EiPjG67+s/HUgJQXcXepFEO55hmQa/MkLXaiSWz8rmlapOIlHl8qliyTK6anowGSro6tq9tDnLnMUF5Rfw+M7H6fbvnlDMNGWSoc+gfrCeS7yX4MhwsHz5curq6mhpfga7fRG16zaRkZVNdunuy0aby0jvEjtBIRN7rZOuxoOLZs9HPUSRaXdq0OXngxCEm5poa2tjcHBwatnnlSuVBUOv/1JZRh6LwTu/h9x5UHra2GYWo41AePIrTKeKGtZFGJ2sTlFBNxmVkK5Y775XIPF89KSO0j+2muD27QTrEjdn0f37uxAaDe4vfyVh+0wUBm8FMH2tiykh6MU5Tobw4fHHaAmGCY+mLa6cnUP/SJh19UrdMd6aLjpQwsDABoLB3ROCn5vzOcKxMP/a/q9x+447Wi6uvBiARYsWUVBgIhJtwGZZRsOmDXiXLBs36SnLMk/s6KT2OAsGk4Ynb99wQFEPh6JUv9dBW4bErkE/Qq9Hm5dLqLmJqqoqhBBUVk4xDXHVz0HSwrPfhJ0vQPcOOOnGccv8MzIcRAgQCqmTY6GWdbHcUU7TUFNKOl2U1EUNYmDfZuGZBVb0puTW0W3nnAPAUIJG6cGaGgaeegrnlVeiyzn8/P1Eo7HZ0OblTVunS0oIepbVQLsUwzMSIwa0BJV/9NMqszBopbGwrnhrutYNEUCmq3t37nOJvYSzi8/moaqHxqJ1A5EAfYE+tEJLlklJKJQkiSUnKo0rPno/QDQSoWKv1aEfNPTR3OfnY0sL+fgtC9AZNDz5uw10N08s6rvWdxLyR4gUW8ZCuvQFhYSbmqmqqqKoqAiLZYqt4eweOPO7sPNF+O9NkFEAcy4at0lmpgsEtDao09BALetihaOCqBxNSaeLyVRIwCgh+XogOv6DVor70ZPkdAHQ5eZiWrAgYXX0rjvuRDIaybz2moTsTw2ms9MlJQRdSV00kO9XRoLxOrpZr+WUCjcvbu0YGyVWLF5G86Z2DPqCMftinL2jdV9seJFQLEREjtDl3z2aj0Y+IBbNZmvNMPrMbPIrx5dDHt/QglEnsXJOLhluExfdsgCdXsOTv91Id/O+JY5tb7XgzDWTW5FBU5+fYCSKrrCA3u4uOjs7D7nVHEu+BDlzFU/z0q8o2S97kJ2r1PfbmlQSdJWsi6nsdNFozIQtDoQsw0DzPvd7Kh0MdPrx9QWTdk4Zq1cT3LGDYM3hlbkC27Yx9MILuD73WbRO58EfcIQweL2EamqQoxPnPB3NpISgA4xk5FIwOsEXd7oArJyTQ0u/n+1tyn1jren8lfT1vUsksnvUHI/W/fu2vxOKhni4+mFyzYovOF6zjUSG6Ot/j/z8VYhohHBBOXv2ww5FYjyzuY0Vs3OxjiYr2rNMXHTLCWh0Ek/evoGelt2i3tPio712kNmn5FOebSMak2nsGUFfWEijSel2PpauOFU0Wvj4PTD/07Dws/vcHbcudnWo08HFqUKMLqS+00V2jK4q3svpAnv2GU1i2WXlShDisEfpnb/7HZLdjuv
zn0/QmamDwetFDoVUsWuqTcoIOu5y8oIt6GLy2AgdYPmsHIRgrOwSb03XuVVClsN0d48P5/rCcV+g09/Jr9b9ik1dm8Zq5/HRYE/PG8hyBHmoFH1nM4OhMJv36Cz+xo4u+kfCfPyE8d5ze5ZZEXWN4Inf7hb1rW+0oNFKzFyWt1dIVyEtBR6yHA5cLheHTO5xcNFdYLDtc1eOJxNkQW9vYksicWzuLDRabcJH6KnudBHO0Qn2/n2bL8Tr6MmcGNXlZGNetIjBw1g1OvLhBoZff4PML16Nxrbve/FowlARnxidfnX0lBF0c64Xg2gjPxAbJ+huq4GFRc6xsC6It6ZrRadz71N2iUfrPlT9EEaNkStmXkGGPmNMPLq716DTuWj+sAtr2E9eXh4vvvgigYBiJXtiYwsui55TvVn7nKMj26xYGjWCJ2/fQEf9INXvtVO+MAujRUepOy7oPiLZWXS73ZRnZCT8tYqj0UjoMDM4pE6ei1rWRUhtp4vWNZOYALl335KSJAnyK+xJC+qKY1u9itCuGgI7pm7nk2WZrttvR+N24/rUp1Q4u8RiKC8DIdKCfiTJKpyBRrSTPxIbsy7GWTknh21tgzT3KR507+JlyDEZbWQ2Pb2vE43u9vXGo3UBVpeuxm6wj4lHLBahu+c1XK7TqVm/jooFSzjvvPPw+Xy89tprDAXCvLStg3Pn5qHbT7KiI0cRdSEJ/vPLDwgFosw5VbnEthl1YyFd9aEQsiRRonIdz6S3MhJUx4sO6loXU9bpYikmYJCI9lRPeH9+pZOBTj/D/Umso69cCZJ0SG6XkXfeYeT993F/6UtIZrMKZ5dYJLMZXWEhwV3Tr6SXMoJenOPCJ4bwjMj7CPqK2UodPJ7tklPuVVrT7TASjY7Q2zc+CGtF0QpuPOFGrpt/HaCIx67+XfT3rycSGUCMVBAY9lFx4kl4PB4WLlzIe++9xyNrdxCMxLjohAMv9XfkmMdWlGZ6rOSV28fuK3Nbqe32sbO5GbPfj71DnQnLODZLBsHYsGoNc9VMXUxVp4vRVETAqEHum/i5eUbz0VuSWEfXut2Ylyxh8NnnpvS3lGWZztt/hzY/D8dll6p4holFcbqkR+hHDLdVT6eI4PHHGIjGGAhHxu4rdVvwZlvHBF0IMdqarh2NxrZP2UUjabhm3jXkWpQPgnJ7OYOhQZo7nkEIPW2bAmgNBkrmKfnNy5cvx2g08s+3qil0mlhQdPAZfGeuhStvPZELvzZ/nIe9PNtCTaePnTt3UejzEWne1+mQSJwOJ7KI0t+jzihdtdTFFHa6mE1F+I0S0sDE0bXuQht6oyapdXRQOhmF6usJVk985TARvldfJbBpE1nXXYc0mZXORwkGr5dQfQNyaHpdAaaMoCvWRf3u1MXA+D/Eyjk5vFfXS/+IcnvF4mWEA2EMYi7d3WuIxSL77DNOmaMMkOnufgWncym73v+A0vkL0RkUP7rZbGbhKWdRN6JnaZ520l1XDGYdJuv4N3mZ28pgIIIvAqV6PaHmfReYJBJ3diYArY0qxeiqZF0stZciCSklnS56fRYBkwFNwDdhr1hJEuR5HbQmWdBtK1eARjNpt4sci9F1++/QFxdjv+gidU8uwRi8XohECNbXH+lTmRIpI+gAvowsPH6lHr7nxCgoZZdoTOaVKqWEEW9NN1BvIxzuY2Bg/X73W24vJ1srI4fb0UXnMNzft0+rubqoCxmBrnUDfr//kJ9D3Oni19ooys4hPJrnoha5HmXyVq0YXbWsi3qNniJbUUpOjCoxuqOrKPsnts55vE76O0YYHkheHV3rcmE58UQGn5tc2WXw2ecI7tiB+8YbENrEtzpUk90RANOr7JJSgi7cZRQFFOFo2GuEPs9jJyfDMFZ22d2argsh9OPCuvYm25zNQqvyhuyq1iBptJQtWDxum6c+amNGlglDsJ9XX524T+lkKHMrk0aGrGKMhYXIweBYjK4aeEqUslJ3lzoxujZ3FpIm8dZ
FSG2nC44i5fsE1kUAzwwHQPJH6atXEW5sJLBt2wG3kyMRuu+8E8OMGWPNMqYT+tJS0GjSgn4kMedW4oy2YA/v63SRJMGK2Tm8vqOLQFhxjlQsWYZ/wI9ZdzzdXS/td9QhhOB4i6A3ZqHmnS0Uzz0eg3n3UvyaLh+bWwb45JISFi1axLp162hrO7QRaXSoGw0xIuZM9IVKEl1YxTq6zW5Giuno71dngk2SNNhzctNOlykiZSrZPXLvxBOj7gLraB09yfbFs88GrfagbpeBJ54g1NBA1k03IvbXlOUoRtLr0ZeUTDuny/R7pQ+Au3AG2lHrYqN/33/yFbNzGQlFWVujrIyMt6Ybbs0kEGxlaGjLhPsNhXrIkXzs6NMw0NmxT3bLkxtakARccHw+Z511FiaTiWeeeeaQGjDvqK4mQwTpi+jRjbabU1PQAQwaC74R9ayLztw8dbzo9nKicpT6wfqE7/tIo3fMICJBrHdi37ekkchLcp9RUNq0WZYtO6DbJRYK0XX33RjnzcN65plJPb9EMh2dLikl6MW5mfjEIAUjMg0j+9YWl5VlYjNoxxYZKa3pFlD7dh8CzT5ulzg9Pa8hkOmq0SsOmUUnjt0nyzJPbGzlpHI32RlGTCYTK1eupLm5mY8++mhK5y/LMlVVVXhsGup7RtB5FH96qEndiVGL0YZfxRhdNVMXgZQsu5jMxQSMmv0KOiht6ZJdRwfF7RJuaSGwZeIBUP/DjxBpbSP75psmbRA4GjF4vYQbm4gdxpxYskkpQc+06OkWYTz+GM3BENG9BESvlTh9RhYvb+8gGouHdS1lsH0Qs+G4cemLe9LV/QponVh3ZpBRXoTZ7hi778PGfhp7R7joBM/YbfPmzaOwsJCXXnqJkZGRSZ9/Z2cnfX19zMh30tTnJ6zRos1RJkbVRInR9RMO79/pczioZV1MZaeLkouugb7954nsznXpT9JZKdjOXg46HYPP7hsFEBsZofuPf8S8ZAnmZcuSel6JxuD1giwTrJ0+1tiUEnQhBL0GHR6/TBhoC+6b871yTi7dvhAbm5TaY7w1XaArj+HhnQwPj//jRaNBenvfwGJcjNNngL2W9D+5sQWDVuKcOTljt0mSxLnnnovf7+eVV16Z9PlXVVUBsGhGwVhIl66wQHXrYjxGt61pejWMjjtdavunzz/cZInH6GoGO2A/VzZZhVZ0R8CPrrHbsZ50EoPPP7/PVVfvP/5JtLubrJtvntajc5ieTpeUEnSAIbuT/BFlpLn3xCjAGTOy0GnEWNlld2u6YYB9yi79/e8SjY4Q7VTsd50Fu9/A4WiMpze1cfbsHGzG8dG0ubm5LFmyhPXr19PSMvECkb2prq6moKCAOaNt62q6hsdy0dUkO1fxoqsVo6uWdRF2r+JNNTQaEyGrHSkSgpGJHUiSRiK/wpHUhhdxbKtXEWlrI7BHWTE6OEjPX/6C9fTTMS84IennlGj0RUUInS71BF0IsUoIUS2E2CWE+M5+trlUCLFNCLFVCPGvibZJBnJmGUUBZaS59+IigAyjjqVlmby4bY+M9CXL6KrtxmScQVf3eEHv6n4FSTJR/243w5kSNdHd4vrmzi56h0NcNN/DRJx55plYrdZJTZAODAzQ2trKjBkzxkK6art96AoLiHR0EAuqVyfNK1SuLjrb1YnRVdu6mKpOl5hd+SCkb2LrIkB+pYO+9uTX0W3LlyN0Ogaf21126b3/fmIDA2TdfFNSz0UthFaLvrx8WjldDiroQggNcBewGpgNXCGEmL3XNl7g/wEny7I8B7g58ac6Ocx5lXiCTWhkeUKnCyhll7ruYWpGuwNVLB5tTddfzODgRwSCStSuLMt0d79MhnUx7TtrkWbkjJuAe2JDKw6zjtMr901WBDAajaxcuZLW1lY2bNhwwPOuHl1OPXPmzLGQrprOYfRxp8skR/mHQm7BaIxujzoxuqpaF1PY6SKcSrwB/fX73cbjPTJ1dI3NhuXUUxl8/gXkWIx
Iby+99z+AbdUqjLNmJfVc1GS6OV0mM0JfAuySZblWluUQ8BBw4V7bXAPcJctyH4Asy+omSh0Ad9EMTLSR44/ts7gozopZyoj0hdGyi83lJreikpYPlZp7V5cyOerzbSMYbCfUo9SA8+bPoyfQQ3+gn+FgZCxZUa/d/8s4d+5ciouLefnllxkeHt7vdlVVVWRmZpKVpXw4xEO6dAWjgq6i00Wr06LDxODQgGrHUM26mMJOF02mIoyxCWJ042QVWdEZNElfYARKJ6NIRwf+DRvo+dOfiQUCZN14Q9LPQ00MXi+R1jaiPvVcYIlkMoLuAfZUk+bR2/akEqgUQrwthHhXCLFqoh0JIa4VQqwXQqzvUmn1Y1GOmxH6KfDLNAwHJtwm127k+AL72KpRUBpIt2xux2goHqujd3W/AgiaPhgms6AIb/nxgBII9eK2dvzh6Dh3y0QIITj33HMJBAKsWbNmwm38fj/19fXjWs2VZVmo7RpGVxC3LqpbRzfprAwH1IzRVce6mMpOF2NGBSGdINqzfb/bSBqJvApH0hcYAVjPPBOh19P7wN/o+9e/sF9wAYaysqSfh5oYvF4AQtOk7JKoSVEt4AXOAK4A/iSEcOy9kSzL98qyvEiW5UXxkWiiybTo6RFh8v2xffJc9mTlnFw2NvXTMaiIvtKaDhjx0t//HuFwP93dL2O1zKV5Uy3eE0+iwqHMeu/q38XjG1rxOEwsnESyYnZ2NkuXLuXDDz+keYJFQrt27SIWi40T9PIsKwP+MP2mDITRqOoIHcBqySAU2/8VxOHizFXHupjaTpci/EYNcu+BxcQzWkcfGUzuPILGasF6+mkMvfgiciyG+/qvJvX4ySDudAlMk7LLZAS9BSjc4/eC0dv2pBl4SpblsCzLdcAOFIFPOkII+oxaPCMy3dEow/tpELFitlJ2iY/S463pOrYIZDlKS8uDDA1tQfaVIssxvEtOIteSi0lrYktHI2/t7OKiE/KRpMlZs8444wxsNtuEE6RVVVVYLBY8nt2j/XhIV133CPrCAkIqrxZ1OBzERISBPpVidFWyLkLqOl3i1sX9xejGyY/nox+BUXo8p8X5yUvQFxQk/fhqo/N4ECbTtKmjT0bQ1wFeIUSpEEIPXA48tdc2T6CMzhFCuFFKMEdsyDSUYcfjV0RzfxOj3mwrJZnm8WWXJcuof78FvS6buvrfA9C+OYo9O4esYuXSvsxexns7w8Rk9utumQiDwcA555xDW1sbH3zwwdjtkUiEnTt3MmPGDKQ9Mi/Ks6yAkhOjKyhUfYTuzlKsiy0N6sToqm1dTEWni16fTcCkR+Prhdj+O1dlFdmOWB3devbZZN18M+4bUqt2HkdIEoaKitQpuciyHAGuB14AtgOPyLK8VQhxmxDigtHNXgB6hBDbgFeBb8qyrE583ySIZZVQOKKMViayLoIykl8xO4e1Nd0MBZTJ0D1b08ViAYyGAurer6diyUljiyTKHeXUt2QxOy8Db87Umt3OmTOH0tJS1qxZg290kqWuro5QKDSu3ALgcZgwaCVqu3yji4vUjtFV4lo7VIrRVdW6mKJOFyEEUVs2IhaDwf2/bhqNRF6FnZYkO11ACbFyf/lLaJ0HLz1OVwwVFSlVckGW5WdlWa6UZblcluX/Gb3th7IsPzX6syzL8i2yLM+WZXmuLMsPqXnSB8Oc46U4oPwDTLS4KM7KObmEozKv71BELKfcizXTTc8OAwCa8Cxi0ei47HOHVElwJI9VczOnfF5CCD72sY8RCoV4+eWXAaXcotPpKC0tHbetJAlK3crEqL6gEHlkhGivOrZCAE+JIuhdnep8Dqudugip6XSRHaNljP3E6MbxVDrpaxtOeh39WMDg9RLt6ibSl/yS1lRJuZWiAJlFM8mMNmKJHHhidEGRk0yLfmzVqBK8tZSat9spKvgSXVucWBxO8r0zxh7T0pYHxJhTfGiBPVlZWSxbtoyNGzfS0NBAdXU1Xq8XnU63z7blWVa
l5BKP0VWx7GJ32kZjdPtVO4Za1sUSewmSkFJS0IVLmZTbX4xunHgdPdl+9GOB6eR0SUlBL8rNIki/0jB6gtTFOBpJsHxWNq9WdRKKKDX3isVLiQRDRDqXUPv+DioWLxvLc5ZlmfW7BBpzLf3R+kM+v9NOO42MjAweeeQRfD4fM2bMmHC7siwLTX1+yFcEXW3rol5jZmhEPS+6WtZFg8aQst2LtJmzkYFYz4H7eGYV2dAakp+PfiwwnZwuKSnoLouePimExx+jcfjAS6JXzs5lKBjhvTql1BBvTffGP+8jEgyOK7d81DxAc18Ik3MrNQOHLh4Gg4FVq1YxPDyMEILKysoJtyvLshCNybSZlPpkWOWQLovBRiA0/ayLkMJOF2sZAYNEtKfqgNtpNBL55fakB3UdC2hzcpBstmnhdElJQRdC0KfXKCP0UPiAI8JTvG5MOs1Y2SXems7X24PRYqVg9nFj2z6xoQW9VqKyYPiwR4OzZs1i5syZzJ49G5PJNOE2cadL7WAEbVaW6iP0jAw7YfxEIvt3VBwOjlGnixoTo2X2spR0uphMRQSM0kFr6DCa65KuoyccIQQGr5fQzqN/wJCSgg4waM/A448RRKYztP+cb6NOw2mVbl7aK6wLoHzRiWhGm9tGojGe3tTK2bOymeE+/Mt7IQSXX345n/zkJ/e7zfiQLvWtiy6XC4RMR7M6IV3OUS+6GtbFCkdFSjpdTMYC/EYN0mD7Qbc9UvnoxwLxTBc1nWaJIGUFPeouwONXmkscyOkCStmlfTDA5halflxy/AIqFi9l/jnnjW3z1q5uun0hLpzvodxRTsdIB76QuvkONqOObJtBcbokYXFRdq4S29vapI4XXe3URUg9p4tGYyJsyUA7MgjhA0/EZxUrdfQjEaeb6hi8XqIDA6o2bE8EKSvo5rxKiv2KcOzPix7nrJnZaKTdGek6vYELv/F9cst3L3Z9cmMrGUYtZ8zIosyu5FXUDqi/dmrM6VJQSKS9nVhIvcvpsRjdNnVG6GpaF1PZ6RK15yo/9B/4Ck2jkcgrPzJ+9FRnujhdUlbQM4tmUhhsQsjyAa2LAE6LnsUlznGrRvdkJBThha3tnDsvH4NWM5bpkgzxiId0aQsKQJZVjdHNK3KPxuiqN8JTy7qYyk4XnMXK90nU0T2VDnpbh/EPpevoiWS6dC9KWUEvys1ByN1kBeSDllwAVszOpbpjiPrufV0eL23rYCQU5aL5Sg3YY/Wgl/RJEY94SJcvWzl2WMWyi06nRYuRgaF+1Y6hlnURUtfpsjtG9+DPLV1HVwdtZiYal+uoty6mrKA7zTr6RIACf4wG38Qxunuycq+wrj15YkML+XYji0tcAGgkDaX20sOyLk6WeEhXk1FZmRpSeWLUpLMy7FcvRldN62KqOl30zllEBUS7tx1026xiG1q9lLYvqsB0cLqkrKALIeg3SOT7D15yASh0mZmVl7GPoPf4gryxs5sL5nvGJSuWOcqSEtkaty42RLQIg0H1/qJWcwZBFWN0HSqGdKWs08VcTMCoIdZ78NGh5gjmo6c608HpkrKCDjCUYcPjj9ERixKIHrinJyiRuusbeunx7S7RPL2pjWhM5uN7NbKocFTQOtzKSHgk4ee9J/nxkK7uYXQFBaovLlJidMMMDagj6k6VY3Qh9ZwuSi66hDjIpGicfO9oHd2XWlcqRxqD10tsZIRIa+Lfu4kipQU97M6nYCSIDDQHD/7mXjk7h5gMa6p2d9B7YmMLM3NtzMgdn6xYblfEQ22ni2ZcSFeB6ouL3G51Y3RtmepZF1PV6aLXZykxukOTs8yl6+jqMDYxehQ7XVJa0C35Xgr9yj/BZMouc/Iz8DhMY/bFhp5hNjT2T9hmrsyhWBeT5nTpHh5bXKTmJV9OvtJJqr1ZHb+tpFHPupiqThclRteNJhQEf/9Bt89O19FVwVBx9DtdUlrQXYUzKQkoI9rJOF3iGelv7uxiJBThyY2tCAEXHJ+/z7a
FtkJ0ki4pE6PlWVYae0cQBQXEhoeJqpiIWFCiTA53darjRQf1rIugTIymotNFdowOKiZhXdRoFT96eoFRYtHY7WhzctKCfqQoyssjI9yGITq5iVFQyi7BSIw3dnTzxIYWTix1ke/YN2tFK2kpsZckbYQejcm0u5R/alVjdDNtiJhW1Rhdta2Lqeh0EU6lxHewGN04+V4nPS3pOnqiUSZGj94BQ0oLutOiZ1D4ldTFSVgXARaXusgwarljzU5qu4cP2Gau3F6eNC86QKNRsU2qaV0UQmCQzAwND6p2DDWti6nqdNG4ZwMQ7dk+qe096Xx0VTB4vQRrapD306v4SJPSgg4woBdK6uJBYnTj6DQSy2flsK1tEL1GYvXcvP1uW+Yoo9WnvtMlHtLViHKloLZ10Wy04Q+r50VX07oYd7okw1KaTIyOmYS14qAxunGySzLQ6qQj0mc0lTF4vcjBoOpBeYdKygv6YIZVGaGHDxyjuyfxRUZnzczGbtq3k1Cccns5MrLqo8F4SFfdQAiN201IZetihtVOWPYTVSlGV03rYtzpkmp1dKOpEL9Rgt7JfVBptBK55fa0Hz3BHO1Ol5QX9HBWDp6RCD5kesOTE6jTZ2SxtMzF1aeWHnC7ZGa6xEO69AUFqo/QXS6nEqPbqk4P07h1Me10mTwmYyEBowYxMPnXzDNDqaOn89ETh6FcuQI8WidGU17QzXleCgLKKKUhMLmyi1mv5aFrl40t9d8fhRmFaIU2uSFdSchFz4rH6Daq40WPWxfVGKGD4nRJhvsomWg0RkIWG1pfH8QOvkgOoHCW8v5trlKvufixhmSxoCsoSAv6kSKzaNbuGN1JOl0mi07SUZxRnKRMFyWkayS/iHB7O7KaMboF2QB0tquX/aymdbHcUU7jYGPKOV2iGTlIsSj4JvdBm1Vkw2DW0rQ9LeiJ5Gh2uqS8oBfm55HnVyJnD5aLfigkL9NFmRhtdhVALEa4LfETinHyi7NBhp5u9YRATetiqjpdcBQp3yfhRQeQJEHBTCdN2/uO6vyR6YbB6yVYV6fqoOpQSXlBd5j1hPGRGYxRP3Tgji+HQoWjgqahJgKRydkiD5W4dbHZFLcuqldH1xt0Sozu4IBqx1C7YTSkntNFylSaicd6Jj86LJzlYrg/SF+7uk6sYwlDpRciEUINk/tgTSYpL+gAA3oZz4hMvW9yNfSpUOYoS4rTJd9hQq+VaBBmANVDuoxaKyOB6WldTFWnizZTaVge6d486cfE6+jpskviGIsAOAqdLseEoA/ZzOT7YzROclJ0KsRDutSeGNVIgjK3hbrhGEKnUz0X3WrOIBBVL0ZXTetiyjpdMioI6CVivTsm/ZgMtwl7likt6AlEX1YGknRUToweE4IeysrC44/RGosSjiW2llicUYxGaJLmdKnrHlFidFW2LjrsDmIixLAv8WUqUNe6CKnpdDEZiwgYJehrnNLjCme5aNnRTzQyOXdMmgMjGQzoi4vTgn6kMOdXUDAyREwIWicRozsV9Bo9RRnJGQ3GQ7ooLFS1FR1AZpZyqd5aPz2ti6nodNHr3UqM7mDnwTfeg8JZLiLBKB116s2JHGscrU6XY0LQnUWzKQ6MRuIm2LoIStlF7Vx02B3S1ZlfQUhlQY/H6LapFKMLo9ZFFWrooAh6qjldhBBErE60I0MQmfz72DPDgRDQtD29ajRRGLxeQo2NxALqmiGmyjEh6EX5HvJHrYuTXVw0FcocZTQOqT8aLHMrTpcWl4fY4CDRAfVGXAXF6sfoOnLz6W9Xz7oIqed0idnzEcgwMPk5FINZR05pBo3b0nX0RGGo9EIsRqj26Hp/HROCbjfrMIZ70cbUcbqU28uJyTHVR4NjDaPN6lsXHe4MhKyhr1+9UZ0jN49wMKCKdTFVnS7CNRqj2zc1y1zBLBddDYMEhsNqnNYxx9HqdJmUoAshVgkhqoUQu4QQ3znAdhcLIWQhxKLEnWJiGNbFlIbRg4mf5EtWL8t
4SFcj6lsXJUlCLywM+dSN0QV1rIsGjYFCW2HKOV00mTMBiPZsndLjCme5kGVoqU6XXRKBvrgYdLqjbmL0oIIuhNAAdwGrgdnAFUKI2RNsZwNuAt5L9EkmAp/NpKQujiS+5pXMXpZlWRbqAwJQNxcdwGyw4g+p50VX07oIo3n1KeZ00WfOJSYg0r1tSo/LKc1AZ9Sk7YsJQuh0GEpLCe6YZoIOLAF2ybJcK8tyCHgIuHCC7X4C/AI4umYJRgm53XhGYjREEn/JGR8NJmNitDzLSm2PH8npVN26mGGzE5L9xCYZBjVV1LYupqLTxWQpIWCQkHun9kGl0Uh4Kp1pQU8ghoqKaVly8QB7DgWbR28bQwixACiUZfmZA+1ICHGtEGK9EGJ9V5d67omJMBSU4fH7GRCCgXAk4fsvs5claYQ+GtJVXKH6alGn0wkiRlebOiKQDOtiVI7SMHj0LdE+VIzGAvxGDVL/1D/MC2e5GOwOMNCVjgFIBIZKL+GWFqI+9RbgTZXDnhQVQkjAb4CvH2xbWZbvlWV5kSzLi7Kysg730FPCVTiLIr/i2FAjpKvCUUHjYCPhqLqTTvGQrlZPhaqTogBZOUqMbkvj1HzPU0FN62Iy8+qThRKja0Xj65nyYwtnOYG0fTFRGLxeAEI1R88ofTKC3gIU7vF7wehtcWzAccBrQoh6YCnw1NE2MVroKcATaAfU8aKXOcqIyBHVR4PxkK4WVwHh1lbkSOKvNuLkFSgfup1t6l1NqWldTFWnS9SWhTYYgKBvSo9z5JixOg3pskuCOBqdLpMR9HWAVwhRKoTQA5cDT8XvlGV5QJZltyzLJbIslwDvAhfIsrxelTM+ROxmPU6/IkyNIypmuqg8CRcP6Wo2uyAaJdzert6xkhKjO2pdVMEemcy5jaTiGB1fTTJGN44QgsLZLlqq+4hF0zEAh4uuoABhNB5VE6MHFXRZliPA9cALwHbgEVmWtwohbhNCXKD2CSaSmCaIPSRTP5B462KJvQSBUH0hSzykqyFuXVTR6WI0GdBgZGCgX7VjjFkX1Wp2YS9PuRG6cCmX+tEpxOjGKZzlIjgSobNBPffSsYLQaDCUlx9V1sVJ1dBlWX5WluVKWZbLZVn+n9HbfijL8lMTbHvG0TY6jzNsMeLxx6hXIXDKpDVRYCtIiniUZVloCCp/OrWti0atBZ9fPS+6I1dl62IKOl20WXMAiHR9NOXHFsx0gkjH6SaKo83pckysFI0Tysok3x+jIZj4kgskMdPFbaVpMERYZ1Ddumg12QiqGKOb4VbfuphqTheDczYRjSDWUz3lx5qserIKbWlBTxCGSi+Rzk6i/f1H+lSAY0zQ9QUleEbCtCATVWESrsxRRv1gPeGYyk6XbCWkq6t0JiGVrYt2u4OoCOFXYUEWqG9dTEWni8lcgt8oQX/9IT2+cJaLjtpBQgH1JtSPFeJOl6NllH5MCbqzaCaF/l4iQtAeTLzoVjgqiMQiNA2pK7LxkK52j1f1EXqmOxOAFpVidEFd62IqOl30ukwCRh3SwKFNiBfOchKLybTs6E/siR2DHG1Ol2NK0AsLiigIKJ5qtayLoP5oMB7S1ZLpUXVSFCB3LEZXPS+6mtbFVHS6jMXo+vrhEF6zvHIHWp2ULrskAG1eHpLFctQ4XY4pQc8w6ckMKIuLGvyJr6OXZpQCyQvpajJlEh0YIDqo3qRlfjxGt2PqC1kmi5rWRUhNp0vMnosmGoHhqccba3QS+V4HzWlBP2yEEBi83qPG6XJMCTqAOTqEJMvUDyR++bNZZ8Zj9SQlg7ssy0KjUEbqanYvysy2I2SJvj71Vheqbl1MQacLzhIA5L76Q3p44WwXfe0j+PqOyuilaYXBW0Fw505VrjCnyjEn6EGLnly/TL0KMbqgiEcyEv7Ks6zUBwQy6uaiS5KETlgYGk6CdbEj7XSZLBrXDAAi3VsO6fGFs5RM/XTZ5fAxeL1E+/uJ9qh3FTtZjjlBD2c68fhjNAy
rE1BUbi+nbqCOSExdB0FZlpWBUIwBvUX1kC6z3oo/qN5ClDHrokoj9FR0umizjgcgeoiC7sq3YM7Q05TuYnTYjDldjoKyyzEn6NqCYjz+KI2xqCr7L3OUEY6FaR5S130Snxhtyy1VfXGRzZpBSB5RLUZX0miwZ+eoZl1MRaeLyT6DkE4Q6z00ERFCUDDLSVNVH3LsyJcKpjNjTpejoGn0MSforuKZFPgH6dVoGI4mXtSTlelSMRrS1VqgvnXR6XQhixg9Hf3qHSMvXzXrYio6XeIxuvQf+od54SwXAV+Y7uaphXylGY/G7UbjcKRH6EeCgsJiCuIxuipaF9WeGI2HdLW4PKovLsrOGfWiN6rnRVfTugip53TRaAyEzBa0g4eehJmuoyeGMafLUeBFP+YE3WbS4w4okxdqWBctOgt5ljzVxUMjCUozLTSb3YRbWpFVuNqIk1uQDUBHq5oxuipbF0edLmrn1SeTiC0T3YgPDrF8aLEbcOVb0oKeAI4Wp8sxJ+gAjrDi2GgYVMeyVeYoS047umwLDcIMkQgRFWN0PSWKF13NGN1kWBejcpT6wXpV9n8kkO0FCFmGwZaDb7wfCme5aNs1QCSk3oDgWMDg9RLz+VT9P5wMx6Sga/QylohMfb86oVNxp0tUpYnXOGVuKy0hibDQqGpdNJkNaGSDqjG6alsXU9HpIlxKee9QYnTjFM5yEY3EaN3Vn6CzOjY5WjJdjklBj2Ta8YzEqBtUx4pX4aggGA3S6lNHnOKUZ1uIytBucaluXTRqLPj809e6GHe6JGONQLLQuuMxuhsOeR/5XgeSVqTb0h0mY06XIxwBcEwKuqawUInRDanjFY9PjKpdR4+HdDXZc1XvL2ox2QhE1HNDxK2LajtdUmmErs+ajwxEu7cf8j50Bg155fZ0Hf0w0TgcaLOyjrjT5ZgUdFfJTAr8I7RqJVUmMcrsoyFdKo8Gx7zoeeWqh3TZMxxECRJUwRkUx5mXr5oXHZS/Syo5XYzWMgIGCbmv7rD2UzjLRU+zj5HBFIpGOAIYvEe+2cUxKegFRWUU+HsJShJdKozSbXob2eZs1a2L8ZCulswCQirmuQC43C4Q0NKgonUxJ09V62KFoyKlnC56XSYBkw5p4PA+BNP2xcQQty7KKi3AmwzHpKBbjTqyA8qbtyGgzqikwlGRlHptWZaFJnOm6iP0nLwkxOjm5atuXUwlp4sQgrDFjnbo8ITYXWjDaNGl0xcPE4PXixwIqBqWdzCOSUEHcIUGAKgfUiekq8xeRt1AHTFZ3U/rsiwrjcJCpK+PqE+9GrcnCTG6ybAuQmo5XaL2HHTBAIQP/X0sSYKCmU6atvcecR/1dOZoaHZxzAq6TVIuu+t6VbIuOsrxR/zqO12yrAzGpNGQLvVGBlm5TpAlenvVG8WpbV0stZemnNMFRxEAct/hJUkWznIxPBCit029/rGpjv4ocLocs4IuXFayAzFq+wZU2X98NKj2AqP4xGizNUvVkC5JI6EXZoZ86sXoqm1dTEWni5RZCUCka/Nh7adglhOA5rR98ZDRWK3o8vOPqNPlmBV0qaAAz0iMBr9Kq0XtyWlHFw/parFlqx7SZdJbGVExRldt6yKkntNF654LQLh702HtJyPThCPHnJ4YPUz0R9jpcswKuqN0BgX+IC1CnZfAbrCTZcpSXTziIV3NrgLVFxfZLOrG6IL61sVUc7oYM+cTlSDWs+Ow91U400nLjj6i4SPn0pjuGL1eQrW1yOEj8/46ZgW9oLiMAv8AXToNgag6b+AyR5nq1sV4SFdrZoHqi4ucTieyiNLXrU6ZCtS3Lqaa08VoKiBg1CD6D78bU8EsF5FQjPZa9f6+qY7B60UOhwk1Nh6R4x+zgm416skO9CMLQXNQHetiuV1pR6e2c6A820JjEqyLWdmjMbr109u6COov+koWGo2BoNmENHj4fxPPDCdCEumyy2GgP8LNLo5ZQQfIjFsXferU0eNOl/ZhdRPYytx
W2iQzI63t6sboetSP0XXm5AHqWRfHnC4pNDEasWai8x3+qNpg0pJbmpEW9MPAUF4OQhyxidFjWtCdKEJer6J1EZKQ6ZJlIYqgTW8j0qne6LmgNBeA7m71vOiOPA+gnnUxFZ0uMXse2kgE/Id/VVMwy0Vn4xCB4dSYY0g2ktGIvqgoLehHAqvNgCEqU9Olzogk3o5Obeti+ajTpcmWraoX3Ww1Isl6VWN01bYuguJ0SSVBF85SAKKJmBid5QIZmqvS9sVD5Ug6XY5pQdd48vH4Y9T5RlTZv8PowGV0qS4eu73o2apPjBo1Fnwj09u6mGpOF81ojG6489BjdOPklNjQGzXpssthYPB6CTU0EAsmviPawZiUoAshVgkhqoUQu4QQ35ng/luEENuEEJuEEGuEEMWJP9XE46jwkj8SoVnFOctkZLrEQ7qabdmqWxctRnVjdEF962K5o5yIHEkZp4suez4A0e6th70vSSPhmZGOATgcjF4vRKOE6g4vBfNQOKigCyE0wF3AamA2cIUQYvZem20AFsmyPA/4N/DLRJ+oGniKvRT4h2jVaVV785bZFeui2v8cZVkWpWG0yiN0e4aDCAFCIfVGt8mwLkLqOF1MjlmEtQK5NzHPp3CWi6GeAANd6uQcpTpH0ukymRH6EmCXLMu1siyHgIeAC/fcQJblV2VZjtct3gUKEnua6mAx6skLDDCi1dAbVscdUu4oxxf20TGiXuwsKCFdzUmwLroylRjdtobpa10syShJKaeLTpdJwKhFDBx6b9E9icfpptMXDw1DSQlotUdkYnQygu4B9lSJ5tHb9sfVwHMT3SGEuFYIsV4Isb6rSz3r21RwB5VsksYRdepdY5kuKi8wKs+yMigZ6GpXz4ECkJPvBqC1ST1BH7MuqlR2MWqNFFgLUkbQlRjdDDRDifnb27NN2FxGGrelBf1QEHo9htKSo1bQJ40Q4tPAIuBXE90vy/K9siwvkmV5UVZWViIPfci4UC4r63rVqQsn6/I+PjFaH9IQG1Fnkhcgv0iJ0e3s6FbtGGPWRZXr6Kki6ACRjCz0wz5IQCyDEILCWU5aqvuIqbSKOtXRVxwZp8tkBL0FKNzj94LR28YhhDgb+B5wgSzLyZ/ePURcRg0AO9vVESiX0YXT4FRdPMrdu0O61OxelJOfCbKgr1c9W5tiXdSoal1MNacLjiIkWUYeSow7qHB2JqFAlM4G9RxNqYzB6yXc1KTq4GoiJiPo6wCvEKJUCKEHLgee2nMDIcQJwB9RxFy9a3EVsORl4wrGqOlX741b5lDf9+xxmtBL0GRV14suaSR0mBkcUi/vQ7Eu5qqbuugoSymni3ApE3Hhro8Ssr+CGU4Q6bZ0h4rB6wUgWJPcq8CDCrosyxHgeuAFYDvwiCzLW4UQtwkhLhjd7FeAFXhUCLFRCPHUfnZ31GH3evH4YzSreGmZjEwXjSQocZlptmapPjGqdowuJCd1EVLH6aLNGvWiJ0jQjVYd2UW2tKAfIoYj5HTRTmYjWZafBZ7d67Yf7vHz2Qk+r6ThKfVSsOE9NjqNqh2j3FHOUGiIbn83WWb15g7KcjPY0pCjunXRZsmgta8bWZYRQqhyDEdOHk1bN6t2jFRzuuizFgIQ665K2D4LZrnY8GIjIX8EvWlSUpFmFH1REUKvT/rE6DG9UhTAbDSQ5x+kU68jHFPX96x2pktFlpVWs4uRJMXo9veoN0pX27qYak4Xk62MgF6CBMToxima5UKOybTsSMcATBWh0aCvKE8L+pEgKzhEVBK0BFS2LiahHV1MSDR0qVsOcWeNxug2qJciqbZ1EVLL6SJJBoJmI9Jg4uYdcsvsaPUSTem2dIeE4Qg4XdKCDmTKo9bFPnVmpDONmWToM9R3uoyGdNUPRZFV7Cq0O0Z3elsXU83pErE60Q71J2x/Gp1EvteZrqMfIgavl0h7O9FB9frw7k1a0AG3XqnR7mhRZ8QphFAyXZIU0tVodBLpUk9sPSWKoHd
3qbeIKRnWxbjTpWEwcWWKI0nMnos+EIBI4hq2FM5y0t8xwlCvOj0DUpkxp0sSR+lpQQeys+xoYzI1KrZWK3OUqe50sRl1uA1CcbqoGNJls1uQZB39KtW3ITnWxbjTZddAijSNdpQggGhv4uq2hbOVGID0KH3qGCpGBT2JTpe0oAMObwX5fpmmUES1Y5TbyxkIDtATUHdpflmmiRZrFiGVrYsGycLQiLqXko7cPFVLLqnmdJEyZwIQ6vwwYft05Vmw2PVpQT8EdPl5SGZzUidG04IO5JdXku8P0KbRqHaMpGW65DuVRhcqO10sRhuBsDqdnuI4c/NVTV1MNafL7hjdLQnbpxCCglkumrf3IavkAktVhCQpzS7Sgp5czEYjHr+PVoNetWMkK9OlPCeDIb2Frmb1ShUAGRl2IvgJqxmjq7J1EVLL6WJ0zycmEltyASV9MTAcpqspHQMwVZLtdEkL+ijZQR+DOi2DEXVidLNMWdh0NvWdLtmK02VXp7qj57EY3Sb1Jl+TZV1MFaeLzpClxOj2J7bcVjDTCaTr6IeCwesl2tNDpEfdUmuctKCPkhVVLIv1A+pYF4UQScl0iYd01Q+qNx8AkJ2rrHhtbVIv5z1ZqYup4nRRYnStaAcTG01tsRvI9FjTfvRDYMzpkqSJ0bSgj5KlU+qD1Q2JaRIwERWOCtUXF3mcJvTINGIk5lev40x+kWJd7FIppRL2sC6mnS6TJmJzoxtOfGmkcJaTtpp+wiF1rmBTlTGnS5LKLmlBHyXPZQOgukO9S6Myexm9gV56A+pdumokQZF5NHWxRb0Pp9wCN8iCXhVjdMesiyp60VPN6SLbPejCEeRAYh1IhbNcxCIybTv7E7rfVEebnYVktydtYjQt6KPklZeQEZZp8qtXSx2bGFV7gZHLRLPK1kWtVoMOk6oxuqC+dTHVnC7CpbzHwl0bErrfPK8DjVaiMV1HnxJCCAxJdLqkBX2U/MpZeEbCtInpb12sKMik3ZLJSKO61kWT3spwQOUYXZWti5BaTheNezRGtyOxgq7Ta8irsKf7jB4CcaeL2o3iIS3oY5iMBjz+Edp16lkXc8w5WHQW9a2LhW6ikoa6RnV7jVjNGYRi6rppHLl5SbEuporTRZ+9AIBoz/aE77twlouelmGGB6ZNQ7KjAoPXS2xwkEin+r1/0oK+BzlBH20mPVGVPkmFEJTby9VfXDRqXaztVKdPahyHw0FMRBjoVW+U7szNB9S3LqaK08XonE1EI5D76hK+78JZSgxAc1Xa7TIVxpwuO9Qvu6QFfQ+yIn7CkqBtWL0gojJHmeq56PGQrjqfutbFrLEY3eltXUwlp4ukMRI0GZAGEv96uQusGK06mralyy5TIZkhXWlB34MsjRI5u61ePXdIub2cnkAP/YF+1Y6RYdSRKcI0hHWq1u1yRmN021sS63vek2RYF1PN6RK22tH6Ej+KFpKgcKaTpqrepNSDUwWt04nG7U7KxGha0PcgP8MMQFWzeo0bktXsotQMzSYX0W4VY3RLcwB1Y3STYV1MNadLNCMH/fAwqCC6BbNcjAyE6G1Vd+4k1UiW0yUt6HtQXlKAJMs0+dQruSQr06XUZaLJlq1qf1G7w4oka+nv71ftGDBqXexQN5smlZwuOIrQxGSig4m3rcbr6OkYgKlhqPASrKlRtfEMpAV9HIWzZ5Drj9Emq/ey5FpyMWlNqotHhcfFkN5CZ526Mbp6ycLQsLoxus7cfPrbWlW3LqaK00XKrAQg2LEu4fu2uYw4cszpGIApYvBWII+MEG5V70oT0oI+DpPJTH4gQLvOoNoxJCFRZlc/08VbobhDdjWoa5VSYnRVdtMkybqYKk4XrXseAJHuzarsv3C2i9YdfUTD6o42U4mxCACVnS5pQd+LXP8wrUb1BB0U8VB9cVGekpBXo7J1McNmJ4yfiEoplZAk66JdKYWlgtPFkLMIgFjPDlX2XzjLRSQco61W3VXCqYTBqzip1Ha6pAV
9L3IifnoNWobV7F7kKKfT38lgSL1ShcdpQidHqRtS17rocrlAyLQ3qed0cYwKuprWxVJ7KZKQVP+gTQY6SwEhnQT96lxteCodSJJI19GngMZmQ5uXp/rEaFrQ9yJbKAK4TcXac3w0qKZ4aCRBoQjQENaqdgyA7Fw3AK1N6pV2MrKyVbcuxp0uaq8RSAZCCEIWC5pBdf4meqOWnLKMdAzAFDFUqO90SQv6XuRZjYC6XvQyRxmgfkhXiQkadXZiQfWWasdjdDtVjNFNhnURSEpefbII21zofOqVRApnuehsHCLgm/6TyMnC4PUSqq1Fjqh31ZwW9L2oKMwFoHFAPZ+tx+rBqDGqbl0sc5mUkC4VrYt5hVkgQ2+PuqO1ZFgXKxwVKeN0kTPy0fuDyNGQKvsvnOUCGZqq0qP0yWLwepFDIUKN6l39pwV9L2bOnoE5ItOmYo6/JCRK7aXqT4x6XEQlDbXVjaodQ6vTosXE4KC6E2TJsi6mitMFZykSEFLJ6ZJdbENv0qbLLlPAUDE6Mapi2SUt6Hthtlrx+EO0a9V3uqhdr02WddGkUz9GN25dHBnoV+0YqeR00bhnAxDu/FCV/UsaiYIZTpq296VjACaJobwMhCC4Ky3oSSXPP0K70ajqMcod5XSMdOALqWcrrPAqwVa1neqKrdWcQVDlGN24dbGvTb25jVRyuuizTwAg0r1FtWMUznYx1BtgoFO9VoephGQ2oyssVLW/aFrQJyAnHKDFZCCm4jLdMaeLipkudpMeV3iY2iF1+0AqMbphhlScd0iGdTGVnC6G7AXEALlXvfdX4SxlrUPavjh51Ha6TErQhRCrhBDVQohdQojvTHC/QQjx8Oj97wkhShJ+pkkkWw4T1AhqVGwYnax2dEVJsC664zG69erF6CbDugip43SRtGZCRh1iQL33sD3LTIbbmBb0KWDwegk1NBALqTNZfVBBF0JogLuA1cBs4AohxOy9Nrsa6JNluQL4LfCLRJ9oMvGYdQBs2VWv3jGsHvSSXnXxKDXJNGlt6sbo5mcB6sboKtbFHNWti6nkdAlZM9AOqWcnBSV9saW6j1g0HQMwGQxeL0QihOrqVdn/ZEboS4BdsizXyrIcAh4CLtxrmwuBB0Z//jewXAghEneayaV8dLFMQ696Kzk1koZSe2lSUhcH9Ra6WtSbGC0oUT9GF5SySzJSF1PF6RK1ZaEfVjf6oWiWi1AgSke9uvM0qcJYBIBKZZfJCLoH2NM42Tx624TbyLIcAQaAzL13JIS4VgixXgixvqtLvdHc4XL8cZWc0tWP06Bef1GAFcUrmJ2598VOYlkwu5Dl0Q4CfvUWFzkyM3AZPThdDtWOAVBy/EI8M9R9vWa7ZrOieIWqx0gWovQ0RvJKkWPqXW14ZjgpmpPJ9B2+JRd9aSnWM85AY7ersn9xsEtxIcQlwCpZlr84+vtVwImyLF+/xzZbRrdpHv29ZnSb/V7vLVq0SF6/fn0CnkKaNGnSHDsIIT6QZXnRRPdNZoTeAhTu8XvB6G0TbiOE0AJ2QN3r7zRp0qRJM47JCPo6wCuEKBVC6IHLgaf22uYp4LOjP18CvCKnVxukSZMmTVI5qJ9NluWIEOJ64AVAA/xVluWtQojbgPWyLD8F/AX4uxBiF9CLIvpp0qRJkyaJTMqgLMvys8Cze932wz1+DgCfTOyppUmTJk2aqZBeKZomTZo0KUJa0NOkSZMmRUgLepo0adKkCGlBT5MmTZoU4aALi1Q7sBBdwKGur3YD6oZUJI/0czn6SJXnAenncrRyOM+lWJblrInu+P/t3U+IVWUYx/HvjwxKi7SNlAa2CCOkMlxoggv/gJSY+4qillEWQSjtQzCkIChC/AMOgoxGEBgOJrSpoCwsnciFYVOj40aNXFT4c3HegWGmodM5x977Hp4PDPfOXdzze5h7nznnvfecJ1tDb0PS17OdKVWaqGXw9KUOiFoG1c2qJZZcQgihJ6KhhxBCT5Ta0D/MHaB
DUcvg6UsdELUMqptSS5Fr6CGEEGYqdQ89hBDCNNHQQwihJ4pr6P82sLoUku6TdELSGUmnJW3NnakNSbdI+lbSJ7mztCFpvqRhST9KGpW0KnempiS9ll5bP0g6KOm23JnqkrRH0kQanjP52N2SRiSdTbcLcmasY5Y6dqbX1ylJH0ma39X2imroNQdWl+Jv4HXbDwErgZcKrgVgKzCaO0QH3gU+tf0g8AiF1iRpEfAKsML2MqpLX5d0Wet9wMZpj20Djtt+ADiefh90+5hZxwiwzPbDwE/A9q42VlRDp97A6iLYHrd9Mt3/napxTJ/VWgRJi4Engd25s7Qh6S5gDdX1/bH9p+3LWUO1Mwe4PU0Rmwv8ljlPbbY/p5qtMNXUYfT7gS3/Z6Ym/qkO28fS7GWAL6mmwHWitIZeZ2B1cSQtAZYDX2WO0tQ7wBvA9cw52rofuATsTctHuyXNyx2qCdu/Am8D54Fx4IrtY3lTtbbQ9ni6fwFYmDNMR14Ajnb1ZKU19N6RdAdwGHjV9tXcef4rSZuACdvf5M7SgTnAY8D7tpcDf1DGYf0MaX35Kap/UvcC8yQ9kzdVd9KIy6K/cy3pTaql16GunrO0hl5nYHUxJN1K1cyHbB/Jnaeh1cBmST9TLYGtlXQgb6TGxoAx25NHSsNUDb5E64Fzti/Z/gs4AjyeOVNbFyXdA5BuJzLnaUzS88Am4Oku5y+X1tDrDKwugiRRrdWO2t6VO09TtrfbXmx7CdXf4zPbRe4J2r4A/CJpaXpoHXAmY6Q2zgMrJc1Nr7V1FPoB7xRTh9E/B3ycMUtjkjZSLVFutn2ty+cuqqGnDxImB1aPAodsn86bqrHVwLNUe7TfpZ8ncocKvAwMSToFPAq8lTdOM+koYxg4CXxP9V4v5tR5SQeBL4ClksYkvQjsADZIOkt1BLIjZ8Y6ZqnjPeBOYCS97z/obHtx6n8IIfRDUXvoIYQQZhcNPYQQeiIaeggh9EQ09BBC6Ilo6CGE0BPR0EMIoSeioYcQQk/cAJAmOppr2MjWAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } ], - "text/plain": [ - " user_id ... click_referrer_type\n", - "25667 190841 ... 2\n", - "25668 190841 ... 2\n", - "25669 190841 ... 2\n", - "25670 190841 ... 2\n", - "107739 164226 ... 2\n", - "\n", - "[5 rows x 9 columns]" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" + "source": [ + "for _, user_df in sub_user_info.groupby('user_id'):\n", + " item_sim_list = get_item_sim_list(user_df)\n", + " plt.plot(item_sim_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这里由于对词向量的训练迭代次数不是很多,所以看到的可视化结果不是很准确,可以训练更多次来观察具体的现象。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 总结\n", + "\n", + "通过数据分析的过程, 我们目前可以得到以下几点重要的信息, 这个对于我们进行后面的特征制作和分析非常有帮助:\n", + "1. 训练集和测试集的用户id没有重复,也就是测试集里面的用户模型是没有见过的\n", + "2. 训练集中用户最少的点击文章数是2, 而测试集里面用户最少的点击文章数是1\n", + "3. 用户对于文章存在重复点击的情况, 但这个都存在于训练集里面\n", + "4. 同一用户的点击环境存在不唯一的情况,后面做这部分特征的时候可以采用统计特征\n", + "5. 用户点击文章的次数有很大的区分度,后面可以根据这个制作衡量用户活跃度的特征\n", + "6. 文章被用户点击的次数也有很大的区分度,后面可以根据这个制作衡量文章热度的特征\n", + "7. 用户看的新闻,相关性是比较强的,所以往往我们判断用户是否对某篇文章感兴趣的时候, 在很大程度上会和他历史点击过的文章有关\n", + "8. 用户点击的文章字数有比较大的区别, 这个可以反映用户对于文章字数的区别\n", + "9. 
用户点击过的文章主题也有很大的区别, 这个可以反映用户的主题偏好\n", + "10.不同用户点击文章的时间差也会有所区别, 这个可以反映用户对于文章时效性的偏好\n", + "\n", + "所以根据上面的一些分析,可以更好的帮助我们后面做好特征工程, 充分挖掘数据的隐含信息。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "关于Datawhale: Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", + "\n", + "![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" + ] } - ], - "source": [ - "# 随机选择5个用户,查看这些用户前后查看文章的相似性\n", - "sub_user_ids = np.random.choice(user_click_merge.user_id.unique(), size=15, replace=False)\n", - "sub_user_info = user_click_merge[user_click_merge['user_id'].isin(sub_user_ids)]\n", - "\n", - "sub_user_info.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "# 上一个版本,这个函数使用的是赛题提供的词向量,但是由于给出的embedding并不是所有的数据的embedding,所以运行下面画图函数的时候会报keyerror的错误\n", - "# 为了防止出现这个错误,这里修改为使用word2vec训练得到的词向量进行可视化\n", - "def get_item_sim_list(df):\n", - " sim_list = []\n", - " item_list = df['click_article_id'].values\n", - " for i in range(0, len(item_list)-1):\n", - " emb1 = item_w2v_emb_dict[str(item_list[i])] # 需要注意的是word2vec训练时候使用的是str类型的数据\n", - " emb2 = item_w2v_emb_dict[str(item_list[i+1])]\n", - " sim_list.append(np.dot(emb1,emb2)/(np.linalg.norm(emb1)*(np.linalg.norm(emb2))))\n", - " sim_list.append(0)\n", - " return sim_list" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" + ], + "metadata": { + "kernelspec": { + "display_name": "Keras Code", + "language": "python", + "name": "dswipython" + }, + "language_info": { + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + }, + "tianchi_metadata": { + "competitions": [], + "datasets": [], + "description": "", + "notebookId": "130008", + "source": "dsw" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "278px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } - ], - "source": [ - "for _, user_df in sub_user_info.groupby('user_id'):\n", - " item_sim_list = get_item_sim_list(user_df)\n", - " plt.plot(item_sim_list)" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "这里由于对词向量的训练迭代次数不是很多,所以看到的可视化结果不是很准确,可以训练更多次来观察具体的现象。" - ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 总结\n", - "\n", - "通过数据分析的过程, 我们目前可以得到以下几点重要的信息, 这个对于我们进行后面的特征制作和分析非常有帮助:\n", - "1. 训练集和测试集的用户id没有重复,也就是测试集里面的用户模型是没有见过的\n", - "2. 训练集中用户最少的点击文章数是2, 而测试集里面用户最少的点击文章数是1\n", - "3. 用户对于文章存在重复点击的情况, 但这个都存在于训练集里面\n", - "4. 同一用户的点击环境存在不唯一的情况,后面做这部分特征的时候可以采用统计特征\n", - "5. 用户点击文章的次数有很大的区分度,后面可以根据这个制作衡量用户活跃度的特征\n", - "6. 文章被用户点击的次数也有很大的区分度,后面可以根据这个制作衡量文章热度的特征\n", - "7. 用户看的新闻,相关性是比较强的,所以往往我们判断用户是否对某篇文章感兴趣的时候, 在很大程度上会和他历史点击过的文章有关\n", - "8. 用户点击的文章字数有比较大的区别, 这个可以反映用户对于文章字数的区别\n", - "9. 用户点击过的文章主题也有很大的区别, 这个可以反映用户的主题偏好\n", - "10.不同用户点击文章的时间差也会有所区别, 这个可以反映用户对于文章时效性的偏好\n", - "\n", - "所以根据上面的一些分析,可以更好的帮助我们后面做好特征工程, 充分挖掘数据的隐含信息。" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "关于Datawhale: Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", - "\n", - "![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Keras Code", - "language": "python", - "name": "dswipython" - }, - "language_info": { - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python" - }, - "latex_envs": { - "LaTeX_envs_menu_present": true, - "autoclose": false, - "autocomplete": true, - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 1, - "hotkeys": { - "equation": "Ctrl-E", - "itemize": "Ctrl-I" - }, - "labels_anchors": false, - "latex_user_defs": false, - "report_style_numbering": false, - "user_envs_cfg": false - }, - "tianchi_metadata": { - "competitions": [], - "datasets": [], - "description": "", - 
"notebookId": "130008", - "source": "dsw" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "278px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" - }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " - } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.3 \345\244\232\350\267\257\345\217\254\345\233\236.ipynb" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.3 \345\244\232\350\267\257\345\217\254\345\233\236.ipynb" index 3a4bccd4e..08bc05222 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.3 \345\244\232\350\267\257\345\217\254\345\233\236.ipynb" +++ "b/docs/\347\254\254\344\272\214\347\253\240 
\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.3 \345\244\232\350\267\257\345\217\254\345\233\236.ipynb" @@ -1,2107 +1,2107 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 多路召回\n", - "\n", - "所谓的“多路召回”策略,就是指采用不同的策略、特征或简单模型,分别召回一部分候选集,然后把候选集混合在一起供后续排序模型使用,可以明显的看出,“多路召回策略”是在“计算速度”和“召回率”之间进行权衡的结果。其中,各种简单策略保证候选集的快速召回,从不同角度设计的策略保证召回率接近理想的状态,不至于损伤排序效果。如下图是多路召回的一个示意图,在多路召回中,每个策略之间毫不相关,所以一般可以写并发多线程同时进行,这样可以更加高效。\n", - "\n", - "\"image-20201119132726873\"\n", - "\n", - "上图只是一个多路召回的例子,也就是说可以使用多种不同的策略来获取用户排序的候选商品集合,而具体使用哪些召回策略其实是与业务强相关的 ,针对不同的任务就会有对于该业务真实场景下需要考虑的召回规则。例如新闻推荐,召回规则可以是“热门新闻”、“作者召回”、“关键词召回”、“主题召回“、”协同过滤召回“等等。 \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 导包" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:29.834662Z", - "start_time": "2020-11-16T11:26:27.811511Z" - } - }, - "outputs": [], - "source": [ - "import pandas as pd \n", - "import numpy as np\n", - "from tqdm import tqdm \n", - "from collections import defaultdict \n", - "import os, math, warnings, math, pickle\n", - "from tqdm import tqdm\n", - "import faiss\n", - "import collections\n", - "import random\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "from sklearn.preprocessing import LabelEncoder\n", - "from datetime import datetime\n", - "from deepctr.feature_column import SparseFeat, VarLenSparseFeat\n", - "from sklearn.preprocessing import LabelEncoder\n", - "from tensorflow.python.keras import backend as K\n", - "from tensorflow.python.keras.models import Model\n", - "from tensorflow.python.keras.preprocessing.sequence import pad_sequences\n", - "\n", - "from deepmatch.models import *\n", - "from deepmatch.utils import sampledsoftmaxloss\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - 
"metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:31.831215Z", - "start_time": "2020-11-16T11:26:31.826939Z" - } - }, - "outputs": [], - "source": [ - "data_path = './data_raw/'\n", - "save_path = './temp_results/'\n", - "# 做召回评估的一个标志, 如果不进行评估就是直接使用全量数据进行召回\n", - "metric_recall = False" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取数据\n", - "在一般的rs比赛中读取数据部分主要分为三种模式, 不同的模式对应的不同的数据集:\n", - "1. debug模式: 这个的目的是帮助我们基于数据先搭建一个简易的baseline并跑通, 保证写的baseline代码没有什么问题。 由于推荐比赛的数据往往非常巨大, 如果一上来直接采用全部的数据进行分析,搭建baseline框架, 往往会带来时间和设备上的损耗, **所以这时候我们往往需要从海量数据的训练集中随机抽取一部分样本来进行调试(train_click_log_sample)**, 先跑通一个baseline。\n", - "2. 线下验证模式: 这个的目的是帮助我们在线下基于已有的训练集数据, 来选择好合适的模型和一些超参数。 **所以我们这一块只需要加载整个训练集(train_click_log)**, 然后把整个训练集再分成训练集和验证集。 训练集是模型的训练数据, 验证集部分帮助我们调整模型的参数和其他的一些超参数。\n", - "3. 线上模式: 我们用debug模式搭建起一个推荐系统比赛的baseline, 用线下验证模式选择好了模型和一些超参数, 这一部分就是真正的对于给定的测试集进行预测, 提交到线上, **所以这一块使用的训练数据集是全量的数据集(train_click_log+test_click_log)**\n", - "\n", - "下面就分别对这三种不同的数据读取模式先建立不同的代导入函数, 方便后面针对不同的模式下导入数据。" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:34.476240Z", - "start_time": "2020-11-16T11:26:34.467352Z" - } - }, - "outputs": [], - "source": [ - "# debug模式: 从训练集中划出一部分数据来调试代码\n", - "def get_all_click_sample(data_path, sample_nums=10000):\n", - " \"\"\"\n", - " 训练集中采样一部分数据调试\n", - " data_path: 原数据的存储路径\n", - " sample_nums: 采样数目(这里由于机器的内存限制,可以采样用户做)\n", - " \"\"\"\n", - " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " all_user_ids = all_click.user_id.unique()\n", - "\n", - " sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False) \n", - " all_click = all_click[all_click['user_id'].isin(sample_user_ids)]\n", - " \n", - " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", - " return all_click\n", - "\n", - "# 读取点击数据,这里分成线上和线下,如果是为了获取线上提交结果应该讲测试集中的点击数据合并到总的数据中\n", - "# 
如果是为了线下验证模型的有效性或者特征的有效性,可以只使用训练集\n", - "def get_all_click_df(data_path='./data_raw/', offline=True):\n", - " if offline:\n", - " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " else:\n", - " trn_click = pd.read_csv(data_path + 'train_click_log.csv')\n", - " tst_click = pd.read_csv(data_path + 'testA_click_log.csv')\n", - "\n", - " all_click = trn_click.append(tst_click)\n", - " \n", - " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", - " return all_click" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:35.168738Z", - "start_time": "2020-11-16T11:26:35.163210Z" - } - }, - "outputs": [], - "source": [ - "# 读取文章的基本属性\n", - "def get_item_info_df(data_path):\n", - " item_info_df = pd.read_csv(data_path + 'articles.csv')\n", - " \n", - " # 为了方便与训练集中的click_article_id拼接,需要把article_id修改成click_article_id\n", - " item_info_df = item_info_df.rename(columns={'article_id': 'click_article_id'})\n", - " \n", - " return item_info_df" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:36.152958Z", - "start_time": "2020-11-16T11:26:36.146324Z" - } - }, - "outputs": [], - "source": [ - "# 读取文章的Embedding数据\n", - "def get_item_emb_dict(data_path):\n", - " item_emb_df = pd.read_csv(data_path + 'articles_emb.csv')\n", - " \n", - " item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]\n", - " item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])\n", - " # 进行归一化\n", - " item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)\n", - "\n", - " item_emb_dict = dict(zip(item_emb_df['article_id'], item_emb_np))\n", - " pickle.dump(item_emb_dict, open(save_path + 'item_content_emb.pkl', 'wb'))\n", - " \n", - " return item_emb_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-11-16T11:26:37.333536Z", - "start_time": "2020-11-16T11:26:37.329545Z" - } - }, - "outputs": [], - "source": [ - "max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:42.163494Z", - "start_time": "2020-11-16T11:26:38.018094Z" - } - }, - "outputs": [], - "source": [ - "# 采样数据\n", - "# all_click_df = get_all_click_sample(data_path)\n", - "\n", - "# 全量训练集\n", - "all_click_df = get_all_click_df(offline=False)\n", - "\n", - "# 对时间戳进行归一化,用于在关联规则的时候计算权重\n", - "all_click_df['click_timestamp'] = all_click_df[['click_timestamp']].apply(max_min_scaler)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:26:44.343500Z", - "start_time": "2020-11-16T11:26:44.113891Z" - } - }, - "outputs": [], - "source": [ - "item_info_df = get_item_info_df(data_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:24.295343Z", - "start_time": "2020-11-16T11:26:44.398007Z" - } - }, - "outputs": [], - "source": [ - "item_emb_dict = get_item_emb_dict(data_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 工具函数" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 获取用户-文章-时间函数\n", - "这个在基于关联规则的用户协同过滤的时候会用到" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:33.791656Z", - "start_time": "2020-11-16T11:27:33.784305Z" - } - }, - "outputs": [], - "source": [ - "# 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", - "def get_user_item_time(click_df):\n", - " \n", - " click_df = click_df.sort_values('click_timestamp')\n", - " \n", - " def make_item_time_pair(df):\n", - " return list(zip(df['click_article_id'], df['click_timestamp']))\n", - " \n", - " 
user_item_time_df = click_df.groupby('user_id')['click_article_id', 'click_timestamp'].apply(lambda x: make_item_time_pair(x))\\\n", - " .reset_index().rename(columns={0: 'item_time_list'})\n", - " user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))\n", - " \n", - " return user_item_time_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 获取文章-用户-时间函数\n", - "这个在基于关联规则的文章协同过滤的时候会用到" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:38.327581Z", - "start_time": "2020-11-16T11:27:38.321059Z" - } - }, - "outputs": [], - "source": [ - "# 根据时间获取商品被点击的用户序列 {item1: [(user1, time1), (user2, time2)...]...}\n", - "# 这里的时间是用户点击当前商品的时间,好像没有直接的关系。\n", - "def get_item_user_time_dict(click_df):\n", - " def make_user_time_pair(df):\n", - " return list(zip(df['user_id'], df['click_timestamp']))\n", - " \n", - " click_df = click_df.sort_values('click_timestamp')\n", - " item_user_time_df = click_df.groupby('click_article_id')['user_id', 'click_timestamp'].apply(lambda x: make_user_time_pair(x))\\\n", - " .reset_index().rename(columns={0: 'user_time_list'})\n", - " \n", - " item_user_time_dict = dict(zip(item_user_time_df['click_article_id'], item_user_time_df['user_time_list']))\n", - " return item_user_time_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 获取历史和最后一次点击\n", - "这个在评估召回结果, 特征工程和制作标签转成监督学习测试集的时候回用到" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:50.894683Z", - "start_time": "2020-11-16T11:27:50.888002Z" - } - }, - "outputs": [], - "source": [ - "# 获取当前数据的历史点击和最后一次点击\n", - "def get_hist_and_last_click(all_click):\n", - " \n", - " all_click = all_click.sort_values(by=['user_id', 'click_timestamp'])\n", - " click_last_df = all_click.groupby('user_id').tail(1)\n", - "\n", - " # 
如果用户只有一个点击,hist为空了,会导致训练的时候这个用户不可见,此时默认泄露一下\n", - " def hist_func(user_df):\n", - " if len(user_df) == 1:\n", - " return user_df\n", - " else:\n", - " return user_df[:-1]\n", - "\n", - " click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)\n", - "\n", - " return click_hist_df, click_last_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 获取文章属性特征" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:55.893810Z", - "start_time": "2020-11-16T11:27:55.887623Z" - } - }, - "outputs": [], - "source": [ - "# 获取文章id对应的基本属性,保存成字典的形式,方便后面召回阶段,冷启动阶段直接使用\n", - "def get_item_info_dict(item_info_df):\n", - " max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))\n", - " item_info_df['created_at_ts'] = item_info_df[['created_at_ts']].apply(max_min_scaler)\n", - " \n", - " item_type_dict = dict(zip(item_info_df['click_article_id'], item_info_df['category_id']))\n", - " item_words_dict = dict(zip(item_info_df['click_article_id'], item_info_df['words_count']))\n", - " item_created_time_dict = dict(zip(item_info_df['click_article_id'], item_info_df['created_at_ts']))\n", - " \n", - " return item_type_dict, item_words_dict, item_created_time_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-13T06:42:38.730939Z", - "start_time": "2020-11-13T06:42:38.728461Z" - } - }, - "source": [ - "### 获取用户历史点击的文章信息" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:27:59.650781Z", - "start_time": "2020-11-16T11:27:59.640572Z" - } - }, - "outputs": [], - "source": [ - "def get_user_hist_item_info_dict(all_click):\n", - " \n", - " # 获取user_id对应的用户历史点击文章类型的集合字典\n", - " user_hist_item_typs = all_click.groupby('user_id')['category_id'].agg(set).reset_index()\n", - " user_hist_item_typs_dict = dict(zip(user_hist_item_typs['user_id'], 
user_hist_item_typs['category_id']))\n", - " \n", - " # 获取user_id对应的用户点击文章的集合\n", - " user_hist_item_ids_dict = all_click.groupby('user_id')['click_article_id'].agg(set).reset_index()\n", - " user_hist_item_ids_dict = dict(zip(user_hist_item_ids_dict['user_id'], user_hist_item_ids_dict['click_article_id']))\n", - " \n", - " # 获取user_id对应的用户历史点击的文章的平均字数字典\n", - " user_hist_item_words = all_click.groupby('user_id')['words_count'].agg('mean').reset_index()\n", - " user_hist_item_words_dict = dict(zip(user_hist_item_words['user_id'], user_hist_item_words['words_count']))\n", - " \n", - " # 获取user_id对应的用户最后一次点击的文章的创建时间\n", - " all_click_ = all_click.sort_values('click_timestamp')\n", - " user_last_item_created_time = all_click_.groupby('user_id')['created_at_ts'].apply(lambda x: x.iloc[-1]).reset_index()\n", - " \n", - " max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))\n", - " user_last_item_created_time['created_at_ts'] = user_last_item_created_time[['created_at_ts']].apply(max_min_scaler)\n", - " \n", - " user_last_item_created_time_dict = dict(zip(user_last_item_created_time['user_id'], \\\n", - " user_last_item_created_time['created_at_ts']))\n", - " \n", - " return user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 获取点击次数最多的topk个文章" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:28:04.761105Z", - "start_time": "2020-11-16T11:28:04.756419Z" - } - }, - "outputs": [], - "source": [ - "# 获取近期点击最多的文章\n", - "def get_item_topk_click(click_df, k):\n", - " topk_click = click_df['click_article_id'].value_counts().index[:k]\n", - " return topk_click" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 定义多路召回字典" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-11-16T11:28:08.321506Z", - "start_time": "2020-11-16T11:28:07.623281Z" - } - }, - "outputs": [], - "source": [ - "# 获取文章的属性信息,保存成字典的形式方便查询\n", - "item_type_dict, item_words_dict, item_created_time_dict = get_item_info_dict(item_info_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:28:13.791569Z", - "start_time": "2020-11-16T11:28:13.786522Z" - } - }, - "outputs": [], - "source": [ - "# 定义一个多路召回的字典,将各路召回的结果都保存在这个字典当中\n", - "user_multi_recall_dict = {'itemcf_sim_itemcf_recall': {},\n", - " 'embedding_sim_item_recall': {},\n", - " 'youtubednn_recall': {},\n", - " 'youtubednn_usercf_recall': {}, \n", - " 'cold_start_recall': {}}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T05:41:12.710754Z", - "start_time": "2020-11-16T05:40:57.842614Z" - } - }, - "outputs": [], - "source": [ - "# 提取最后一次点击作为召回评估,如果不需要做召回评估直接使用全量的训练集进行召回(线下验证模型)\n", - "# 如果不是召回评估,直接使用全量数据进行召回,不用将最后一次提取出来\n", - "trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 召回效果评估函数\n", - "做完了召回有时候也需要对当前的召回方法或者参数进行调整以达到更好的召回效果,因为召回的结果决定了最终排序的上限,下面也会提供一个召回评估的方法" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T05:41:18.579118Z", - "start_time": "2020-11-16T05:41:18.571887Z" - } - }, - "outputs": [], - "source": [ - "# 依次评估召回的前10, 20, 30, 40, 50个文章中的击中率\n", - "def metrics_recall(user_recall_items_dict, trn_last_click_df, topk=5):\n", - " last_click_item_dict = dict(zip(trn_last_click_df['user_id'], trn_last_click_df['click_article_id']))\n", - " user_num = len(user_recall_items_dict)\n", - " \n", - " for k in range(10, topk+1, 10):\n", - " hit_num = 0\n", - " for user, item_list in user_recall_items_dict.items():\n", - " # 获取前k个召回的结果\n", - " tmp_recall_items = [x[0] for x in 
user_recall_items_dict[user][:k]]\n", - " if last_click_item_dict[user] in set(tmp_recall_items):\n", - " hit_num += 1\n", - " \n", - " hit_rate = round(hit_num * 1.0 / user_num, 5)\n", - " print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 计算相似性矩阵\n", - "\n", - "这一部分主要是通过协同过滤以及向量检索得到相似性矩阵,相似性矩阵主要分为user2user和item2item,下面依次获取基于itemcf的item2item的相似性矩阵," - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### itemcf i2i_sim\n", - "\n", - "借鉴KDD2020的去偏商品推荐,在计算item2item相似性矩阵时,使用关联规则,使得计算的文章的相似性还考虑到了:\n", - "1. 用户点击的时间权重\n", - "2. 用户点击的顺序权重\n", - "3. 文章创建的时间权重" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:30:51.872262Z", - "start_time": "2020-11-16T11:30:51.860099Z" - } - }, - "outputs": [], - "source": [ - "def itemcf_sim(df, item_created_time_dict):\n", - " \"\"\"\n", - " 文章与文章之间的相似性矩阵计算\n", - " :param df: 数据表\n", - " :item_created_time_dict: 文章创建时间的字典\n", - " return : 文章与文章的相似性矩阵\n", - " \n", - " 思路: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习) + 关联规则\n", - " \"\"\"\n", - " \n", - " user_item_time_dict = get_user_item_time(df)\n", - " \n", - " # 计算物品相似度\n", - " i2i_sim = {}\n", - " item_cnt = defaultdict(int)\n", - " for user, item_time_list in tqdm(user_item_time_dict.items()):\n", - " # 在基于商品的协同过滤优化的时候可以考虑时间因素\n", - " for loc1, (i, i_click_time) in enumerate(item_time_list):\n", - " item_cnt[i] += 1\n", - " i2i_sim.setdefault(i, {})\n", - " for loc2, (j, j_click_time) in enumerate(item_time_list):\n", - " if(i == j):\n", - " continue\n", - " \n", - " # 考虑文章的正向顺序点击和反向顺序点击 \n", - " loc_alpha = 1.0 if loc2 > loc1 else 0.7\n", - " # 位置信息权重,其中的参数可以调节\n", - " loc_weight = loc_alpha * (0.9 ** (np.abs(loc2 - loc1) - 1))\n", - " # 点击时间权重,其中的参数可以调节\n", - " click_time_weight = np.exp(0.7 ** np.abs(i_click_time - j_click_time))\n", - " # 两篇文章创建时间的权重,其中的参数可以调节\n", - " 
created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))\n", - " i2i_sim[i].setdefault(j, 0)\n", - " # 考虑多种因素的权重计算最终的文章之间的相似度\n", - " i2i_sim[i][j] += loc_weight * click_time_weight * created_time_weight / math.log(len(item_time_list) + 1)\n", - " \n", - " i2i_sim_ = i2i_sim.copy()\n", - " for i, related_items in i2i_sim.items():\n", - " for j, wij in related_items.items():\n", - " i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])\n", - " \n", - " # 将得到的相似性矩阵保存到本地\n", - " pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))\n", - " \n", - " return i2i_sim_" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:47:09.937002Z", - "start_time": "2020-11-16T11:30:57.394334Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [14:20<00:00, 290.38it/s]\n" - ] - } - ], - "source": [ - "i2i_sim = itemcf_sim(all_click_df, item_created_time_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### usercf u2u_sim\n", - "\n", - "在计算用户之间的相似度的时候,也可以使用一些简单的关联规则,比如用户活跃度权重,这里将用户的点击次数作为用户活跃度的指标" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T09:11:14.951940Z", - "start_time": "2020-11-16T09:11:14.945654Z" - } - }, - "outputs": [], - "source": [ - "def get_user_activate_degree_dict(all_click_df):\n", - " all_click_df_ = all_click_df.groupby('user_id')['click_article_id'].count().reset_index()\n", - " \n", - " # 用户活跃度归一化\n", - " mm = MinMaxScaler()\n", - " all_click_df_['click_article_id'] = mm.fit_transform(all_click_df_[['click_article_id']])\n", - " user_activate_degree_dict = dict(zip(all_click_df_['user_id'], all_click_df_['click_article_id']))\n", - " \n", - " return user_activate_degree_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - 
"ExecuteTime": { - "end_time": "2020-11-16T09:11:19.879276Z", - "start_time": "2020-11-16T09:11:19.868808Z" - } - }, - "outputs": [], - "source": [ - "def usercf_sim(all_click_df, user_activate_degree_dict):\n", - " \"\"\"\n", - " 用户相似性矩阵计算\n", - " :param all_click_df: 数据表\n", - " :param user_activate_degree_dict: 用户活跃度的字典\n", - " return 用户相似性矩阵\n", - " \n", - " 思路: 基于用户的协同过滤(详细请参考上一期推荐系统基础的组队学习) + 关联规则\n", - " \"\"\"\n", - " item_user_time_dict = get_item_user_time_dict(all_click_df)\n", - " \n", - " u2u_sim = {}\n", - " user_cnt = defaultdict(int)\n", - " for item, user_time_list in tqdm(item_user_time_dict.items()):\n", - " for u, click_time in user_time_list:\n", - " user_cnt[u] += 1\n", - " u2u_sim.setdefault(u, {})\n", - " for v, click_time in user_time_list:\n", - " u2u_sim[u].setdefault(v, 0)\n", - " if u == v:\n", - " continue\n", - " # 用户平均活跃度作为活跃度的权重,这里的式子也可以改善\n", - " activate_weight = 100 * 0.5 * (user_activate_degree_dict[u] + user_activate_degree_dict[v]) \n", - " u2u_sim[u][v] += activate_weight / math.log(len(user_time_list) + 1)\n", - " \n", - " u2u_sim_ = u2u_sim.copy()\n", - " for u, related_users in u2u_sim.items():\n", - " for v, wij in related_users.items():\n", - " u2u_sim_[u][v] = wij / math.sqrt(user_cnt[u] * user_cnt[v])\n", - " \n", - " # 将得到的相似性矩阵保存到本地\n", - " pickle.dump(u2u_sim_, open(save_path + 'usercf_u2u_sim.pkl', 'wb'))\n", - "\n", - " return u2u_sim_" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T06:59:46.701572Z", - "start_time": "2020-11-16T06:59:26.852246Z" - } - }, - "outputs": [], - "source": [ - "# 由于usercf计算时候太耗费内存了,这里就不直接运行了\n", - "# 如果是采样的话,是可以运行的\n", - "user_activate_degree_dict = get_user_activate_degree_dict(all_click_df)\n", - "u2u_sim = usercf_sim(all_click_df, user_activate_degree_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "### item embedding sim\n", - "\n", - "使用Embedding计算item之间的相似度是为了后续冷启动的时候可以获取未出现在点击数据中的文章,后面有对冷启动专门的介绍,这里简单的说一下faiss。\n", - "\n", - "aiss是Facebook的AI团队开源的一套用于做聚类或者相似性搜索的软件库,底层是用C++实现。Faiss因为超级优越的性能,被广泛应用于推荐相关的业务当中.\n", - "\n", - "faiss工具包一般使用在推荐系统中的向量召回部分。在做向量召回的时候要么是u2u,u2i或者i2i,这里的u和i指的是user和item.我们知道在实际的场景中user和item的数量都是海量的,我们最容易想到的基于向量相似度的召回就是使用两层循环遍历user列表或者item列表计算两个向量的相似度,但是这样做在面对海量数据是不切实际的,faiss就是用来加速计算某个查询向量最相似的topk个索引向量。\n", - "\n", - "**faiss查询的原理:**\n", - "\n", - "faiss使用了PCA和PQ(Product quantization乘积量化)两种技术进行向量压缩和编码,当然还使用了其他的技术进行优化,但是PCA和PQ是其中最核心部分。\n", - "\n", - "1. PCA降维算法细节参考下面这个链接进行学习 \n", - "[主成分分析(PCA)原理总结](https://www.cnblogs.com/pinard/p/6239403.html) \n", - "\n", - "2. PQ编码的细节下面这个链接进行学习 \n", - "[实例理解product quantization算法](http://www.fabwrite.com/productquantization)\n", - "\n", - "**faiss使用**\n", - "\n", - "[faiss官方教程](https://github.com/facebookresearch/faiss/wiki/Getting-started)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T09:11:28.631803Z", - "start_time": "2020-11-16T09:11:28.619926Z" - } - }, - "outputs": [], - "source": [ - "# 向量检索相似度计算\n", - "# topk指的是每个item, faiss搜索后返回最相似的topk个item\n", - "def embdding_sim(click_df, item_emb_df, save_path, topk):\n", - " \"\"\"\n", - " 基于内容的文章embedding相似性矩阵计算\n", - " :param click_df: 数据表\n", - " :param item_emb_df: 文章的embedding\n", - " :param save_path: 保存路径\n", - " :patam topk: 找最相似的topk篇\n", - " return 文章相似性矩阵\n", - " \n", - " 思路: 对于每一篇文章, 基于embedding的相似性返回topk个与其最相似的文章, 只不过由于文章数量太多,这里用了faiss进行加速\n", - " \"\"\"\n", - " \n", - " # 文章索引与文章id的字典映射\n", - " item_idx_2_rawid_dict = dict(zip(item_emb_df.index, item_emb_df['article_id']))\n", - " \n", - " item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]\n", - " item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols].values, dtype=np.float32)\n", - " # 向量进行单位化\n", - " item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, 
keepdims=True)\n", - " \n", - " # 建立faiss索引\n", - " item_index = faiss.IndexFlatIP(item_emb_np.shape[1])\n", - " item_index.add(item_emb_np)\n", - " # 相似度查询,给每个索引位置上的向量返回topk个item以及相似度\n", - " sim, idx = item_index.search(item_emb_np, topk) # 返回的是列表\n", - " \n", - " # 将向量检索的结果保存成原始id的对应关系\n", - " item_sim_dict = collections.defaultdict(dict)\n", - " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(range(len(item_emb_np)), sim, idx)):\n", - " target_raw_id = item_idx_2_rawid_dict[target_idx]\n", - " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", - " for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): \n", - " rele_raw_id = item_idx_2_rawid_dict[rele_idx]\n", - " item_sim_dict[target_raw_id][rele_raw_id] = item_sim_dict.get(target_raw_id, {}).get(rele_raw_id, 0) + sim_value\n", - " \n", - " # 保存i2i相似度矩阵\n", - " pickle.dump(item_sim_dict, open(save_path + 'emb_i2i_sim.pkl', 'wb')) \n", - " \n", - " return item_sim_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T09:32:35.926116Z", - "start_time": "2020-11-16T09:11:44.586967Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "364047it [00:23, 15292.14it/s]\n" - ] - } - ], - "source": [ - "item_emb_df = pd.read_csv(data_path + '/articles_emb.csv')\n", - "emb_i2i_sim = embdding_sim(all_click_df, item_emb_df, save_path, topk=10) # topk可以自行设置" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 召回\n", - "这个就是我们开篇提到的那个问题, 面的36万篇文章, 20多万用户的推荐, 我们又有哪些策略来缩减问题的规模? 
我们就可以再召回阶段筛选出用户对于点击文章的候选集合, 从而降低问题的规模。召回常用的策略:\n", - "* Youtube DNN 召回\n", - "* 基于文章的召回\n", - " * 文章的协同过滤\n", - " * 基于文章embedding的召回\n", - "* 基于用户的召回\n", - " * 用户的协同过滤\n", - " * 用户embedding\n", - "\n", - "上面的各种召回方式一部分在基于用户已经看得文章的基础上去召回与这些文章相似的一些文章, 而这个相似性的计算方式不同, 就得到了不同的召回方式, 比如文章的协同过滤, 文章内容的embedding等。还有一部分是根据用户的相似性进行推荐,对于某用户推荐与其相似的其他用户看过的文章,比如用户的协同过滤和用户embedding。 还有一种思路是类似矩阵分解的思路,先计算出用户和文章的embedding之后,就可以直接算用户和文章的相似度, 根据这个相似度进行推荐, 比如YouTube DNN。 我们下面详细来看一下每一个召回方法:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### YoutubeDNN召回\n", - "**(这一步是直接获取用户召回的候选文章列表)**\n", - "\n", - "[论文下载地址](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)\n", - "\n", - "**Youtubednn召回架构**\n", - "\n", - "![image-20201111160516562](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201111160516562.png)\n", - "\n", - "\n", - "\n", - "关于YoutubeDNN原理和应用推荐看王喆的两篇博客:\n", - "\n", - "1. [重读Youtube深度学习推荐系统论文,字字珠玑,惊为神文](https://zhuanlan.zhihu.com/p/52169807)\n", - "2. [YouTube深度学习推荐系统的十大工程问题](https://zhuanlan.zhihu.com/p/52504407)\n", - "\n", - "\n", - "**参考文献:**\n", - "1. https://zhuanlan.zhihu.com/p/52169807 (YouTubeDNN原理)\n", - "2. 
https://zhuanlan.zhihu.com/p/26306795 (Word2Vec知乎众赞文章) --- word2vec放到排序中的w2v的介绍部分\n" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:13:11.058766Z", - "start_time": "2020-11-16T10:13:11.041084Z" - } - }, - "outputs": [], - "source": [ - "# 获取双塔召回时的训练验证数据\n", - "# negsample指的是通过滑窗构建样本的时候,负样本的数量\n", - "def gen_data_set(data, negsample=0):\n", - " data.sort_values(\"click_timestamp\", inplace=True)\n", - " item_ids = data['click_article_id'].unique()\n", - "\n", - " train_set = []\n", - " test_set = []\n", - " for reviewerID, hist in tqdm(data.groupby('user_id')):\n", - " pos_list = hist['click_article_id'].tolist()\n", - " \n", - " if negsample > 0:\n", - " candidate_set = list(set(item_ids) - set(pos_list)) # 用户没看过的文章里面选择负样本\n", - " neg_list = np.random.choice(candidate_set,size=len(pos_list)*negsample,replace=True) # 对于每个正样本,选择n个负样本\n", - " \n", - " # 长度只有一个的时候,需要把这条数据也放到训练集中,不然的话最终学到的embedding就会有缺失\n", - " if len(pos_list) == 1:\n", - " train_set.append((reviewerID, [pos_list[0]], pos_list[0],1,len(pos_list)))\n", - " test_set.append((reviewerID, [pos_list[0]], pos_list[0],1,len(pos_list)))\n", - " \n", - " # 滑窗构造正负样本\n", - " for i in range(1, len(pos_list)):\n", - " hist = pos_list[:i]\n", - " \n", - " if i != len(pos_list) - 1:\n", - " train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]))) # 正样本 [user_id, his_item, pos_item, label, len(his_item)]\n", - " for negi in range(negsample):\n", - " train_set.append((reviewerID, hist[::-1], neg_list[i*negsample+negi], 0,len(hist[::-1]))) # 负样本 [user_id, his_item, neg_item, label, len(his_item)]\n", - " else:\n", - " # 将最长的那一个序列长度作为测试数据\n", - " test_set.append((reviewerID, hist[::-1], pos_list[i],1,len(hist[::-1])))\n", - " \n", - " random.shuffle(train_set)\n", - " random.shuffle(test_set)\n", - " \n", - " return train_set, test_set\n", - "\n", - "# 将输入的数据进行padding,使得序列特征的长度都一致\n", - "def 
gen_model_input(train_set,user_profile,seq_max_len):\n", - "\n", - " train_uid = np.array([line[0] for line in train_set])\n", - " train_seq = [line[1] for line in train_set]\n", - " train_iid = np.array([line[2] for line in train_set])\n", - " train_label = np.array([line[3] for line in train_set])\n", - " train_hist_len = np.array([line[4] for line in train_set])\n", - "\n", - " train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)\n", - " train_model_input = {\"user_id\": train_uid, \"click_article_id\": train_iid, \"hist_article_id\": train_seq_pad,\n", - " \"hist_len\": train_hist_len}\n", - "\n", - " return train_model_input, train_label" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:13:18.124452Z", - "start_time": "2020-11-16T10:13:18.098284Z" - } - }, - "outputs": [], - "source": [ - "def youtubednn_u2i_dict(data, topk=20): \n", - " sparse_features = [\"click_article_id\", \"user_id\"]\n", - " SEQ_LEN = 30 # 用户点击序列的长度,短的填充,长的截断\n", - " \n", - " user_profile_ = data[[\"user_id\"]].drop_duplicates('user_id')\n", - " item_profile_ = data[[\"click_article_id\"]].drop_duplicates('click_article_id') \n", - " \n", - " # 类别编码\n", - " features = [\"click_article_id\", \"user_id\"]\n", - " feature_max_idx = {}\n", - " \n", - " for feature in features:\n", - " lbe = LabelEncoder()\n", - " data[feature] = lbe.fit_transform(data[feature])\n", - " feature_max_idx[feature] = data[feature].max() + 1\n", - " \n", - " # 提取user和item的画像,这里具体选择哪些特征还需要进一步的分析和考虑\n", - " user_profile = data[[\"user_id\"]].drop_duplicates('user_id')\n", - " item_profile = data[[\"click_article_id\"]].drop_duplicates('click_article_id') \n", - " \n", - " user_index_2_rawid = dict(zip(user_profile['user_id'], user_profile_['user_id']))\n", - " item_index_2_rawid = dict(zip(item_profile['click_article_id'], item_profile_['click_article_id']))\n", - " \n", - " # 划分训练和测试集\n", 
- " # 由于深度学习需要的数据量通常都是非常大的,所以为了保证召回的效果,往往会通过滑窗的形式扩充训练样本\n", - " train_set, test_set = gen_data_set(data, 0)\n", - " # 整理输入数据,具体的操作可以看上面的函数\n", - " train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)\n", - " test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)\n", - " \n", - " # 确定Embedding的维度\n", - " embedding_dim = 16\n", - " \n", - " # 将数据整理成模型可以直接输入的形式\n", - " user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),\n", - " VarLenSparseFeat(SparseFeat('hist_article_id', feature_max_idx['click_article_id'], embedding_dim,\n", - " embedding_name=\"click_article_id\"), SEQ_LEN, 'mean', 'hist_len'),]\n", - " item_feature_columns = [SparseFeat('click_article_id', feature_max_idx['click_article_id'], embedding_dim)]\n", - " \n", - " # 模型的定义 \n", - " # num_sampled: 负采样时的样本数量\n", - " model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, user_dnn_hidden_units=(64, embedding_dim))\n", - " # 模型编译\n", - " model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss) \n", - " \n", - " # 模型训练,这里可以定义验证集的比例,如果设置为0的话就是全量数据直接进行训练\n", - " history = model.fit(train_model_input, train_label, batch_size=256, epochs=1, verbose=1, validation_split=0.0)\n", - " \n", - " # 训练完模型之后,提取训练的Embedding,包括user端和item端\n", - " test_user_model_input = test_model_input\n", - " all_item_model_input = {\"click_article_id\": item_profile['click_article_id'].values}\n", - "\n", - " user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)\n", - " item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)\n", - " \n", - " # 保存当前的item_embedding 和 user_embedding 排序的时候可能能够用到,但是需要注意保存的时候需要和原始的id对应\n", - " user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)\n", - " item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)\n", - " \n", - " # embedding保存之前归一化一下\n", - " user_embs = user_embs / 
np.linalg.norm(user_embs, axis=1, keepdims=True)\n", - " item_embs = item_embs / np.linalg.norm(item_embs, axis=1, keepdims=True)\n", - " \n", - " # 将Embedding转换成字典的形式方便查询\n", - " raw_user_id_emb_dict = {user_index_2_rawid[k]: \\\n", - " v for k, v in zip(user_profile['user_id'], user_embs)}\n", - " raw_item_id_emb_dict = {item_index_2_rawid[k]: \\\n", - " v for k, v in zip(item_profile['click_article_id'], item_embs)}\n", - " # 将Embedding保存到本地\n", - " pickle.dump(raw_user_id_emb_dict, open(save_path + 'user_youtube_emb.pkl', 'wb'))\n", - " pickle.dump(raw_item_id_emb_dict, open(save_path + 'item_youtube_emb.pkl', 'wb'))\n", - " \n", - " # faiss紧邻搜索,通过user_embedding 搜索与其相似性最高的topk个item\n", - " index = faiss.IndexFlatIP(embedding_dim)\n", - " # 上面已经进行了归一化,这里可以不进行归一化了\n", - "# faiss.normalize_L2(user_embs)\n", - "# faiss.normalize_L2(item_embs)\n", - " index.add(item_embs) # 将item向量构建索引\n", - " sim, idx = index.search(np.ascontiguousarray(user_embs), topk) # 通过user去查询最相似的topk个item\n", - " \n", - " user_recall_items_dict = collections.defaultdict(dict)\n", - " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(test_user_model_input['user_id'], sim, idx)):\n", - " target_raw_id = user_index_2_rawid[target_idx]\n", - " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", - " for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): \n", - " rele_raw_id = item_index_2_rawid[rele_idx]\n", - " user_recall_items_dict[target_raw_id][rele_raw_id] = user_recall_items_dict.get(target_raw_id, {})\\\n", - " .get(rele_raw_id, 0) + sim_value\n", - " \n", - " user_recall_items_dict = {k: sorted(v.items(), key=lambda x: x[1], reverse=True) for k, v in user_recall_items_dict.items()}\n", - " # 将召回的结果进行排序\n", - " \n", - " # 保存召回的结果\n", - " # 这里是直接通过向量的方式得到了召回结果,相比于上面的召回方法,上面的只是得到了i2i及u2u的相似性矩阵,还需要进行协同过滤召回才能得到召回结果\n", - " # 可以直接对这个召回结果进行评估,为了方便可以统一写一个评估函数对所有的召回结果进行评估\n", - " pickle.dump(user_recall_items_dict, open(save_path + 'youtube_u2i_dict.pkl', 'wb'))\n", - " return 
user_recall_items_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T10:21:46.420014Z", - "start_time": "2020-11-16T10:13:35.351131Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [02:02<00:00, 2038.57it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:143: calling RandomNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n", - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:253: calling reduce_sum_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "keep_dims is deprecated, use keepdims instead\n", - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:253: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Deprecated in favor of operator or tf.math.divide.\n", - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1288: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n", - "1149673/1149673 [==============================] - 216s 188us/sample - loss: 0.1326\n" - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "250000it [00:32, 7720.75it/s]\n" - ] - } - ], - "source": [ - "# 由于这里需要做召回评估,所以讲训练集中的最后一次点击都提取了出来\n", - "if not metric_recall:\n", - " user_multi_recall_dict['youtubednn_recall'] = youtubednn_u2i_dict(all_click_df, topk=20)\n", - "else:\n", - " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", - " user_multi_recall_dict['youtubednn_recall'] = youtubednn_u2i_dict(trn_hist_click_df, topk=20)\n", - " # 召回效果评估\n", - " metrics_recall(user_multi_recall_dict['youtubednn_recall'], trn_last_click_df, topk=20)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### itemcf recall\n", - "\n", - "上面已经通过协同过滤,Embedding检索的方式得到了文章的相似度矩阵,下面使用协同过滤的思想,给用户召回与其历史文章相似的文章。\n", - "这里在召回的时候,也是用了关联规则的方式:\n", - "1. 考虑相似文章与历史点击文章顺序的权重(细节看代码)\n", - "2. 考虑文章创建时间的权重,也就是考虑相似文章与历史点击文章创建时间差的权重\n", - "3. 考虑文章内容相似度权重(使用Embedding计算相似文章相似度,但是这里需要注意,在Embedding的时候并没有计算所有商品两两之间的相似度,所以相似的文章与历史点击文章不存在相似度,需要做特殊处理)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T11:48:40.580553Z", - "start_time": "2020-11-16T11:48:40.567130Z" - } - }, - "outputs": [], - "source": [ - "# 基于商品的召回i2i\n", - "def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim):\n", - " \"\"\"\n", - " 基于文章协同过滤的召回\n", - " :param user_id: 用户id\n", - " :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", - " :param i2i_sim: 字典,文章相似性矩阵\n", - " :param sim_item_topk: 整数, 选择与当前文章最相似的前k篇文章\n", - " :param recall_item_num: 整数, 最后的召回文章数量\n", - " :param item_topk_click: 列表,点击次数最多的文章列表,用户召回补全\n", - " :param emb_i2i_sim: 字典基于内容embedding算的文章相似矩阵\n", - " \n", - " return: 召回的文章列表 [(item1, score1), (item2, score2)...]\n", - " \"\"\"\n", - " # 获取用户历史交互的文章\n", - " user_hist_items = user_item_time_dict[user_id]\n", - " user_hist_items_ = {user_id for user_id, 
_ in user_hist_items}\n", - " \n", - " item_rank = {}\n", - " for loc, (i, click_time) in enumerate(user_hist_items):\n", - " for j, wij in sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]:\n", - " if j in user_hist_items_:\n", - " continue\n", - " \n", - " # 文章创建时间差权重\n", - " created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))\n", - " # 相似文章和历史点击文章序列中历史文章所在的位置权重\n", - " loc_weight = (0.9 ** (len(user_hist_items) - loc))\n", - " \n", - " content_weight = 1.0\n", - " if emb_i2i_sim.get(i, {}).get(j, None) is not None:\n", - " content_weight += emb_i2i_sim[i][j]\n", - " if emb_i2i_sim.get(j, {}).get(i, None) is not None:\n", - " content_weight += emb_i2i_sim[j][i]\n", - " \n", - " item_rank.setdefault(j, 0)\n", - " item_rank[j] += created_time_weight * loc_weight * content_weight * wij\n", - " \n", - " # 不足10个,用热门商品补全\n", - " if len(item_rank) < recall_item_num:\n", - " for i, item in enumerate(item_topk_click):\n", - " if item in item_rank.items(): # 填充的item应该不在原来的列表中\n", - " continue\n", - " item_rank[item] = - i - 100 # 随便给个负数就行\n", - " if len(item_rank) == recall_item_num:\n", - " break\n", - " \n", - " item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]\n", - " \n", - " return item_rank" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### itemcf sim召回" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T14:41:23.433038Z", - "start_time": "2020-11-16T11:48:46.286350Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [2:51:13<00:00, 24.33it/s] \n" - ] - } - ], - "source": [ - "# 先进行itemcf召回, 为了召回评估,所以提取最后一次点击\n", - "\n", - "if metric_recall:\n", - " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", - "else:\n", - " trn_hist_click_df = 
all_click_df\n", - "\n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", - "\n", - "i2i_sim = pickle.load(open(save_path + 'itemcf_i2i_sim.pkl', 'rb'))\n", - "emb_i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl', 'rb'))\n", - "\n", - "sim_item_topk = 20\n", - "recall_item_num = 10\n", - "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", - "\n", - "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, \\\n", - " i2i_sim, sim_item_topk, recall_item_num, \\\n", - " item_topk_click, item_created_time_dict, emb_i2i_sim)\n", - "\n", - "user_multi_recall_dict['itemcf_sim_itemcf_recall'] = user_recall_items_dict\n", - "pickle.dump(user_multi_recall_dict['itemcf_sim_itemcf_recall'], open(save_path + 'itemcf_recall_dict.pkl', 'wb'))\n", - "\n", - "if metric_recall:\n", - " # 召回效果评估\n", - " metrics_recall(user_multi_recall_dict['itemcf_sim_itemcf_recall'], trn_last_click_df, topk=recall_item_num)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### embedding sim 召回" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T15:04:51.527795Z", - "start_time": "2020-11-16T14:59:03.907519Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [04:35<00:00, 905.85it/s] \n" - ] - } - ], - "source": [ - "# 这里是为了召回评估,所以提取最后一次点击\n", - "if metric_recall:\n", - " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", - "else:\n", - " trn_hist_click_df = all_click_df\n", - "\n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", - "i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl','rb'))\n", - "\n", - 
"sim_item_topk = 20\n", - "recall_item_num = 10\n", - "\n", - "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", - "\n", - "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, sim_item_topk, \n", - " recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim)\n", - " \n", - "user_multi_recall_dict['embedding_sim_item_recall'] = user_recall_items_dict\n", - "pickle.dump(user_multi_recall_dict['embedding_sim_item_recall'], open(save_path + 'embedding_sim_item_recall.pkl', 'wb'))\n", - "\n", - "if metric_recall:\n", - " # 召回效果评估\n", - " metrics_recall(user_multi_recall_dict['embedding_sim_item_recall'], trn_last_click_df, topk=recall_item_num)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### usercf召回\n", - "\n", - "基于用户协同过滤,核心思想是给用户推荐与其相似的用户历史点击文章,因为这里涉及到了相似用户的历史文章,这里仍然可以加上一些关联规则来给用户可能点击的文章进行加权,这里使用的关联规则主要是考虑相似用户的历史点击文章与被推荐用户历史点击商品的关系权重,而这里的关系就可以直接借鉴基于物品的协同过滤相似的做法,只不过这里是对被推荐物品关系的一个累加的过程,下面是使用的一些关系权重,及相关的代码:\n", - "\n", - "1. 
计算被推荐用户历史点击文章与相似用户历史点击文章的相似度,文章创建时间差,相对位置的总和,作为各自的权重" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:09:32.293990Z", - "start_time": "2020-11-17T02:09:32.278678Z" - } - }, - "outputs": [], - "source": [ - "# 基于用户的召回 u2u2i\n", - "def user_based_recommend(user_id, user_item_time_dict, u2u_sim, sim_user_topk, recall_item_num, \n", - " item_topk_click, item_created_time_dict, emb_i2i_sim):\n", - " \"\"\"\n", - " 基于文章协同过滤的召回\n", - " :param user_id: 用户id\n", - " :param user_item_time_dict: 字典, 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", - " :param u2u_sim: 字典,文章相似性矩阵\n", - " :param sim_user_topk: 整数, 选择与当前用户最相似的前k个用户\n", - " :param recall_item_num: 整数, 最后的召回文章数量\n", - " :param item_topk_click: 列表,点击次数最多的文章列表,用户召回补全\n", - " :param item_created_time_dict: 文章创建时间列表\n", - " :param emb_i2i_sim: 字典基于内容embedding算的文章相似矩阵\n", - " \n", - " return: 召回的文章列表 [(item1, score1), (item2, score2)...]\n", - " \"\"\"\n", - " # 历史交互\n", - " user_item_time_list = user_item_time_dict[user_id] # [(item1, time1), (item2, time2)..]\n", - " user_hist_items = set([i for i, t in user_item_time_list]) # 存在一个用户与某篇文章的多次交互, 这里得去重\n", - " \n", - " items_rank = {}\n", - " for sim_u, wuv in sorted(u2u_sim[user_id].items(), key=lambda x: x[1], reverse=True)[:sim_user_topk]:\n", - " for i, click_time in user_item_time_dict[sim_u]:\n", - " if i in user_hist_items:\n", - " continue\n", - " items_rank.setdefault(i, 0)\n", - " \n", - " loc_weight = 1.0\n", - " content_weight = 1.0\n", - " created_time_weight = 1.0\n", - " \n", - " # 当前文章与该用户看的历史文章进行一个权重交互\n", - " for loc, (j, click_time) in enumerate(user_item_time_list):\n", - " # 点击时的相对位置权重\n", - " loc_weight += 0.9 ** (len(user_item_time_list) - loc)\n", - " # 内容相似性权重\n", - " if emb_i2i_sim.get(i, {}).get(j, None) is not None:\n", - " content_weight += emb_i2i_sim[i][j]\n", - " if emb_i2i_sim.get(j, {}).get(i, None) is not None:\n", - " content_weight += 
emb_i2i_sim[j][i]\n", - " \n", - " # 创建时间差权重\n", - " created_time_weight += np.exp(0.8 * np.abs(item_created_time_dict[i] - item_created_time_dict[j]))\n", - " \n", - " items_rank[i] += loc_weight * content_weight * created_time_weight * wuv\n", - " \n", - " # 热度补全\n", - " if len(items_rank) < recall_item_num:\n", - " for i, item in enumerate(item_topk_click):\n", - " if item in items_rank.items(): # 填充的item应该不在原来的列表中\n", - " continue\n", - " items_rank[item] = - i - 100 # 随便给个复数就行\n", - " if len(items_rank) == recall_item_num:\n", - " break\n", - " \n", - " items_rank = sorted(items_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num] \n", - " \n", - " return items_rank" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### usercf sim召回" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:05:41.652501Z", - "start_time": "2020-11-16T07:05:40.953871Z" - } - }, - "outputs": [], - "source": [ - "# 这里是为了召回评估,所以提取最后一次点击\n", - "# 由于usercf中计算user之间的相似度的过程太费内存了,全量数据这里就没有跑,跑了一个采样之后的数据\n", - "if metric_recall:\n", - " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", - "else:\n", - " trn_hist_click_df = all_click_df\n", - " \n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", - "\n", - "u2u_sim = pickle.load(open(save_path + 'usercf_u2u_sim.pkl', 'rb'))\n", - "\n", - "sim_user_topk = 20\n", - "recall_item_num = 10\n", - "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", - "\n", - "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = user_based_recommend(user, user_item_time_dict, u2u_sim, sim_user_topk, \\\n", - " recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim) \n", - "\n", - "pickle.dump(user_recall_items_dict, open(save_path + 'usercf_u2u2i_recall.pkl', 'wb'))\n", - 
"\n", - "if metric_recall:\n", - " # 召回效果评估\n", - " metrics_recall(user_recall_items_dict, trn_last_click_df, topk=recall_item_num)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T03:09:35.853516Z", - "start_time": "2020-11-16T03:09:35.737625Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### user embedding sim召回\n", - "\n", - "虽然没有直接跑usercf的计算用户之间的相似度,为了验证上述基于用户的协同过滤的代码,下面使用了YoutubeDNN过程中产生的user embedding来进行向量检索每个user最相似的topk个user,在使用这里得到的u2u的相似性矩阵,使用usercf进行召回,具体代码如下" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:09:46.807811Z", - "start_time": "2020-11-17T02:09:46.798033Z" - } - }, - "outputs": [], - "source": [ - "# 使用Embedding的方式获取u2u的相似性矩阵\n", - "# topk指的是每个user, faiss搜索后返回最相似的topk个user\n", - "def u2u_embdding_sim(click_df, user_emb_dict, save_path, topk):\n", - " \n", - " user_list = []\n", - " user_emb_list = []\n", - " for user_id, user_emb in user_emb_dict.items():\n", - " user_list.append(user_id)\n", - " user_emb_list.append(user_emb)\n", - " \n", - " user_index_2_rawid_dict = {k: v for k, v in zip(range(len(user_list)), user_list)} \n", - " \n", - " user_emb_np = np.array(user_emb_list, dtype=np.float32)\n", - " \n", - " # 建立faiss索引\n", - " user_index = faiss.IndexFlatIP(user_emb_np.shape[1])\n", - " user_index.add(user_emb_np)\n", - " # 相似度查询,给每个索引位置上的向量返回topk个item以及相似度\n", - " sim, idx = user_index.search(user_emb_np, topk) # 返回的是列表\n", - " \n", - " # 将向量检索的结果保存成原始id的对应关系\n", - " user_sim_dict = collections.defaultdict(dict)\n", - " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(range(len(user_emb_np)), sim, idx)):\n", - " target_raw_id = user_index_2_rawid_dict[target_idx]\n", - " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", - " for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): \n", - " rele_raw_id = 
user_index_2_rawid_dict[rele_idx]\n", - " user_sim_dict[target_raw_id][rele_raw_id] = user_sim_dict.get(target_raw_id, {}).get(rele_raw_id, 0) + sim_value\n", - " \n", - " # 保存i2i相似度矩阵\n", - " pickle.dump(user_sim_dict, open(save_path + 'youtube_u2u_sim.pkl', 'wb')) \n", - " return user_sim_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:14:31.355905Z", - "start_time": "2020-11-17T02:09:53.236531Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "250000it [00:23, 10507.45it/s]\n" - ] - } - ], - "source": [ - "# 读取YoutubeDNN过程中产生的user embedding, 然后使用faiss计算用户之间的相似度\n", - "# 这里需要注意,这里得到的user embedding其实并不是很好,因为YoutubeDNN中使用的是用户点击序列来训练的user embedding,\n", - "# 如果序列普遍都比较短的话,其实效果并不是很好\n", - "user_emb_dict = pickle.load(open(save_path + 'user_youtube_emb.pkl', 'rb'))\n", - "u2u_sim = u2u_embdding_sim(all_click_df, user_emb_dict, save_path, topk=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "通过YoutubeDNN得到的user_embedding" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:49:40.755431Z", - "start_time": "2020-11-17T02:28:47.003514Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [19:43<00:00, 211.22it/s]\n" - ] - } - ], - "source": [ - "# 使用召回评估函数验证当前召回方式的效果\n", - "if metric_recall:\n", - " trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)\n", - "else:\n", - " trn_hist_click_df = all_click_df\n", - "\n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", - "u2u_sim = pickle.load(open(save_path + 'youtube_u2u_sim.pkl', 'rb'))\n", - "\n", - "sim_user_topk = 20\n", - "recall_item_num = 10\n", - "\n", - "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", - 
"for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = user_based_recommend(user, user_item_time_dict, u2u_sim, sim_user_topk, \\\n", - " recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim)\n", - " \n", - "user_multi_recall_dict['youtubednn_usercf_recall'] = user_recall_items_dict\n", - "pickle.dump(user_multi_recall_dict['youtubednn_usercf_recall'], open(save_path + 'youtubednn_usercf_recall.pkl', 'wb'))\n", - "\n", - "if metric_recall:\n", - " # 召回效果评估\n", - " metrics_recall(user_multi_recall_dict['youtubednn_usercf_recall'], trn_last_click_df, topk=recall_item_num)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:07:44.326253Z", - "start_time": "2020-11-16T07:07:43.798931Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 冷启动问题" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**冷启动问题可以分成三类:文章冷启动,用户冷启动,系统冷启动。**\n", - "\n", - "- 文章冷启动:对于一个平台系统新加入的文章,该文章没有任何的交互记录,如何推荐给用户的问题。(对于我们场景可以认为是,日志数据中没有出现过的文章都可以认为是冷启动的文章)\n", - "- 用户冷启动:对于一个平台系统新来的用户,该用户还没有文章的交互信息,如何给该用户进行推荐。(对于我们场景就是,测试集中的用户是否在测试集对应的log数据中出现过,如果没有出现过,那么可以认为该用户是冷启动用户。但是有时候并没有这么严格,我们也可以自己设定某些指标来判别哪些用户是冷启动用户,比如通过使用时长,点击率,留存率等等)\n", - "- 系统冷启动:就是对于一个平台刚上线,还没有任何的相关历史数据,此时就是系统冷启动,其实也就是前面两种的一个综合。\n", - "\n", - "**当前场景下冷启动问题的分析:**\n", - "\n", - "对当前的数据进行分析会发现,日志中所有出现过的点击文章只有3w多个,而整个文章库中却有30多万,那么测试集中的用户最后一次点击是否会点击没有出现在日志中的文章呢?如果存在这种情况,说明用户点击的文章之前没有任何的交互信息,这也就是我们所说的文章冷启动。通过数据分析还可以发现,测试集用户只有一次点击的数据占得比例还不少,其实仅仅通过用户的一次点击就给用户推荐文章使用模型的方式也是比较难的,这里其实也可以考虑用户冷启动的问题,但是这里只给出物品冷启动的一些解决方案及代码,关于用户冷启动的话提一些可行性的做法。\n", - "\n", - "1. 文章冷启动(没有冷启动的探索问题) \n", - " 其实我们这里不是为了做文章的冷启动而做冷启动,而是猜测用户可能会点击一些没有在log数据中出现的文章,我们要做的就是如何从将近27万的文章中选择一些文章作为用户冷启动的文章,这里其实也可以看成是一种召回策略,我们这里就采用简单的比较好理解的基于规则的召回策略来获取用户可能点击的未出现在log数据中的文章。\n", - " 现在的问题变成了:如何给每个用户考虑从27万个商品中获取一小部分商品?随机选一些可能是一种方案。下面给出一些参考的方案。\n", - " 1. 
首先基于Embedding召回一部分与用户历史相似的文章\n", - " 2. 从基于Embedding召回的文章中通过一些规则过滤掉一些文章,使得留下的文章用户更可能点击。我们这里的规则,可以是,留下那些与用户历史点击文章主题相同的文章,或者字数相差不大的文章。并且留下的文章尽量是与测试集用户最后一次点击时间更接近的文章,或者是当天的文章也行。\n", - "2. 用户冷启动 \n", - " 这里对测试集中的用户点击数据进行分析会发现,测试集中有百分之20的用户只有一次点击,那么这些点击特别少的用户的召回是不是可以单独做一些策略上的补充呢?或者是在排序后直接基于规则加上一些文章呢?这些都可以去尝试,这里没有提供具体的做法。\n", - " \n", - "**注意:** \n", - "\n", - "这里看似和基于embedding计算的item之间相似度然后做itemcf是一致的,但是现在我们的目的不一样,我们这里的目的是找到相似的向量,并且还没有出现在log日志中的商品,再加上一些其他的冷启动的策略,这里需要找回的数量会偏多一点,不然被筛选完之后可能都没有文章了" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T04:30:23.027164Z", - "start_time": "2020-11-17T04:23:09.960235Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [05:01<00:00, 828.60it/s] \n" - ] - } - ], - "source": [ - "# 先进行itemcf召回,这里不需要做召回评估,这里只是一种策略\n", - "trn_hist_click_df = all_click_df\n", - "\n", - "user_recall_items_dict = collections.defaultdict(dict)\n", - "user_item_time_dict = get_user_item_time(trn_hist_click_df)\n", - "i2i_sim = pickle.load(open(save_path + 'emb_i2i_sim.pkl','rb'))\n", - "\n", - "sim_item_topk = 150\n", - "recall_item_num = 100 # 稍微召回多一点文章,便于后续的规则筛选\n", - "\n", - "item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)\n", - "for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", - " user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, sim_item_topk, \n", - " recall_item_num, item_topk_click,item_created_time_dict, emb_i2i_sim)\n", - "pickle.dump(user_recall_items_dict, open(save_path + 'cold_start_items_raw_dict.pkl', 'wb'))" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:11:39.267581Z", - "start_time": "2020-11-17T06:11:39.252563Z" - } - }, - "outputs": [], - "source": [ - "# 基于规则进行文章过滤\n", - "# 保留文章主题与用户历史浏览主题相似的文章\n", - "# 保留文章字数与用户历史浏览文章字数相差不大的文章\n", - "# 保留最后一次点击当天的文章\n", - 
"# 按照相似度返回最终的结果\n", - "\n", - "def get_click_article_ids_set(all_click_df):\n", - " return set(all_click_df.click_article_id.values)\n", - "\n", - "def cold_start_items(user_recall_items_dict, user_hist_item_typs_dict, user_hist_item_words_dict, \\\n", - " user_last_item_created_time_dict, item_type_dict, item_words_dict, \n", - " item_created_time_dict, click_article_ids_set, recall_item_num):\n", - " \"\"\"\n", - " 冷启动的情况下召回一些文章\n", - " :param user_recall_items_dict: 基于内容embedding相似性召回来的很多文章, 字典, {user1: [(item1, item2), ..], }\n", - " :param user_hist_item_typs_dict: 字典, 用户点击的文章的主题映射\n", - " :param user_hist_item_words_dict: 字典, 用户点击的历史文章的字数映射\n", - " :param user_last_item_created_time_idct: 字典,用户点击的历史文章创建时间映射\n", - " :param item_tpye_idct: 字典,文章主题映射\n", - " :param item_words_dict: 字典,文章字数映射\n", - " :param item_created_time_dict: 字典, 文章创建时间映射\n", - " :param click_article_ids_set: 集合,用户点击过得文章, 也就是日志里面出现过的文章\n", - " :param recall_item_num: 召回文章的数量, 这个指的是没有出现在日志里面的文章数量\n", - " \"\"\"\n", - " \n", - " cold_start_user_items_dict = {}\n", - " for user, item_list in tqdm(user_recall_items_dict.items()):\n", - " cold_start_user_items_dict.setdefault(user, [])\n", - " for item, score in item_list:\n", - " # 获取历史文章信息\n", - " hist_item_type_set = user_hist_item_typs_dict[user]\n", - " hist_mean_words = user_hist_item_words_dict[user]\n", - " hist_last_item_created_time = user_last_item_created_time_dict[user]\n", - " hist_last_item_created_time = datetime.fromtimestamp(hist_last_item_created_time)\n", - " \n", - " # 获取当前召回文章的信息\n", - " curr_item_type = item_type_dict[item]\n", - " curr_item_words = item_words_dict[item]\n", - " curr_item_created_time = item_created_time_dict[item]\n", - " curr_item_created_time = datetime.fromtimestamp(curr_item_created_time)\n", - "\n", - " # 首先,文章不能出现在用户的历史点击中, 然后根据文章主题,文章单词数,文章创建时间进行筛选\n", - " if curr_item_type not in hist_item_type_set or \\\n", - " item in click_article_ids_set or \\\n", - " abs(curr_item_words - hist_mean_words) > 
200 or \\\n", - " abs((curr_item_created_time - hist_last_item_created_time).days) > 90: \n", - " continue\n", - " \n", - " cold_start_user_items_dict[user].append((item, score)) # {user1: [(item1, score1), (item2, score2)..]...}\n", - " \n", - " # 需要控制一下冷启动召回的数量\n", - " cold_start_user_items_dict = {k: sorted(v, key=lambda x:x[1], reverse=True)[:recall_item_num] \\\n", - " for k, v in cold_start_user_items_dict.items()}\n", - " \n", - " pickle.dump(cold_start_user_items_dict, open(save_path + 'cold_start_user_items_dict.pkl', 'wb'))\n", - " \n", - " return cold_start_user_items_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:35:38.758278Z", - "start_time": "2020-11-17T06:31:40.164332Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [01:49<00:00, 2293.37it/s]\n" - ] - } - ], - "source": [ - "all_click_df_ = all_click_df.copy()\n", - "all_click_df_ = all_click_df_.merge(item_info_df, how='left', on='click_article_id')\n", - "user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict = get_user_hist_item_info_dict(all_click_df_)\n", - "click_article_ids_set = get_click_article_ids_set(all_click_df)\n", - "# 需要注意的是\n", - "# 这里使用了很多规则来筛选冷启动的文章,所以前面再召回的阶段就应该尽可能的多召回一些文章,否则很容易被删掉\n", - "cold_start_user_items_dict = cold_start_items(user_recall_items_dict, user_hist_item_typs_dict, user_hist_item_words_dict, \\\n", - " user_last_item_created_time_dict, item_type_dict, item_words_dict, \\\n", - " item_created_time_dict, click_article_ids_set, recall_item_num)\n", - "\n", - "user_multi_recall_dict['cold_start_recall'] = cold_start_user_items_dict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-16T07:13:33.099298Z", - "start_time": "2020-11-16T07:13:32.655036Z" - } - }, - "outputs": [], - "source": 
[] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 多路召回合并\n", - "多路召回合并就是将前面所有的召回策略得到的用户文章列表合并起来,下面是对前面所有召回结果的汇总\n", - "1. 基于itemcf计算的item之间的相似度sim进行的召回 \n", - "2. 基于embedding搜索得到的item之间的相似度进行的召回\n", - "3. YoutubeDNN召回\n", - "4. YoutubeDNN得到的user之间的相似度进行的召回\n", - "5. 基于冷启动策略的召回\n", - "\n", - "**注意:** \n", - "在做召回评估的时候就会发现有些召回的效果不错有些召回的效果很差,所以对每一路召回的结果,我们可以认为的定义一些权重,来做最终的相似度融合" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T07:02:16.033971Z", - "start_time": "2020-11-17T07:02:16.019819Z" - } - }, - "outputs": [], - "source": [ - "def combine_recall_results(user_multi_recall_dict, weight_dict=None, topk=25):\n", - " final_recall_items_dict = {}\n", - " \n", - " # 对每一种召回结果按照用户进行归一化,方便后面多种召回结果,相同用户的物品之间权重相加\n", - " def norm_user_recall_items_sim(sorted_item_list):\n", - " # 如果冷启动中没有文章或者只有一篇文章,直接返回,出现这种情况的原因可能是冷启动召回的文章数量太少了,\n", - " # 基于规则筛选之后就没有文章了, 这里还可以做一些其他的策略性的筛选\n", - " if len(sorted_item_list) < 2:\n", - " return sorted_item_list\n", - " \n", - " min_sim = sorted_item_list[-1][1]\n", - " max_sim = sorted_item_list[0][1]\n", - " \n", - " norm_sorted_item_list = []\n", - " for item, score in sorted_item_list:\n", - " if max_sim > 0:\n", - " norm_score = 1.0 * (score - min_sim) / (max_sim - min_sim) if max_sim > min_sim else 1.0\n", - " else:\n", - " norm_score = 0.0\n", - " norm_sorted_item_list.append((item, norm_score))\n", - " \n", - " return norm_sorted_item_list\n", - " \n", - " print('多路召回合并...')\n", - " for method, user_recall_items in tqdm(user_multi_recall_dict.items()):\n", - " print(method + '...')\n", - " # 在计算最终召回结果的时候,也可以为每一种召回结果设置一个权重\n", - " if weight_dict == None:\n", - " recall_method_weight = 1\n", - " else:\n", - " recall_method_weight = weight_dict[method]\n", - " \n", - " for user_id, sorted_item_list in user_recall_items.items(): # 进行归一化\n", - " user_recall_items[user_id] = norm_user_recall_items_sim(sorted_item_list)\n", - " \n", - " for 
user_id, sorted_item_list in user_recall_items.items():\n", - " # print('user_id')\n", - " final_recall_items_dict.setdefault(user_id, {})\n", - " for item, score in sorted_item_list:\n", - " final_recall_items_dict[user_id].setdefault(item, 0)\n", - " final_recall_items_dict[user_id][item] += recall_method_weight * score \n", - " \n", - " final_recall_items_dict_rank = {}\n", - " # 多路召回时也可以控制最终的召回数量\n", - " for user, recall_item_dict in final_recall_items_dict.items():\n", - " final_recall_items_dict_rank[user] = sorted(recall_item_dict.items(), key=lambda x: x[1], reverse=True)[:topk]\n", - "\n", - " # 将多路召回后的最终结果字典保存到本地\n", - " pickle.dump(final_recall_items_dict_rank, open(os.path.join(save_path, 'final_recall_items_dict.pkl'),'wb'))\n", - "\n", - " return final_recall_items_dict_rank" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T07:02:21.078455Z", - "start_time": "2020-11-17T07:02:21.074060Z" - } - }, - "outputs": [], - "source": [ - "# 这里直接对多路召回的权重给了一个相同的值,其实可以根据前面召回的情况来调整参数的值\n", - "weight_dict = {'itemcf_sim_itemcf_recall': 1.0,\n", - " 'embedding_sim_item_recall': 1.0,\n", - " 'youtubednn_recall': 1.0,\n", - " 'youtubednn_usercf_recall': 1.0, \n", - " 'cold_start_recall': 1.0}" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T07:04:35.747924Z", - "start_time": "2020-11-17T07:02:26.889573Z" - } - }, - "outputs": [ + "cells": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/5 [00:00\n", + "\n", + "上图只是一个多路召回的例子,也就是说可以使用多种不同的策略来获取用户排序的候选商品集合,而具体使用哪些召回策略其实是与业务强相关的 ,针对不同的任务就会有对于该业务真实场景下需要考虑的召回规则。例如新闻推荐,召回规则可以是“热门新闻”、“作者召回”、“关键词召回”、“主题召回“、”协同过滤召回“等等。 \n", + "\n" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "多路召回合并...\n", - "itemcf_sim_itemcf_recall...\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 导包" + ] }, { - "name": "stderr", - 
"output_type": "stream", - "text": [ - " 20%|██ | 1/5 [00:08<00:34, 8.66s/it]" - ] + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:29.834662Z", + "start_time": "2020-11-16T11:26:27.811511Z" + } + }, + "outputs": [], + "source": [ + "import pandas as pd \n", + "import numpy as np\n", + "from tqdm import tqdm \n", + "from collections import defaultdict \n", + "import os, math, warnings, math, pickle\n", + "from tqdm import tqdm\n", + "import faiss\n", + "import collections\n", + "import random\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from datetime import datetime\n", + "from deepctr.feature_column import SparseFeat, VarLenSparseFeat\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from tensorflow.python.keras import backend as K\n", + "from tensorflow.python.keras.models import Model\n", + "from tensorflow.python.keras.preprocessing.sequence import pad_sequences\n", + "\n", + "from deepmatch.models import *\n", + "from deepmatch.utils import sampledsoftmaxloss\n", + "warnings.filterwarnings('ignore')" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "embedding_sim_item_recall...\n" - ] + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:31.831215Z", + "start_time": "2020-11-16T11:26:31.826939Z" + } + }, + "outputs": [], + "source": [ + "data_path = './data_raw/'\n", + "save_path = './temp_results/'\n", + "# 做召回评估的一个标志, 如果不进行评估就是直接使用全量数据进行召回\n", + "metric_recall = False" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - " 40%|████ | 2/5 [00:16<00:24, 8.29s/it]" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取数据\n", + "在一般的rs比赛中读取数据部分主要分为三种模式, 不同的模式对应的不同的数据集:\n", + "1. 
debug模式: 这个的目的是帮助我们基于数据先搭建一个简易的baseline并跑通, 保证写的baseline代码没有什么问题。 由于推荐比赛的数据往往非常巨大, 如果一上来直接采用全部的数据进行分析,搭建baseline框架, 往往会带来时间和设备上的损耗, **所以这时候我们往往需要从海量数据的训练集中随机抽取一部分样本来进行调试(train_click_log_sample)**, 先跑通一个baseline。\n", + "2. 线下验证模式: 这个的目的是帮助我们在线下基于已有的训练集数据, 来选择好合适的模型和一些超参数。 **所以我们这一块只需要加载整个训练集(train_click_log)**, 然后把整个训练集再分成训练集和验证集。 训练集是模型的训练数据, 验证集部分帮助我们调整模型的参数和其他的一些超参数。\n", + "3. 线上模式: 我们用debug模式搭建起一个推荐系统比赛的baseline, 用线下验证模式选择好了模型和一些超参数, 这一部分就是真正的对于给定的测试集进行预测, 提交到线上, **所以这一块使用的训练数据集是全量的数据集(train_click_log+test_click_log)**\n", + "\n", + "下面就分别对这三种不同的数据读取模式先建立不同的代导入函数, 方便后面针对不同的模式下导入数据。" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "youtubednn_recall...\n", - "youtubednn_usercf_recall...\n" - ] + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:34.476240Z", + "start_time": "2020-11-16T11:26:34.467352Z" + } + }, + "outputs": [], + "source": [ + "# debug模式: 从训练集中划出一部分数据来调试代码\n", + "def get_all_click_sample(data_path, sample_nums=10000):\n", + " \"\"\"\n", + " 训练集中采样一部分数据调试\n", + " data_path: 原数据的存储路径\n", + " sample_nums: 采样数目(这里由于机器的内存限制,可以采样用户做)\n", + " \"\"\"\n", + " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " all_user_ids = all_click.user_id.unique()\n", + "\n", + " sample_user_ids = np.random.choice(all_user_ids, size=sample_nums, replace=False) \n", + " all_click = all_click[all_click['user_id'].isin(sample_user_ids)]\n", + " \n", + " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", + " return all_click\n", + "\n", + "# 读取点击数据,这里分成线上和线下,如果是为了获取线上提交结果应该讲测试集中的点击数据合并到总的数据中\n", + "# 如果是为了线下验证模型的有效性或者特征的有效性,可以只使用训练集\n", + "def get_all_click_df(data_path='./data_raw/', offline=True):\n", + " if offline:\n", + " all_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " else:\n", + " trn_click = pd.read_csv(data_path + 'train_click_log.csv')\n", + " tst_click = pd.read_csv(data_path + 
'testA_click_log.csv')\n", + "\n", + " all_click = trn_click.append(tst_click)\n", + " \n", + " all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))\n", + " return all_click" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - " 80%|████████ | 4/5 [00:23<00:06, 6.98s/it]" - ] + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:35.168738Z", + "start_time": "2020-11-16T11:26:35.163210Z" + } + }, + "outputs": [], + "source": [ + "# 读取文章的基本属性\n", + "def get_item_info_df(data_path):\n", + " item_info_df = pd.read_csv(data_path + 'articles.csv')\n", + " \n", + " # 为了方便与训练集中的click_article_id拼接,需要把article_id修改成click_article_id\n", + " item_info_df = item_info_df.rename(columns={'article_id': 'click_article_id'})\n", + " \n", + " return item_info_df" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "cold_start_recall...\n" - ] + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:36.152958Z", + "start_time": "2020-11-16T11:26:36.146324Z" + } + }, + "outputs": [], + "source": [ + "# 读取文章的Embedding数据\n", + "def get_item_emb_dict(data_path):\n", + " item_emb_df = pd.read_csv(data_path + 'articles_emb.csv')\n", + " \n", + " item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]\n", + " item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])\n", + " # 进行归一化\n", + " item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)\n", + "\n", + " item_emb_dict = dict(zip(item_emb_df['article_id'], item_emb_np))\n", + " pickle.dump(item_emb_dict, open(save_path + 'item_content_emb.pkl', 'wb'))\n", + " \n", + " return item_emb_dict" + ] }, { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:42<00:00, 8.40s/it]\n" - ] - } - ], - "source": [ - "# 最终合并之后每个用户召回150个商品进行排序\n", - "final_recall_items_dict_rank = 
combine_recall_results(user_multi_recall_dict, weight_dict, topk=150)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 总结\n", - "\n", - "上述实现了如下召回策略:\n", - "\n", - "1. 基于关联规则的itemcf\n", - "2. 基于关联规则的usercf\n", - "3. youtubednn召回\n", - "4. 冷启动召回\n", - "\n", - "对于上述实现的召回策略其实都不是最优的结果,我们只是做了个简单的尝试,其中还有很多地方可以优化,包括已经实现的这些召回策略的参数或者新加一些,修改一些关联规则都可以。当然还可以尝试更多的召回策略,比如对新闻进行热度召回等等。\n", - "\n", - "\n", - "\n", - "**关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", - "\n", - "![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - }, - "latex_envs": { - "LaTeX_envs_menu_present": true, - "autoclose": false, - "autocomplete": true, - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 1, - "hotkeys": { - "equation": "Ctrl-E", - "itemize": "Ctrl-I" - }, - "labels_anchors": false, - "latex_user_defs": false, - "report_style_numbering": false, - "user_envs_cfg": false - }, - "nbTranslate": { - "displayLangs": [ - "*" - ], - "hotkey": "alt-t", - "langInMainMenu": true, - "sourceLang": "en", - "targetLang": "fr", - "useGoogleTranslate": true - }, - "tianchi_metadata": { - "competitions": [], - "datasets": [ + "cell_type": "code", + 
"execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:37.333536Z", + "start_time": "2020-11-16T11:26:37.329545Z" + } + }, + "outputs": [], + "source": [ + "max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:42.163494Z", + "start_time": "2020-11-16T11:26:38.018094Z" + } + }, + "outputs": [], + "source": [ + "# 采样数据\n", + "# all_click_df = get_all_click_sample(data_path)\n", + "\n", + "# 全量训练集\n", + "all_click_df = get_all_click_df(offline=False)\n", + "\n", + "# 对时间戳进行归一化,用于在关联规则的时候计算权重\n", + "all_click_df['click_timestamp'] = all_click_df[['click_timestamp']].apply(max_min_scaler)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:26:44.343500Z", + "start_time": "2020-11-16T11:26:44.113891Z" + } + }, + "outputs": [], + "source": [ + "item_info_df = get_item_info_df(data_path)" + ] + }, { - "id": "83580", - "title": "零基础入门推荐系统 - 新闻推荐" + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:24.295343Z", + "start_time": "2020-11-16T11:26:44.398007Z" + } + }, + "outputs": [], + "source": [ + "item_emb_dict = get_item_emb_dict(data_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 工具函数" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取用户-文章-时间函数\n", + "这个在基于关联规则的用户协同过滤的时候会用到" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:33.791656Z", + "start_time": "2020-11-16T11:27:33.784305Z" + } + }, + "outputs": [], + "source": [ + "# 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}\n", + "def get_user_item_time(click_df):\n", + " \n", + " click_df = click_df.sort_values('click_timestamp')\n", + " \n", + " def 
make_item_time_pair(df):\n", + " return list(zip(df['click_article_id'], df['click_timestamp']))\n", + " \n", + " user_item_time_df = click_df.groupby('user_id')['click_article_id', 'click_timestamp'].apply(lambda x: make_item_time_pair(x))\\\n", + " .reset_index().rename(columns={0: 'item_time_list'})\n", + " user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))\n", + " \n", + " return user_item_time_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取文章-用户-时间函数\n", + "这个在基于关联规则的文章协同过滤的时候会用到" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:38.327581Z", + "start_time": "2020-11-16T11:27:38.321059Z" + } + }, + "outputs": [], + "source": [ + "# 根据时间获取商品被点击的用户序列 {item1: [(user1, time1), (user2, time2)...]...}\n", + "# 这里的时间是用户点击当前商品的时间,好像没有直接的关系。\n", + "def get_item_user_time_dict(click_df):\n", + " def make_user_time_pair(df):\n", + " return list(zip(df['user_id'], df['click_timestamp']))\n", + " \n", + " click_df = click_df.sort_values('click_timestamp')\n", + " item_user_time_df = click_df.groupby('click_article_id')['user_id', 'click_timestamp'].apply(lambda x: make_user_time_pair(x))\\\n", + " .reset_index().rename(columns={0: 'user_time_list'})\n", + " \n", + " item_user_time_dict = dict(zip(item_user_time_df['click_article_id'], item_user_time_df['user_time_list']))\n", + " return item_user_time_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取历史和最后一次点击\n", + "这个在评估召回结果, 特征工程和制作标签转成监督学习测试集的时候回用到" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:50.894683Z", + "start_time": "2020-11-16T11:27:50.888002Z" + } + }, + "outputs": [], + "source": [ + "# 获取当前数据的历史点击和最后一次点击\n", + "def get_hist_and_last_click(all_click):\n", + " \n", + " all_click = all_click.sort_values(by=['user_id', 
'click_timestamp'])\n", + " click_last_df = all_click.groupby('user_id').tail(1)\n", + "\n", + " # 如果用户只有一个点击,hist为空了,会导致训练的时候这个用户不可见,此时默认泄露一下\n", + " def hist_func(user_df):\n", + " if len(user_df) == 1:\n", + " return user_df\n", + " else:\n", + " return user_df[:-1]\n", + "\n", + " click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)\n", + "\n", + " return click_hist_df, click_last_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取文章属性特征" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:55.893810Z", + "start_time": "2020-11-16T11:27:55.887623Z" + } + }, + "outputs": [], + "source": [ + "# 获取文章id对应的基本属性,保存成字典的形式,方便后面召回阶段,冷启动阶段直接使用\n", + "def get_item_info_dict(item_info_df):\n", + " max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))\n", + " item_info_df['created_at_ts'] = item_info_df[['created_at_ts']].apply(max_min_scaler)\n", + " \n", + " item_type_dict = dict(zip(item_info_df['click_article_id'], item_info_df['category_id']))\n", + " item_words_dict = dict(zip(item_info_df['click_article_id'], item_info_df['words_count']))\n", + " item_created_time_dict = dict(zip(item_info_df['click_article_id'], item_info_df['created_at_ts']))\n", + " \n", + " return item_type_dict, item_words_dict, item_created_time_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-13T06:42:38.730939Z", + "start_time": "2020-11-13T06:42:38.728461Z" + } + }, + "source": [ + "### 获取用户历史点击的文章信息" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:27:59.650781Z", + "start_time": "2020-11-16T11:27:59.640572Z" + } + }, + "outputs": [], + "source": [ + "def get_user_hist_item_info_dict(all_click):\n", + " \n", + " # 获取user_id对应的用户历史点击文章类型的集合字典\n", + " user_hist_item_typs = 
all_click.groupby('user_id')['category_id'].agg(set).reset_index()\n", + " user_hist_item_typs_dict = dict(zip(user_hist_item_typs['user_id'], user_hist_item_typs['category_id']))\n", + " \n", + " # 获取user_id对应的用户点击文章的集合\n", + " user_hist_item_ids_dict = all_click.groupby('user_id')['click_article_id'].agg(set).reset_index()\n", + " user_hist_item_ids_dict = dict(zip(user_hist_item_ids_dict['user_id'], user_hist_item_ids_dict['click_article_id']))\n", + " \n", + " # 获取user_id对应的用户历史点击的文章的平均字数字典\n", + " user_hist_item_words = all_click.groupby('user_id')['words_count'].agg('mean').reset_index()\n", + " user_hist_item_words_dict = dict(zip(user_hist_item_words['user_id'], user_hist_item_words['words_count']))\n", + " \n", + " # 获取user_id对应的用户最后一次点击的文章的创建时间\n", + " all_click_ = all_click.sort_values('click_timestamp')\n", + " user_last_item_created_time = all_click_.groupby('user_id')['created_at_ts'].apply(lambda x: x.iloc[-1]).reset_index()\n", + " \n", + " max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))\n", + " user_last_item_created_time['created_at_ts'] = user_last_item_created_time[['created_at_ts']].apply(max_min_scaler)\n", + " \n", + " user_last_item_created_time_dict = dict(zip(user_last_item_created_time['user_id'], \\\n", + " user_last_item_created_time['created_at_ts']))\n", + " \n", + " return user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 获取点击次数最多的topk个文章" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:28:04.761105Z", + "start_time": "2020-11-16T11:28:04.756419Z" + } + }, + "outputs": [], + "source": [ + "# 获取近期点击最多的文章\n", + "def get_item_topk_click(click_df, k):\n", + " topk_click = click_df['click_article_id'].value_counts().index[:k]\n", + " return topk_click" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, 
+ "source": [ + "## 定义多路召回字典" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:28:08.321506Z", + "start_time": "2020-11-16T11:28:07.623281Z" + } + }, + "outputs": [], + "source": [ + "# 获取文章的属性信息,保存成字典的形式方便查询\n", + "item_type_dict, item_words_dict, item_created_time_dict = get_item_info_dict(item_info_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:28:13.791569Z", + "start_time": "2020-11-16T11:28:13.786522Z" + } + }, + "outputs": [], + "source": [ + "# 定义一个多路召回的字典,将各路召回的结果都保存在这个字典当中\n", + "user_multi_recall_dict = {'itemcf_sim_itemcf_recall': {},\n", + " 'embedding_sim_item_recall': {},\n", + " 'youtubednn_recall': {},\n", + " 'youtubednn_usercf_recall': {}, \n", + " 'cold_start_recall': {}}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T05:41:12.710754Z", + "start_time": "2020-11-16T05:40:57.842614Z" + } + }, + "outputs": [], + "source": [ + "# 提取最后一次点击作为召回评估,如果不需要做召回评估直接使用全量的训练集进行召回(线下验证模型)\n", + "# 如果不是召回评估,直接使用全量数据进行召回,不用将最后一次提取出来\n", + "trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 召回效果评估函数\n", + "做完了召回有时候也需要对当前的召回方法或者参数进行调整以达到更好的召回效果,因为召回的结果决定了最终排序的上限,下面也会提供一个召回评估的方法" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T05:41:18.579118Z", + "start_time": "2020-11-16T05:41:18.571887Z" + } + }, + "outputs": [], + "source": [ + "# 依次评估召回的前10, 20, 30, 40, 50个文章中的击中率\n", + "def metrics_recall(user_recall_items_dict, trn_last_click_df, topk=5):\n", + " last_click_item_dict = dict(zip(trn_last_click_df['user_id'], trn_last_click_df['click_article_id']))\n", + " user_num = len(user_recall_items_dict)\n", + " \n", + " for k in range(10, topk+1, 10):\n", + " 
hit_num = 0\n", + " for user, item_list in user_recall_items_dict.items():\n", + " # 获取前k个召回的结果\n", + " tmp_recall_items = [x[0] for x in user_recall_items_dict[user][:k]]\n", + " if last_click_item_dict[user] in set(tmp_recall_items):\n", + " hit_num += 1\n", + " \n", + " hit_rate = round(hit_num * 1.0 / user_num, 5)\n", + " print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 计算相似性矩阵\n", + "\n", + "这一部分主要是通过协同过滤以及向量检索得到相似性矩阵,相似性矩阵主要分为user2user和item2item,下面依次获取基于itemcf的item2item的相似性矩阵," + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### itemcf i2i_sim\n", + "\n", + "借鉴KDD2020的去偏商品推荐,在计算item2item相似性矩阵时,使用关联规则,使得计算的文章的相似性还考虑到了:\n", + "1. 用户点击的时间权重\n", + "2. 用户点击的顺序权重\n", + "3. 文章创建的时间权重" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:30:51.872262Z", + "start_time": "2020-11-16T11:30:51.860099Z" + } + }, + "outputs": [], + "source": [ + "def itemcf_sim(df, item_created_time_dict):\n", + " \"\"\"\n", + " 文章与文章之间的相似性矩阵计算\n", + " :param df: 数据表\n", + " :item_created_time_dict: 文章创建时间的字典\n", + " return : 文章与文章的相似性矩阵\n", + " \n", + " 思路: 基于物品的协同过滤(详细请参考上一期推荐系统基础的组队学习) + 关联规则\n", + " \"\"\"\n", + " \n", + " user_item_time_dict = get_user_item_time(df)\n", + " \n", + " # 计算物品相似度\n", + " i2i_sim = {}\n", + " item_cnt = defaultdict(int)\n", + " for user, item_time_list in tqdm(user_item_time_dict.items()):\n", + " # 在基于商品的协同过滤优化的时候可以考虑时间因素\n", + " for loc1, (i, i_click_time) in enumerate(item_time_list):\n", + " item_cnt[i] += 1\n", + " i2i_sim.setdefault(i, {})\n", + " for loc2, (j, j_click_time) in enumerate(item_time_list):\n", + " if(i == j):\n", + " continue\n", + " \n", + " # 考虑文章的正向顺序点击和反向顺序点击 \n", + " loc_alpha = 1.0 if loc2 > loc1 else 0.7\n", + " # 位置信息权重,其中的参数可以调节\n", + " loc_weight = loc_alpha * (0.9 ** (np.abs(loc2 - loc1) - 1))\n", + " 
# 点击时间权重,其中的参数可以调节\n", + " click_time_weight = np.exp(0.7 ** np.abs(i_click_time - j_click_time))\n", + " # 两篇文章创建时间的权重,其中的参数可以调节\n", + " created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))\n", + " i2i_sim[i].setdefault(j, 0)\n", + " # 考虑多种因素的权重计算最终的文章之间的相似度\n", + " i2i_sim[i][j] += loc_weight * click_time_weight * created_time_weight / math.log(len(item_time_list) + 1)\n", + " \n", + " i2i_sim_ = i2i_sim.copy()\n", + " for i, related_items in i2i_sim.items():\n", + " for j, wij in related_items.items():\n", + " i2i_sim_[i][j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])\n", + " \n", + " # 将得到的相似性矩阵保存到本地\n", + " pickle.dump(i2i_sim_, open(save_path + 'itemcf_i2i_sim.pkl', 'wb'))\n", + " \n", + " return i2i_sim_" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T11:47:09.937002Z", + "start_time": "2020-11-16T11:30:57.394334Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [14:20<00:00, 290.38it/s]\n" + ] + } + ], + "source": [ + "i2i_sim = itemcf_sim(all_click_df, item_created_time_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### usercf u2u_sim\n", + "\n", + "在计算用户之间的相似度的时候,也可以使用一些简单的关联规则,比如用户活跃度权重,这里将用户的点击次数作为用户活跃度的指标" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T09:11:14.951940Z", + "start_time": "2020-11-16T09:11:14.945654Z" + } + }, + "outputs": [], + "source": [ + "def get_user_activate_degree_dict(all_click_df):\n", + " all_click_df_ = all_click_df.groupby('user_id')['click_article_id'].count().reset_index()\n", + " \n", + " # 用户活跃度归一化\n", + " mm = MinMaxScaler()\n", + " all_click_df_['click_article_id'] = mm.fit_transform(all_click_df_[['click_article_id']])\n", + " user_activate_degree_dict = dict(zip(all_click_df_['user_id'], 
all_click_df_['click_article_id']))\n", + " \n", + " return user_activate_degree_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T09:11:19.879276Z", + "start_time": "2020-11-16T09:11:19.868808Z" + } + }, + "outputs": [], + "source": [ + "def usercf_sim(all_click_df, user_activate_degree_dict):\n", + " \"\"\"\n", + " 用户相似性矩阵计算\n", + " :param all_click_df: 数据表\n", + " :param user_activate_degree_dict: 用户活跃度的字典\n", + " return 用户相似性矩阵\n", + " \n", + " 思路: 基于用户的协同过滤(详细请参考上一期推荐系统基础的组队学习) + 关联规则\n", + " \"\"\"\n", + " item_user_time_dict = get_item_user_time_dict(all_click_df)\n", + " \n", + " u2u_sim = {}\n", + " user_cnt = defaultdict(int)\n", + " for item, user_time_list in tqdm(item_user_time_dict.items()):\n", + " for u, click_time in user_time_list:\n", + " user_cnt[u] += 1\n", + " u2u_sim.setdefault(u, {})\n", + " for v, click_time in user_time_list:\n", + " u2u_sim[u].setdefault(v, 0)\n", + " if u == v:\n", + " continue\n", + " # 用户平均活跃度作为活跃度的权重,这里的式子也可以改善\n", + " activate_weight = 100 * 0.5 * (user_activate_degree_dict[u] + user_activate_degree_dict[v]) \n", + " u2u_sim[u][v] += activate_weight / math.log(len(user_time_list) + 1)\n", + " \n", + " u2u_sim_ = u2u_sim.copy()\n", + " for u, related_users in u2u_sim.items():\n", + " for v, wij in related_users.items():\n", + " u2u_sim_[u][v] = wij / math.sqrt(user_cnt[u] * user_cnt[v])\n", + " \n", + " # 将得到的相似性矩阵保存到本地\n", + " pickle.dump(u2u_sim_, open(save_path + 'usercf_u2u_sim.pkl', 'wb'))\n", + "\n", + " return u2u_sim_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T06:59:46.701572Z", + "start_time": "2020-11-16T06:59:26.852246Z" + } + }, + "outputs": [], + "source": [ + "# 由于usercf计算时候太耗费内存了,这里就不直接运行了\n", + "# 如果是采样的话,是可以运行的\n", + "user_activate_degree_dict = get_user_activate_degree_dict(all_click_df)\n", + "u2u_sim = usercf_sim(all_click_df, 
user_activate_degree_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### item embedding sim\n", + "\n", + "使用Embedding计算item之间的相似度是为了后续冷启动的时候可以获取未出现在点击数据中的文章,后面有对冷启动专门的介绍,这里简单的说一下faiss。\n", + "\n", + "Faiss是Facebook的AI团队开源的一套用于做聚类或者相似性搜索的软件库,底层是用C++实现。Faiss因为超级优越的性能,被广泛应用于推荐相关的业务当中.\n", + "\n", + "faiss工具包一般使用在推荐系统中的向量召回部分。在做向量召回的时候要么是u2u,u2i或者i2i,这里的u和i指的是user和item.我们知道在实际的场景中user和item的数量都是海量的,我们最容易想到的基于向量相似度的召回就是使用两层循环遍历user列表或者item列表计算两个向量的相似度,但是这样做在面对海量数据是不切实际的,faiss就是用来加速计算某个查询向量最相似的topk个索引向量。\n", + "\n", + "**faiss查询的原理:**\n", + "\n", + "faiss使用了PCA和PQ(Product quantization乘积量化)两种技术进行向量压缩和编码,当然还使用了其他的技术进行优化,但是PCA和PQ是其中最核心部分。\n", + "\n", + "1. PCA降维算法细节参考下面这个链接进行学习 \n", + "[主成分分析(PCA)原理总结](https://www.cnblogs.com/pinard/p/6239403.html) \n", + "\n", + "2. PQ编码的细节参考下面这个链接进行学习 \n", + "[实例理解product quantization算法](http://www.fabwrite.com/productquantization)\n", + "\n", + "**faiss使用**\n", + "\n", + "[faiss官方教程](https://github.com/facebookresearch/faiss/wiki/Getting-started)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T09:11:28.631803Z", + "start_time": "2020-11-16T09:11:28.619926Z" + } + }, + "outputs": [], + "source": [ + "# 向量检索相似度计算\n", + "# topk指的是每个item, faiss搜索后返回最相似的topk个item\n", + "def embdding_sim(click_df, item_emb_df, save_path, topk):\n", + " \"\"\"\n", + " 基于内容的文章embedding相似性矩阵计算\n", + " :param click_df: 数据表\n", + " :param item_emb_df: 文章的embedding\n", + " :param save_path: 保存路径\n", + " :patam topk: 找最相似的topk篇\n", + " return 文章相似性矩阵\n", + " \n", + " 思路: 对于每一篇文章, 基于embedding的相似性返回topk个与其最相似的文章, 只不过由于文章数量太多,这里用了faiss进行加速\n", + " \"\"\"\n", + " \n", + " # 文章索引与文章id的字典映射\n", + " item_idx_2_rawid_dict = dict(zip(item_emb_df.index, item_emb_df['article_id']))\n", + " \n", + " item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]\n", +
" item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols].values, dtype=np.float32)\n", + " # 向量进行单位化\n", + " item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)\n", + " \n", + " # 建立faiss索引\n", + " item_index = faiss.IndexFlatIP(item_emb_np.shape[1])\n", + " item_index.add(item_emb_np)\n", + " # 相似度查询,给每个索引位置上的向量返回topk个item以及相似度\n", + " sim, idx = item_index.search(item_emb_np, topk) # 返回的是列表\n", + " \n", + " # 将向量检索的结果保存成原始id的对应关系\n", + " item_sim_dict = collections.defaultdict(dict)\n", + " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(range(len(item_emb_np)), sim, idx)):\n", + " target_raw_id = item_idx_2_rawid_dict[target_idx]\n", + " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", + " for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): \n", + " rele_raw_id = item_idx_2_rawid_dict[rele_idx]\n", + " item_sim_dict[target_raw_id][rele_raw_id] = item_sim_dict.get(target_raw_id, {}).get(rele_raw_id, 0) + sim_value\n", + " \n", + " # 保存i2i相似度矩阵\n", + " pickle.dump(item_sim_dict, open(save_path + 'emb_i2i_sim.pkl', 'wb')) \n", + " \n", + " return item_sim_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T09:32:35.926116Z", + "start_time": "2020-11-16T09:11:44.586967Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "364047it [00:23, 15292.14it/s]\n" + ] + } + ], + "source": [ + "item_emb_df = pd.read_csv(data_path + '/articles_emb.csv')\n", + "emb_i2i_sim = embdding_sim(all_click_df, item_emb_df, save_path, topk=10) # topk可以自行设置" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 召回\n", + "这个就是我们开篇提到的那个问题, 面的36万篇文章, 20多万用户的推荐, 我们又有哪些策略来缩减问题的规模? 
我们就可以在召回阶段筛选出用户对于点击文章的候选集合, 从而降低问题的规模。召回常用的策略:\n", + "* Youtube DNN 召回\n", + "* 基于文章的召回\n", + " * 文章的协同过滤\n", + " * 基于文章embedding的召回\n", + "* 基于用户的召回\n", + " * 用户的协同过滤\n", + " * 用户embedding\n", + "\n", + "上面的各种召回方式一部分在基于用户已经看过的文章的基础上去召回与这些文章相似的一些文章, 而这个相似性的计算方式不同, 就得到了不同的召回方式, 比如文章的协同过滤, 文章内容的embedding等。还有一部分是根据用户的相似性进行推荐,对于某用户推荐与其相似的其他用户看过的文章,比如用户的协同过滤和用户embedding。 还有一种思路是类似矩阵分解的思路,先计算出用户和文章的embedding之后,就可以直接算用户和文章的相似度, 根据这个相似度进行推荐, 比如YouTube DNN。 我们下面详细来看一下每一个召回方法:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### YoutubeDNN召回\n", + "**(这一步是直接获取用户召回的候选文章列表)**\n", + "\n", + "[论文下载地址](https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/45530.pdf)\n", + "\n", + "**Youtubednn召回架构**\n", + "\n", + "![image-20201111160516562](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201111160516562.png)\n", + "\n", + "\n", + "\n", + "关于YoutubeDNN原理和应用推荐看王喆的两篇博客:\n", + "\n", + "1. [重读Youtube深度学习推荐系统论文,字字珠玑,惊为神文](https://zhuanlan.zhihu.com/p/52169807)\n", + "2. [YouTube深度学习推荐系统的十大工程问题](https://zhuanlan.zhihu.com/p/52504407)\n", + "\n", + "\n", + "**参考文献:**\n", + "1. https://zhuanlan.zhihu.com/p/52169807 (YouTubeDNN原理)\n", + "2. 
https://zhuanlan.zhihu.com/p/26306795 (Word2Vec知乎众赞文章) --- word2vec放到排序中的w2v的介绍部分\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:13:11.058766Z", + "start_time": "2020-11-16T10:13:11.041084Z" + } + }, + "outputs": [], + "source": [ + "# 获取双塔召回时的训练验证数据\n", + "# negsample指的是通过滑窗构建样本的时候,负样本的数量\n", + "def gen_data_set(data, negsample=0):\n", + " data.sort_values(\"click_timestamp\", inplace=True)\n", + " item_ids = data['click_article_id'].unique()\n", + "\n", + " train_set = []\n", + " test_set = []\n", + " for reviewerID, hist in tqdm(data.groupby('user_id')):\n", + " pos_list = hist['click_article_id'].tolist()\n", + " \n", + " if negsample > 0:\n", + " candidate_set = list(set(item_ids) - set(pos_list)) # 用户没看过的文章里面选择负样本\n", + " neg_list = np.random.choice(candidate_set,size=len(pos_list)*negsample,replace=True) # 对于每个正样本,选择n个负样本\n", + " \n", + " # 长度只有一个的时候,需要把这条数据也放到训练集中,不然的话最终学到的embedding就会有缺失\n", + " if len(pos_list) == 1:\n", + " train_set.append((reviewerID, [pos_list[0]], pos_list[0],1,len(pos_list)))\n", + " test_set.append((reviewerID, [pos_list[0]], pos_list[0],1,len(pos_list)))\n", + " \n", + " # 滑窗构造正负样本\n", + " for i in range(1, len(pos_list)):\n", + " hist = pos_list[:i]\n", + " \n", + " if i != len(pos_list) - 1:\n", + " train_set.append((reviewerID, hist[::-1], pos_list[i], 1, len(hist[::-1]))) # 正样本 [user_id, his_item, pos_item, label, len(his_item)]\n", + " for negi in range(negsample):\n", + " train_set.append((reviewerID, hist[::-1], neg_list[i*negsample+negi], 0,len(hist[::-1]))) # 负样本 [user_id, his_item, neg_item, label, len(his_item)]\n", + " else:\n", + " # 将最长的那一个序列长度作为测试数据\n", + " test_set.append((reviewerID, hist[::-1], pos_list[i],1,len(hist[::-1])))\n", + " \n", + " random.shuffle(train_set)\n", + " random.shuffle(test_set)\n", + " \n", + " return train_set, test_set\n", + "\n", + "# 将输入的数据进行padding,使得序列特征的长度都一致\n", + "def 
gen_model_input(train_set,user_profile,seq_max_len):\n", + "\n", + " train_uid = np.array([line[0] for line in train_set])\n", + " train_seq = [line[1] for line in train_set]\n", + " train_iid = np.array([line[2] for line in train_set])\n", + " train_label = np.array([line[3] for line in train_set])\n", + " train_hist_len = np.array([line[4] for line in train_set])\n", + "\n", + " train_seq_pad = pad_sequences(train_seq, maxlen=seq_max_len, padding='post', truncating='post', value=0)\n", + " train_model_input = {\"user_id\": train_uid, \"click_article_id\": train_iid, \"hist_article_id\": train_seq_pad,\n", + " \"hist_len\": train_hist_len}\n", + "\n", + " return train_model_input, train_label" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:13:18.124452Z", + "start_time": "2020-11-16T10:13:18.098284Z" + } + }, + "outputs": [], + "source": [ + "def youtubednn_u2i_dict(data, topk=20): \n", + " sparse_features = [\"click_article_id\", \"user_id\"]\n", + " SEQ_LEN = 30 # 用户点击序列的长度,短的填充,长的截断\n", + " \n", + " user_profile_ = data[[\"user_id\"]].drop_duplicates('user_id')\n", + " item_profile_ = data[[\"click_article_id\"]].drop_duplicates('click_article_id') \n", + " \n", + " # 类别编码\n", + " features = [\"click_article_id\", \"user_id\"]\n", + " feature_max_idx = {}\n", + " \n", + " for feature in features:\n", + " lbe = LabelEncoder()\n", + " data[feature] = lbe.fit_transform(data[feature])\n", + " feature_max_idx[feature] = data[feature].max() + 1\n", + " \n", + " # 提取user和item的画像,这里具体选择哪些特征还需要进一步的分析和考虑\n", + " user_profile = data[[\"user_id\"]].drop_duplicates('user_id')\n", + " item_profile = data[[\"click_article_id\"]].drop_duplicates('click_article_id') \n", + " \n", + " user_index_2_rawid = dict(zip(user_profile['user_id'], user_profile_['user_id']))\n", + " item_index_2_rawid = dict(zip(item_profile['click_article_id'], item_profile_['click_article_id']))\n", + " \n", + " # 划分训练和测试集\n", 
+ " # 由于深度学习需要的数据量通常都是非常大的,所以为了保证召回的效果,往往会通过滑窗的形式扩充训练样本\n", + " train_set, test_set = gen_data_set(data, 0)\n", + " # 整理输入数据,具体的操作可以看上面的函数\n", + " train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)\n", + " test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)\n", + " \n", + " # 确定Embedding的维度\n", + " embedding_dim = 16\n", + " \n", + " # 将数据整理成模型可以直接输入的形式\n", + " user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),\n", + " VarLenSparseFeat(SparseFeat('hist_article_id', feature_max_idx['click_article_id'], embedding_dim,\n", + " embedding_name=\"click_article_id\"), SEQ_LEN, 'mean', 'hist_len'),]\n", + " item_feature_columns = [SparseFeat('click_article_id', feature_max_idx['click_article_id'], embedding_dim)]\n", + " \n", + " # 模型的定义 \n", + " # num_sampled: 负采样时的样本数量\n", + " model = YoutubeDNN(user_feature_columns, item_feature_columns, num_sampled=5, user_dnn_hidden_units=(64, embedding_dim))\n", + " # 模型编译\n", + " model.compile(optimizer=\"adam\", loss=sampledsoftmaxloss) \n", + " \n", + " # 模型训练,这里可以定义验证集的比例,如果设置为0的话就是全量数据直接进行训练\n", + " history = model.fit(train_model_input, train_label, batch_size=256, epochs=1, verbose=1, validation_split=0.0)\n", + " \n", + " # 训练完模型之后,提取训练的Embedding,包括user端和item端\n", + " test_user_model_input = test_model_input\n", + " all_item_model_input = {\"click_article_id\": item_profile['click_article_id'].values}\n", + "\n", + " user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)\n", + " item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)\n", + " \n", + " # 保存当前的item_embedding 和 user_embedding 排序的时候可能能够用到,但是需要注意保存的时候需要和原始的id对应\n", + " user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)\n", + " item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)\n", + " \n", + " # embedding保存之前归一化一下\n", + " user_embs = user_embs / 
np.linalg.norm(user_embs, axis=1, keepdims=True)\n", + " item_embs = item_embs / np.linalg.norm(item_embs, axis=1, keepdims=True)\n", + " \n", + " # 将Embedding转换成字典的形式方便查询\n", + " raw_user_id_emb_dict = {user_index_2_rawid[k]: \\\n", + " v for k, v in zip(user_profile['user_id'], user_embs)}\n", + " raw_item_id_emb_dict = {item_index_2_rawid[k]: \\\n", + " v for k, v in zip(item_profile['click_article_id'], item_embs)}\n", + " # 将Embedding保存到本地\n", + " pickle.dump(raw_user_id_emb_dict, open(save_path + 'user_youtube_emb.pkl', 'wb'))\n", + " pickle.dump(raw_item_id_emb_dict, open(save_path + 'item_youtube_emb.pkl', 'wb'))\n", + " \n", + " # faiss紧邻搜索,通过user_embedding 搜索与其相似性最高的topk个item\n", + " index = faiss.IndexFlatIP(embedding_dim)\n", + " # 上面已经进行了归一化,这里可以不进行归一化了\n", + "# faiss.normalize_L2(user_embs)\n", + "# faiss.normalize_L2(item_embs)\n", + " index.add(item_embs) # 将item向量构建索引\n", + " sim, idx = index.search(np.ascontiguousarray(user_embs), topk) # 通过user去查询最相似的topk个item\n", + " \n", + " user_recall_items_dict = collections.defaultdict(dict)\n", + " for target_idx, sim_value_list, rele_idx_list in tqdm(zip(test_user_model_input['user_id'], sim, idx)):\n", + " target_raw_id = user_index_2_rawid[target_idx]\n", + " # 从1开始是为了去掉商品本身, 所以最终获得的相似商品只有topk-1\n", + " for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]): \n", + " rele_raw_id = item_index_2_rawid[rele_idx]\n", + " user_recall_items_dict[target_raw_id][rele_raw_id] = user_recall_items_dict.get(target_raw_id, {})\\\n", + " .get(rele_raw_id, 0) + sim_value\n", + " \n", + " user_recall_items_dict = {k: sorted(v.items(), key=lambda x: x[1], reverse=True) for k, v in user_recall_items_dict.items()}\n", + " # 将召回的结果进行排序\n", + " \n", + " # 保存召回的结果\n", + " # 这里是直接通过向量的方式得到了召回结果,相比于上面的召回方法,上面的只是得到了i2i及u2u的相似性矩阵,还需要进行协同过滤召回才能得到召回结果\n", + " # 可以直接对这个召回结果进行评估,为了方便可以统一写一个评估函数对所有的召回结果进行评估\n", + " pickle.dump(user_recall_items_dict, open(save_path + 'youtube_u2i_dict.pkl', 'wb'))\n", + " return 
user_recall_items_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T10:21:46.420014Z", + "start_time": "2020-11-16T10:13:35.351131Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [02:02<00:00, 2038.57it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:143: calling RandomNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:253: calling reduce_sum_v1 (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "keep_dims is deprecated, use keepdims instead\n", + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:253: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Deprecated in favor of operator or tf.math.divide.\n", + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1288: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "1149673/1149673 [==============================] - 216s 188us/sample - loss: 0.1326\n" + ] + }, + { + "name": "stderr", + 
# Item-based (i2i) recall
def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num,
                         item_topk_click, item_created_time_dict, emb_i2i_sim):
    """
    Recall articles for one user with item-based collaborative filtering.

    :param user_id: id of the user to recall for
    :param user_item_time_dict: dict, click sequence per user ordered by click time,
                                {user1: [(item1, time1), (item2, time2)..]...}
    :param i2i_sim: dict of dicts, article-to-article similarity {item_i: {item_j: sim}}
    :param sim_item_topk: int, number of most similar articles expanded per history article
    :param recall_item_num: int, number of articles returned
    :param item_topk_click: list, most-clicked articles, used to pad a short recall list
    :param item_created_time_dict: dict, article id -> creation time
    :param emb_i2i_sim: dict of dicts, content-embedding similarity between articles
                        (only a subset of pairs is present, missing pairs add nothing)

    :return: list of recalled articles [(item1, score1), (item2, score2)...], best first
    :raises KeyError: if the user, a history article in ``i2i_sim`` or an article creation
                      time is missing from the corresponding dict
    """
    # Articles the user has already interacted with (never recalled again).
    user_hist_items = user_item_time_dict[user_id]
    clicked_items = {hist_item for hist_item, _ in user_hist_items}
    hist_len = len(user_hist_items)

    item_rank = {}
    for loc, (i, click_time) in enumerate(user_hist_items):
        # Position weight: later clicks in the history contribute more.
        loc_weight = 0.9 ** (hist_len - loc)

        neighbours = sorted(i2i_sim[i].items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]
        for j, wij in neighbours:
            if j in clicked_items:
                continue

            # Creation-time weight: shrinks as the two creation times drift apart.
            created_time_weight = np.exp(0.8 ** np.abs(item_created_time_dict[i] - item_created_time_dict[j]))

            # Content weight: add the embedding similarity in whichever direction(s) it exists.
            content_weight = 1.0
            sim_ij = emb_i2i_sim.get(i, {}).get(j, None)
            if sim_ij is not None:
                content_weight += sim_ij
            sim_ji = emb_i2i_sim.get(j, {}).get(i, None)
            if sim_ji is not None:
                content_weight += sim_ji

            item_rank.setdefault(j, 0)
            item_rank[j] += created_time_weight * loc_weight * content_weight * wij

    # Pad with popular articles when fewer than recall_item_num were recalled.
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Membership must be tested against the keys. The previous
            # `item in item_rank.items()` compared an id with (id, score) tuples,
            # was always False, and let the filler overwrite a real score.
            if item in item_rank:
                continue
            item_rank[item] = - i - 100  # any negative value keeps fillers below real hits
            if len(item_rank) == recall_item_num:
                break

    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return item_rank
# Item-CF style recall driven by the content-embedding similarity matrix.
# When recall evaluation is enabled, each user's last click is held out as the answer.
if metric_recall:
    trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)
else:
    trn_hist_click_df = all_click_df

user_recall_items_dict = collections.defaultdict(dict)
user_item_time_dict = get_user_item_time(trn_hist_click_df)

# Load the embedding similarity once and close the file handle afterwards.
# The same matrix serves both as the i2i similarity and as the content weight, so the
# cell no longer relies on `emb_i2i_sim` having been left behind by an earlier cell.
with open(save_path + 'emb_i2i_sim.pkl', 'rb') as f:
    emb_i2i_sim = pickle.load(f)
i2i_sim = emb_i2i_sim

sim_item_topk = 20
recall_item_num = 10

item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)

for user in tqdm(trn_hist_click_df['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, sim_item_topk,
                                                        recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim)

user_multi_recall_dict['embedding_sim_item_recall'] = user_recall_items_dict
with open(save_path + 'embedding_sim_item_recall.pkl', 'wb') as f:
    pickle.dump(user_multi_recall_dict['embedding_sim_item_recall'], f)

if metric_recall:
    # Evaluate the recall against the held-out last clicks.
    metrics_recall(user_multi_recall_dict['embedding_sim_item_recall'], trn_last_click_df, topk=recall_item_num)
# User-based recall (u2u2i)
def user_based_recommend(user_id, user_item_time_dict, u2u_sim, sim_user_topk, recall_item_num,
                         item_topk_click, item_created_time_dict, emb_i2i_sim):
    """
    Recall articles for one user with user-based collaborative filtering.

    Candidates are the articles clicked by the most similar users. Each candidate is
    scored against the target user's whole click history.

    :param user_id: id of the user to recall for
    :param user_item_time_dict: dict, click sequence per user ordered by click time,
                                {user1: [(item1, time1), (item2, time2)..]...}
    :param u2u_sim: dict of dicts, user-to-user similarity {user_u: {user_v: sim}}
    :param sim_user_topk: int, number of most similar users whose clicks are expanded
    :param recall_item_num: int, number of articles returned
    :param item_topk_click: list, most-clicked articles, used to pad a short recall list
    :param item_created_time_dict: dict, article id -> creation time
    :param emb_i2i_sim: dict of dicts, content-embedding similarity between articles

    :return: list of recalled articles [(item1, score1), (item2, score2)...], best first
    :raises KeyError: if the user, a similar user or an article creation time is missing
                      from the corresponding dict
    """
    # Target user's history. A user may click the same article several times, so the
    # exclusion set is de-duplicated while the ordered list keeps every click.
    user_item_time_list = user_item_time_dict[user_id]  # [(item1, time1), (item2, time2)..]
    user_hist_items = set([i for i, t in user_item_time_list])
    hist_len = len(user_item_time_list)

    # The accumulated position weight depends only on the target user's history,
    # not on the candidate, so it is computed once.
    loc_weight = 1.0
    for loc in range(hist_len):
        loc_weight += 0.9 ** (hist_len - loc)

    items_rank = {}
    for sim_u, wuv in sorted(u2u_sim[user_id].items(), key=lambda x: x[1], reverse=True)[:sim_user_topk]:
        for i, click_time in user_item_time_dict[sim_u]:
            if i in user_hist_items:
                continue
            items_rank.setdefault(i, 0)

            content_weight = 1.0
            created_time_weight = 1.0

            # Relate the candidate to every article in the target user's history.
            for j, _ in user_item_time_list:
                # Content similarity, added in whichever direction(s) it exists.
                if emb_i2i_sim.get(i, {}).get(j, None) is not None:
                    content_weight += emb_i2i_sim[i][j]
                if emb_i2i_sim.get(j, {}).get(i, None) is not None:
                    content_weight += emb_i2i_sim[j][i]

                # Creation-time difference weight.
                # NOTE(review): item_based_recommend uses exp(0.8 ** diff), which decays
                # with the time gap; here it is exp(0.8 * diff), which grows with it.
                # Left unchanged - confirm which form is intended.
                created_time_weight += np.exp(0.8 * np.abs(item_created_time_dict[i] - item_created_time_dict[j]))

            items_rank[i] += loc_weight * content_weight * created_time_weight * wuv

    # Pad with popular articles when fewer than recall_item_num were recalled.
    if len(items_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Membership must be tested against the keys. The previous
            # `item in items_rank.items()` was always False and let the filler
            # overwrite the real score of an already recalled article.
            if item in items_rank:
                continue
            items_rank[item] = - i - 100  # any negative value keeps fillers below real hits
            if len(items_rank) == recall_item_num:
                break

    items_rank = sorted(items_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return items_rank
# Build the user-to-user similarity matrix from user embeddings.
# topk is the number of nearest users faiss returns for every user.
def u2u_embdding_sim(click_df, user_emb_dict, save_path, topk):
    """
    Compute user-to-user similarity by inner-product search over user embeddings.

    :param click_df: click log DataFrame (not used by this function, kept for callers)
    :param user_emb_dict: dict, user id -> embedding vector
    :param save_path: path prefix; the result is pickled to save_path + 'youtube_u2u_sim.pkl'
    :param topk: int, neighbours requested per user; the first hit is dropped, so at most
                 topk - 1 neighbours are stored per user

    :return: dict of dicts {user_id: {similar_user_id: similarity}}
    """
    user_list = []
    user_emb_list = []
    for user_id, user_emb in user_emb_dict.items():
        user_list.append(user_id)
        user_emb_list.append(user_emb)

    # Row position in the embedding matrix -> original user id.
    user_index_2_rawid_dict = {k: v for k, v in zip(range(len(user_list)), user_list)}

    user_emb_np = np.array(user_emb_list, dtype=np.float32)

    # Build the faiss inner-product index over all users.
    user_index = faiss.IndexFlatIP(user_emb_np.shape[1])
    user_index.add(user_emb_np)
    # For every user vector, fetch the topk closest users and their similarities.
    sim, idx = user_index.search(user_emb_np, topk)

    # Translate index positions back to original user ids.
    user_sim_dict = collections.defaultdict(dict)
    for target_idx, sim_value_list, rele_idx_list in tqdm(zip(range(len(user_emb_np)), sim, idx)):
        target_raw_id = user_index_2_rawid_dict[target_idx]
        # Start from 1 to drop the first hit, which is expected to be the user itself.
        # NOTE(review): with un-normalised embeddings the top inner-product hit is not
        # guaranteed to be the query itself - confirm the embeddings are normalised.
        for rele_idx, sim_value in zip(rele_idx_list[1:], sim_value_list[1:]):
            # faiss pads the result with label -1 when fewer than topk vectors exist.
            if rele_idx < 0:
                continue
            rele_raw_id = user_index_2_rawid_dict[rele_idx]
            user_sim_dict[target_raw_id][rele_raw_id] = user_sim_dict.get(target_raw_id, {}).get(rele_raw_id, 0) + sim_value

    # Persist the u2u similarity matrix and close the file handle afterwards.
    with open(save_path + 'youtube_u2u_sim.pkl', 'wb') as f:
        pickle.dump(user_sim_dict, f)
    return user_sim_dict
"for user in tqdm(trn_hist_click_df['user_id'].unique()):\n", + " user_recall_items_dict[user] = user_based_recommend(user, user_item_time_dict, u2u_sim, sim_user_topk, \\\n", + " recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim)\n", + " \n", + "user_multi_recall_dict['youtubednn_usercf_recall'] = user_recall_items_dict\n", + "pickle.dump(user_multi_recall_dict['youtubednn_usercf_recall'], open(save_path + 'youtubednn_usercf_recall.pkl', 'wb'))\n", + "\n", + "if metric_recall:\n", + " # 召回效果评估\n", + " metrics_recall(user_multi_recall_dict['youtubednn_usercf_recall'], trn_last_click_df, topk=recall_item_num)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-16T07:07:44.326253Z", + "start_time": "2020-11-16T07:07:43.798931Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 冷启动问题" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**冷启动问题可以分成三类:文章冷启动,用户冷启动,系统冷启动。**\n", + "\n", + "- 文章冷启动:对于一个平台系统新加入的文章,该文章没有任何的交互记录,如何推荐给用户的问题。(对于我们场景可以认为是,日志数据中没有出现过的文章都可以认为是冷启动的文章)\n", + "- 用户冷启动:对于一个平台系统新来的用户,该用户还没有文章的交互信息,如何给该用户进行推荐。(对于我们场景就是,测试集中的用户是否在测试集对应的log数据中出现过,如果没有出现过,那么可以认为该用户是冷启动用户。但是有时候并没有这么严格,我们也可以自己设定某些指标来判别哪些用户是冷启动用户,比如通过使用时长,点击率,留存率等等)\n", + "- 系统冷启动:就是对于一个平台刚上线,还没有任何的相关历史数据,此时就是系统冷启动,其实也就是前面两种的一个综合。\n", + "\n", + "**当前场景下冷启动问题的分析:**\n", + "\n", + "对当前的数据进行分析会发现,日志中所有出现过的点击文章只有3w多个,而整个文章库中却有30多万,那么测试集中的用户最后一次点击是否会点击没有出现在日志中的文章呢?如果存在这种情况,说明用户点击的文章之前没有任何的交互信息,这也就是我们所说的文章冷启动。通过数据分析还可以发现,测试集用户只有一次点击的数据占得比例还不少,其实仅仅通过用户的一次点击就给用户推荐文章使用模型的方式也是比较难的,这里其实也可以考虑用户冷启动的问题,但是这里只给出物品冷启动的一些解决方案及代码,关于用户冷启动的话提一些可行性的做法。\n", + "\n", + "1. 文章冷启动(没有冷启动的探索问题) \n", + " 其实我们这里不是为了做文章的冷启动而做冷启动,而是猜测用户可能会点击一些没有在log数据中出现的文章,我们要做的就是如何从将近27万的文章中选择一些文章作为用户冷启动的文章,这里其实也可以看成是一种召回策略,我们这里就采用简单的比较好理解的基于规则的召回策略来获取用户可能点击的未出现在log数据中的文章。\n", + " 现在的问题变成了:如何给每个用户考虑从27万个商品中获取一小部分商品?随机选一些可能是一种方案。下面给出一些参考的方案。\n", + " 1. 
# Embedding-based item-CF recall used as the raw candidate pool for cold start.
# This is only a candidate-generation strategy, so no recall evaluation is done here.
trn_hist_click_df = all_click_df

user_recall_items_dict = collections.defaultdict(dict)
user_item_time_dict = get_user_item_time(trn_hist_click_df)

# Load the embedding similarity once and close the file handle afterwards.
# The same matrix serves both as the i2i similarity and as the content weight, so the
# cell no longer relies on `emb_i2i_sim` having been left behind by an earlier cell.
with open(save_path + 'emb_i2i_sim.pkl', 'rb') as f:
    emb_i2i_sim = pickle.load(f)
i2i_sim = emb_i2i_sim

sim_item_topk = 150
recall_item_num = 100  # recall generously; the rule-based filter that follows removes many

item_topk_click = get_item_topk_click(trn_hist_click_df, k=50)
for user in tqdm(trn_hist_click_df['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, sim_item_topk,
                                                        recall_item_num, item_topk_click, item_created_time_dict, emb_i2i_sim)

with open(save_path + 'cold_start_items_raw_dict.pkl', 'wb') as f:
    pickle.dump(user_recall_items_dict, f)
# Rule-based filtering of cold-start candidates:
#   - keep articles whose category is one the user has read before
#   - keep articles whose word count is close to the user's historical average
#   - keep articles created close to the creation time of the user's last clicked article
#   - return the survivors ordered by similarity score

def get_click_article_ids_set(all_click_df):
    """Return the set of article ids that appear in the click log."""
    return set(all_click_df.click_article_id.values)


def cold_start_items(user_recall_items_dict, user_hist_item_typs_dict, user_hist_item_words_dict,
                     user_last_item_created_time_dict, item_type_dict, item_words_dict,
                     item_created_time_dict, click_article_ids_set, recall_item_num):
    """
    Select cold-start articles (articles never seen in the click log) for every user.

    :param user_recall_items_dict: dict, raw candidates from embedding-based recall,
                                   {user1: [(item1, score1), (item2, score2), ..], ...}
    :param user_hist_item_typs_dict: dict, user -> set of categories of clicked articles
    :param user_hist_item_words_dict: dict, user -> mean word count of clicked articles
    :param user_last_item_created_time_dict: dict, user -> creation timestamp of the
                                             last clicked article
    :param item_type_dict: dict, article -> category
    :param item_words_dict: dict, article -> word count
    :param item_created_time_dict: dict, article -> creation timestamp
    :param click_article_ids_set: set of article ids that appear in the click log
    :param recall_item_num: int, maximum number of cold-start articles kept per user

    :return: dict {user1: [(item1, score1), (item2, score2)..]...}, best first.
             The same dict is pickled to save_path + 'cold_start_user_items_dict.pkl'
             (``save_path`` is read from the enclosing module).
    """
    cold_start_user_items_dict = {}
    for user, item_list in tqdm(user_recall_items_dict.items()):
        kept_items = cold_start_user_items_dict.setdefault(user, [])
        if not item_list:
            # Nothing to filter, and no history lookup is attempted, exactly as before.
            continue

        # The user's history profile is the same for every candidate: look it up once.
        hist_item_type_set = user_hist_item_typs_dict[user]
        hist_mean_words = user_hist_item_words_dict[user]
        hist_last_item_created_time = datetime.fromtimestamp(user_last_item_created_time_dict[user])

        for item, score in item_list:
            # Profile of the candidate article.
            curr_item_type = item_type_dict[item]
            curr_item_words = item_words_dict[item]
            curr_item_created_time = datetime.fromtimestamp(item_created_time_dict[item])

            # Drop the candidate if it appears in the click log (it is not cold), or if its
            # category, word count or creation time is too far from the user's history.
            if curr_item_type not in hist_item_type_set or \
               item in click_article_ids_set or \
               abs(curr_item_words - hist_mean_words) > 200 or \
               abs((curr_item_created_time - hist_last_item_created_time).days) > 90:
                continue

            kept_items.append((item, score))

    # Cap the number of cold-start articles per user, best score first.
    cold_start_user_items_dict = {k: sorted(v, key=lambda x: x[1], reverse=True)[:recall_item_num]
                                  for k, v in cold_start_user_items_dict.items()}

    with open(save_path + 'cold_start_user_items_dict.pkl', 'wb') as f:
        pickle.dump(cold_start_user_items_dict, f)

    return cold_start_user_items_dict
def combine_recall_results(user_multi_recall_dict, weight_dict=None, topk=25):
    """
    Merge the results of several recall methods into one ranked list per user.

    Scores are min-max normalised per user within each method, multiplied by the
    method's weight and summed per article across methods.

    :param user_multi_recall_dict: dict, method name -> {user: [(item, score), ...]} where
                                   every list is sorted by score, best first.
                                   The input is not modified.
    :param weight_dict: dict, method name -> weight. None gives every method weight 1.
    :param topk: int, number of articles kept per user after merging

    :return: dict {user: [(item, merged_score), ...]}, best first. The same dict is pickled
             to os.path.join(save_path, 'final_recall_items_dict.pkl') (``save_path`` is
             read from the enclosing module).
    :raises KeyError: if weight_dict is given but lacks a method present in
                      user_multi_recall_dict
    """
    final_recall_items_dict = {}

    # Normalise one user's list for one method so that scores from different
    # methods can be added together.
    def norm_user_recall_items_sim(sorted_item_list):
        # Zero or one article cannot be min-max normalised: return the list as is.
        # This typically happens when rule-based cold-start filtering leaves almost nothing.
        if len(sorted_item_list) < 2:
            return sorted_item_list

        # The list is sorted best first, so the extremes sit at both ends.
        min_sim = sorted_item_list[-1][1]
        max_sim = sorted_item_list[0][1]

        norm_sorted_item_list = []
        for item, score in sorted_item_list:
            if max_sim > 0:
                norm_score = 1.0 * (score - min_sim) / (max_sim - min_sim) if max_sim > min_sim else 1.0
            else:
                # Every score is non-positive: the whole list contributes 0.
                norm_score = 0.0
            norm_sorted_item_list.append((item, norm_score))

        return norm_sorted_item_list

    print('多路召回合并...')
    for method, user_recall_items in tqdm(user_multi_recall_dict.items()):
        print(method + '...')
        # Each recall method may carry its own weight in the final score.
        if weight_dict is None:
            recall_method_weight = 1
        else:
            recall_method_weight = weight_dict[method]

        for user_id, sorted_item_list in user_recall_items.items():
            # Normalise into a fresh list. The caller's recall results stay untouched
            # (previously they were overwritten in place with normalised scores).
            norm_item_list = norm_user_recall_items_sim(sorted_item_list)

            user_final_items = final_recall_items_dict.setdefault(user_id, {})
            for item, score in norm_item_list:
                user_final_items.setdefault(item, 0)
                user_final_items[item] += recall_method_weight * score

    final_recall_items_dict_rank = {}
    # Keep only the topk merged articles per user.
    for user, recall_item_dict in final_recall_items_dict.items():
        final_recall_items_dict_rank[user] = sorted(recall_item_dict.items(), key=lambda x: x[1], reverse=True)[:topk]

    # Persist the merged recall result and close the file handle afterwards.
    with open(os.path.join(save_path, 'final_recall_items_dict.pkl'), 'wb') as f:
        pickle.dump(final_recall_items_dict_rank, f)

    return final_recall_items_dict_rank
\n", - "构造监督数据集的思路, 根据召回结果, 我们会得到一个{user_id: [可能点击的文章列表]}形式的字典。 那么我们就可以对于每个用户, 每篇可能点击的文章构造一个监督测试集, 比如对于用户user1, 假设得到的他的召回列表{user1: [item1, item2, item3]}, 我们就可以得到三行数据(user1, item1), (user1, item2), (user1, item3)的形式, 这就是监督测试集时候的前两列特征。

\n", - "\n", - "构造特征的思路是这样, 我们知道每个用户的点击文章是与其历史点击的文章信息是有很大关联的, 比如同一个主题, 相似等等。 所以特征构造这块很重要的一系列特征**是要结合用户的历史点击文章信息**。我们已经得到了每个用户及点击候选文章的两列的一个数据集, 而我们的目的是要预测最后一次点击的文章, 比较自然的一个思路就是和其最后几次点击的文章产生关系, 这样既考虑了其历史点击文章信息, 又得离最后一次点击较近,因为新闻很大的一个特点就是注重时效性。 往往用户的最后一次点击会和其最后几次点击有很大的关联。 所以我们就可以对于每个候选文章, 做出与最后几次点击相关的特征如下:\n", - "1. 候选item与最后几次点击的相似性特征(embedding内积) --- 这个直接关联用户历史行为\n", - "2. 候选item与最后几次点击的相似性特征的统计特征 --- 统计特征可以减少一些波动和异常\n", - "3. 候选item与最后几次点击文章的字数差的特征 --- 可以通过字数看用户偏好\n", - "4. 候选item与最后几次点击的文章建立的时间差特征 --- 时间差特征可以看出该用户对于文章的实时性的偏好 \n", - "\n", - "\n", - "还需要考虑一下\n", - "**5. 如果使用了youtube召回的话, 我们还可以制作用户与候选item的相似特征**\n", - "\n", - "\n", - "\n", - "当然, 上面只是提供了一种基于用户历史行为做特征工程的思路, 大家也可以思维风暴一下,尝试一些其他的特征。 下面我们就实现上面的这些特征的制作, 下面的逻辑是这样:\n", - "1. 我们首先获得用户的最后一次点击操作和用户的历史点击, 这个基于我们的日志数据集做\n", - "2. 基于用户的历史行为制作特征, 这个会用到用户的历史点击表, 最后的召回列表, 文章的信息表和embedding向量\n", - "3. 制作标签, 形成最后的监督学习数据集" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 导包" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:00.341709Z", - "start_time": "2020-11-17T09:06:58.723900Z" - }, - "cell_style": "center", - "scrolled": true - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import pickle\n", - "from tqdm import tqdm\n", - "import gc, os\n", - "import logging\n", - "import time\n", - "import lightgbm as lgb\n", - "from gensim.models import Word2Vec\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# df节省内存函数" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:02.411005Z", - "start_time": "2020-11-17T09:07:02.397830Z" - } - }, - "outputs": [], - "source": [ - "# 节省内存的一个函数\n", - "# 减少内存\n", - "def reduce_mem(df):\n", - " starttime = time.time()\n", - " 
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", - " start_mem = df.memory_usage().sum() / 1024**2\n", - " for col in df.columns:\n", - " col_type = df[col].dtypes\n", - " if col_type in numerics:\n", - " c_min = df[col].min()\n", - " c_max = df[col].max()\n", - " if pd.isnull(c_min) or pd.isnull(c_max):\n", - " continue\n", - " if str(col_type)[:3] == 'int':\n", - " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", - " df[col] = df[col].astype(np.int8)\n", - " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", - " df[col] = df[col].astype(np.int16)\n", - " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", - " df[col] = df[col].astype(np.int32)\n", - " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", - " df[col] = df[col].astype(np.int64)\n", - " else:\n", - " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", - " df[col] = df[col].astype(np.float16)\n", - " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", - " df[col] = df[col].astype(np.float32)\n", - " else:\n", - " df[col] = df[col].astype(np.float64)\n", - " end_mem = df.memory_usage().sum() / 1024**2\n", - " print('-- Mem. 
usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,\n", - " 100*(start_mem-end_mem)/start_mem,\n", - " (time.time()-starttime)/60))\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:05.031436Z", - "start_time": "2020-11-17T09:07:05.026822Z" - } - }, - "outputs": [], - "source": [ - "data_path = './data_raw/'\n", - "save_path = './temp_results/'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 数据读取\n", - "\n", - "## 训练和验证集的划分\n", - "\n", - "划分训练和验证集的原因是为了在线下验证模型参数的好坏,为了完全模拟测试集,我们这里就在训练集中抽取部分用户的所有信息来作为验证集。提前做训练验证集划分的好处就是可以分解制作排序特征时的压力,一次性做整个数据集的排序特征可能时间会比较长。" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:07.230308Z", - "start_time": "2020-11-17T09:07:07.221081Z" - } - }, - "outputs": [], - "source": [ - "# all_click_df指的是训练集\n", - "# sample_user_nums 采样作为验证集的用户数量\n", - "def trn_val_split(all_click_df, sample_user_nums):\n", - " all_click = all_click_df\n", - " all_user_ids = all_click.user_id.unique()\n", - " \n", - " # replace=True表示可以重复抽样,反之不可以\n", - " sample_user_ids = np.random.choice(all_user_ids, size=sample_user_nums, replace=False) \n", - " \n", - " click_val = all_click[all_click['user_id'].isin(sample_user_ids)]\n", - " click_trn = all_click[~all_click['user_id'].isin(sample_user_ids)]\n", - " \n", - " # 将验证集中的最后一次点击给抽取出来作为答案\n", - " click_val = click_val.sort_values(['user_id', 'click_timestamp'])\n", - " val_ans = click_val.groupby('user_id').tail(1)\n", - " \n", - " click_val = click_val.groupby('user_id').apply(lambda x: x[:-1]).reset_index(drop=True)\n", - " \n", - " # 去除val_ans中某些用户只有一个点击数据的情况,如果该用户只有一个点击数据,又被分到ans中,\n", - " # 那么训练集中就没有这个用户的点击数据,出现用户冷启动问题,给自己模型验证带来麻烦\n", - " val_ans = val_ans[val_ans.user_id.isin(click_val.user_id.unique())] # 保证答案中出现的用户再验证集中还有\n", - " click_val = 
click_val[click_val.user_id.isin(val_ans.user_id.unique())]\n", - " \n", - " return click_trn, click_val, val_ans" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 获取历史点击和最后一次点击" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:19.202550Z", - "start_time": "2020-11-17T09:07:19.195766Z" - } - }, - "outputs": [], - "source": [ - "# 获取当前数据的历史点击和最后一次点击\n", - "def get_hist_and_last_click(all_click):\n", - " all_click = all_click.sort_values(by=['user_id', 'click_timestamp'])\n", - " click_last_df = all_click.groupby('user_id').tail(1)\n", - "\n", - " # 如果用户只有一个点击,hist为空了,会导致训练的时候这个用户不可见,此时默认泄露一下\n", - " def hist_func(user_df):\n", - " if len(user_df) == 1:\n", - " return user_df\n", - " else:\n", - " return user_df[:-1]\n", - "\n", - " click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)\n", - "\n", - " return click_hist_df, click_last_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取训练、验证及测试集" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:21.181211Z", - "start_time": "2020-11-17T09:07:21.171338Z" - } - }, - "outputs": [], - "source": [ - "def get_trn_val_tst_data(data_path, offline=True):\n", - " if offline:\n", - " click_trn_data = pd.read_csv(data_path+'train_click_log.csv') # 训练集用户点击日志\n", - " click_trn_data = reduce_mem(click_trn_data)\n", - " click_trn, click_val, val_ans = trn_val_split(click_trn_data, sample_user_nums)\n", - " else:\n", - " click_trn = pd.read_csv(data_path+'train_click_log.csv')\n", - " click_trn = reduce_mem(click_trn)\n", - " click_val = None\n", - " val_ans = None\n", - " \n", - " click_tst = pd.read_csv(data_path+'testA_click_log.csv')\n", - " \n", - " return click_trn, click_val, click_tst, val_ans" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取召回列表" - ] - 
}, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:23.210604Z", - "start_time": "2020-11-17T09:07:23.203652Z" - } - }, - "outputs": [], - "source": [ - "# 返回多路召回列表或者单路召回\n", - "def get_recall_list(save_path, single_recall_model=None, multi_recall=False):\n", - " if multi_recall:\n", - " return pickle.load(open(save_path + 'final_recall_items_dict.pkl', 'rb'))\n", - " \n", - " if single_recall_model == 'i2i_itemcf':\n", - " return pickle.load(open(save_path + 'itemcf_recall_dict.pkl', 'rb'))\n", - " elif single_recall_model == 'i2i_emb_itemcf':\n", - " return pickle.load(open(save_path + 'itemcf_emb_dict.pkl', 'rb'))\n", - " elif single_recall_model == 'user_cf':\n", - " return pickle.load(open(save_path + 'youtubednn_usercf_dict.pkl', 'rb'))\n", - " elif single_recall_model == 'youtubednn':\n", - " return pickle.load(open(save_path + 'youtube_u2i_dict.pkl', 'rb'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取各种Embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Word2Vec训练及gensim的使用\n", - "\n", - "Word2Vec主要思想是:一个词的上下文可以很好的表达出词的语义。通过无监督学习产生词向量的方式。word2vec中有两个非常经典的模型:skip-gram和cbow。\n", - "\n", - "- skip-gram:已知中心词预测周围词。\n", - "- cbow:已知周围词预测中心词。\n", - "![image-20201106225233086](http://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20201106225233086.png)\n", - "\n", - "在使用gensim训练word2vec的时候,有几个比较重要的参数\n", - "- size: 表示词向量的维度。\n", - "- window:决定了目标词会与多远距离的上下文产生关系。\n", - "- sg: 如果是0,则是CBOW模型,是1则是Skip-Gram模型。\n", - "- workers: 表示训练时候的线程数量\n", - "- min_count: 设置最小的\n", - "- iter: 训练时遍历整个数据集的次数\n", - "\n", - "**注意**\n", - "1. 训练的时候输入的语料库一定要是字符组成的二维数组,如:[['北', '京', '你', '好'], ['上', '海', '你', '好']]\n", - "2. 
使用模型的时候有一些默认值,可以通过在Jupyter里面通过`Word2Vec??`查看\n", - "\n", - "\n", - "下面是个简单的测试样例:\n", - "```\n", - "from gensim.models import Word2Vec\n", - "doc = [['30760', '157507'],\n", - " ['289197', '63746'],\n", - " ['36162', '168401'],\n", - " ['50644', '36162']]\n", - "w2v = Word2Vec(docs, size=12, sg=1, window=2, seed=2020, workers=2, min_count=1, iter=1)\n", - "\n", - "# 查看'30760'表示的词向量\n", - "w2v['30760']\n", - "```\n", - "\n", - "skip-gram和cbow的详细原理可以参考下面的博客:\n", - "- [word2vec原理(一) CBOW与Skip-Gram模型基础](https://www.cnblogs.com/pinard/p/7160330.html) \n", - "- [word2vec原理(二) 基于Hierarchical Softmax的模型](https://www.cnblogs.com/pinard/p/7160330.html) \n", - "- [word2vec原理(三) 基于Negative Sampling的模型](https://www.cnblogs.com/pinard/p/7249903.html) " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:26.676173Z", - "start_time": "2020-11-17T09:07:26.667926Z" - } - }, - "outputs": [], - "source": [ - "def trian_item_word2vec(click_df, embed_size=64, save_name='item_w2v_emb.pkl', split_char=' '):\n", - " click_df = click_df.sort_values('click_timestamp')\n", - " # 只有转换成字符串才可以进行训练\n", - " click_df['click_article_id'] = click_df['click_article_id'].astype(str)\n", - " # 转换成句子的形式\n", - " docs = click_df.groupby(['user_id'])['click_article_id'].apply(lambda x: list(x)).reset_index()\n", - " docs = docs['click_article_id'].values.tolist()\n", - "\n", - " # 为了方便查看训练的进度,这里设定一个log信息\n", - " logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)\n", - "\n", - " # 这里的参数对训练得到的向量影响也很大,默认负采样为5\n", - " w2v = Word2Vec(docs, size=16, sg=1, window=5, seed=2020, workers=24, min_count=1, iter=1)\n", - " \n", - " # 保存成字典的形式\n", - " item_w2v_emb_dict = {k: w2v[k] for k in click_df['click_article_id']}\n", - " pickle.dump(item_w2v_emb_dict, open(save_path + 'item_w2v_emb.pkl', 'wb'))\n", - " \n", - " return item_w2v_emb_dict" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - 
"metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:27.285690Z", - "start_time": "2020-11-17T09:07:27.276646Z" - } - }, - "outputs": [], - "source": [ - "# 可以通过字典查询对应的item的Embedding\n", - "def get_embedding(save_path, all_click_df):\n", - " if os.path.exists(save_path + 'item_content_emb.pkl'):\n", - " item_content_emb_dict = pickle.load(open(save_path + 'item_content_emb.pkl', 'rb'))\n", - " else:\n", - " print('item_content_emb.pkl 文件不存在...')\n", - " \n", - " # w2v Embedding是需要提前训练好的\n", - " if os.path.exists(save_path + 'item_w2v_emb.pkl'):\n", - " item_w2v_emb_dict = pickle.load(open(save_path + 'item_w2v_emb.pkl', 'rb'))\n", - " else:\n", - " item_w2v_emb_dict = trian_item_word2vec(all_click_df)\n", - " \n", - " if os.path.exists(save_path + 'item_youtube_emb.pkl'):\n", - " item_youtube_emb_dict = pickle.load(open(save_path + 'item_youtube_emb.pkl', 'rb'))\n", - " else:\n", - " print('item_youtube_emb.pkl 文件不存在...')\n", - " \n", - " if os.path.exists(save_path + 'user_youtube_emb.pkl'):\n", - " user_youtube_emb_dict = pickle.load(open(save_path + 'user_youtube_emb.pkl', 'rb'))\n", - " else:\n", - " print('user_youtube_emb.pkl 文件不存在...')\n", - " \n", - " return item_content_emb_dict, item_w2v_emb_dict, item_youtube_emb_dict, user_youtube_emb_dict" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取文章信息" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:07:28.391797Z", - "start_time": "2020-11-17T09:07:28.386650Z" - } - }, - "outputs": [], - "source": [ - "def get_article_info_df():\n", - " article_info_df = pd.read_csv(data_path + 'articles.csv')\n", - " article_info_df = reduce_mem(article_info_df)\n", - " \n", - " return article_info_df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取数据" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-11-17T09:07:32.362045Z", - "start_time": "2020-11-17T09:07:29.490413Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-- Mem. usage decreased to 23.34 Mb (69.4% reduction),time spend:0.00 min\n" - ] - } - ], - "source": [ - "# 这里offline的online的区别就是验证集是否为空\n", - "click_trn, click_val, click_tst, val_ans = get_trn_val_tst_data(data_path, offline=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:11:10.378966Z", - "start_time": "2020-11-17T09:07:32.468580Z" - } - }, - "outputs": [], - "source": [ - "click_trn_hist, click_trn_last = get_hist_and_last_click(click_trn)\n", - "\n", - "if click_val is not None:\n", - " click_val_hist, click_val_last = click_val, val_ans\n", - "else:\n", - " click_val_hist, click_val_last = None, None\n", - " \n", - "click_tst_hist = click_tst" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 对训练数据做负采样\n", - "\n", - "通过召回我们将数据转换成三元组的形式(user1, item1, label)的形式,观察发现正负样本差距极度不平衡,我们可以先对负样本进行下采样,下采样的目的一方面缓解了正负样本比例的问题,另一方面也减小了我们做排序特征的压力,我们在做负采样的时候又有哪些东西是需要注意的呢?\n", - "\n", - "1. 只对负样本进行下采样(如果有比较好的正样本扩充的方法其实也是可以考虑的)\n", - "2. 负采样之后,保证所有的用户和文章仍然出现在采样之后的数据中\n", - "3. 下采样的比例可以根据实际情况人为的控制\n", - "4. 
做完负采样之后,更新此时新的用户召回文章列表,因为后续做特征的时候可能用到相对位置的信息。\n", - "\n", - "其实负采样也可以留在后面做完特征在进行,这里由于做排序特征太慢了,所以把负采样的环节提到前面了。" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:11:36.096678Z", - "start_time": "2020-11-17T09:11:36.090911Z" - } - }, - "outputs": [], - "source": [ - "# 将召回列表转换成df的形式\n", - "def recall_dict_2_df(recall_list_dict):\n", - " df_row_list = [] # [user, item, score]\n", - " for user, recall_list in tqdm(recall_list_dict.items()):\n", - " for item, score in recall_list:\n", - " df_row_list.append([user, item, score])\n", - " \n", - " col_names = ['user_id', 'sim_item', 'score']\n", - " recall_list_df = pd.DataFrame(df_row_list, columns=col_names)\n", - " \n", - " return recall_list_df" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:11:37.668844Z", - "start_time": "2020-11-17T09:11:37.659774Z" - } - }, - "outputs": [], - "source": [ - "# 负采样函数,这里可以控制负采样时的比例, 这里给了一个默认的值\n", - "def neg_sample_recall_data(recall_items_df, sample_rate=0.001):\n", - " pos_data = recall_items_df[recall_items_df['label'] == 1]\n", - " neg_data = recall_items_df[recall_items_df['label'] == 0]\n", - " \n", - " print('pos_data_num:', len(pos_data), 'neg_data_num:', len(neg_data), 'pos/neg:', len(pos_data)/len(neg_data))\n", - " \n", - " # 分组采样函数\n", - " def neg_sample_func(group_df):\n", - " neg_num = len(group_df)\n", - " sample_num = max(int(neg_num * sample_rate), 1) # 保证最少有一个\n", - " sample_num = min(sample_num, 5) # 保证最多不超过5个,这里可以根据实际情况进行选择\n", - " return group_df.sample(n=sample_num, replace=True)\n", - " \n", - " # 对用户进行负采样,保证所有用户都在采样后的数据中\n", - " neg_data_user_sample = neg_data.groupby('user_id', group_keys=False).apply(neg_sample_func)\n", - " # 对文章进行负采样,保证所有文章都在采样后的数据中\n", - " neg_data_item_sample = neg_data.groupby('sim_item', group_keys=False).apply(neg_sample_func)\n", - " \n", - " # 将上述两种情况下的采样数据合并\n", - " neg_data_new = 
neg_data_user_sample.append(neg_data_item_sample)\n", - " # 由于上述两个操作是分开的,可能将两个相同的数据给重复选择了,所以需要对合并后的数据进行去重\n", - " neg_data_new = neg_data_new.sort_values(['user_id', 'score']).drop_duplicates(['user_id', 'sim_item'], keep='last')\n", - " \n", - " # 将正样本数据合并\n", - " data_new = pd.concat([pos_data, neg_data_new], ignore_index=True)\n", - " \n", - " return data_new" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:11:39.481715Z", - "start_time": "2020-11-17T09:11:39.475144Z" - } - }, - "outputs": [], - "source": [ - "# 召回数据打标签\n", - "def get_rank_label_df(recall_list_df, label_df, is_test=False):\n", - " # 测试集是没有标签了,为了后面代码同一一些,这里直接给一个负数替代\n", - " if is_test:\n", - " recall_list_df['label'] = -1\n", - " return recall_list_df\n", - " \n", - " label_df = label_df.rename(columns={'click_article_id': 'sim_item'})\n", - " recall_list_df_ = recall_list_df.merge(label_df[['user_id', 'sim_item', 'click_timestamp']], \\\n", - " how='left', on=['user_id', 'sim_item'])\n", - " recall_list_df_['label'] = recall_list_df_['click_timestamp'].apply(lambda x: 0.0 if np.isnan(x) else 1.0)\n", - " del recall_list_df_['click_timestamp']\n", - " \n", - " return recall_list_df_" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T09:11:41.555566Z", - "start_time": "2020-11-17T09:11:41.546766Z" - } - }, - "outputs": [], - "source": [ - "def get_user_recall_item_label_df(click_trn_hist, click_val_hist, click_tst_hist,click_trn_last, click_val_last, recall_list_df):\n", - " # 获取训练数据的召回列表\n", - " trn_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_trn_hist['user_id'].unique())]\n", - " # 训练数据打标签\n", - " trn_user_item_label_df = get_rank_label_df(trn_user_items_df, click_trn_last, is_test=False)\n", - " # 训练数据负采样\n", - " trn_user_item_label_df = neg_sample_recall_data(trn_user_item_label_df)\n", - " \n", - " if click_val is not 
None:\n", - " val_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_val_hist['user_id'].unique())]\n", - " val_user_item_label_df = get_rank_label_df(val_user_items_df, click_val_last, is_test=False)\n", - " val_user_item_label_df = neg_sample_recall_data(val_user_item_label_df)\n", - " else:\n", - " val_user_item_label_df = None\n", - " \n", - " # 测试数据不需要进行负采样,直接对所有的召回商品进行打-1标签\n", - " tst_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_tst_hist['user_id'].unique())]\n", - " tst_user_item_label_df = get_rank_label_df(tst_user_items_df, None, is_test=True)\n", - " \n", - " return trn_user_item_label_df, val_user_item_label_df, tst_user_item_label_df" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T17:23:35.357045Z", - "start_time": "2020-11-17T17:23:12.378284Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 250000/250000 [00:12<00:00, 20689.39it/s]\n" - ] - } - ], - "source": [ - "# 读取召回列表\n", - "recall_list_dict = get_recall_list(save_path, single_recall_model='i2i_itemcf') # 这里只选择了单路召回的结果,也可以选择多路召回结果\n", - "# 将召回数据转换成df\n", - "recall_list_df = recall_dict_2_df(recall_list_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T17:29:04.598214Z", - "start_time": "2020-11-17T17:23:40.001052Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "pos_data_num: 64190 neg_data_num: 1935810 pos/neg: 0.03315924600038227\n" - ] - } - ], - "source": [ - "# 给训练验证数据打标签,并负采样(这一部分时间比较久)\n", - "trn_user_item_label_df, val_user_item_label_df, tst_user_item_label_df = get_user_recall_item_label_df(click_trn_hist, \n", - " click_val_hist, \n", - " click_tst_hist,\n", - " click_trn_last, \n", - " click_val_last, \n", - " recall_list_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T17:23:11.642944Z", - "start_time": "2020-11-17T17:23:08.475Z" - }, - "scrolled": true - }, - "outputs": [], - "source": [ - "trn_user_item_label_df.label" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 将召回数据转换成字典" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T17:36:22.800449Z", - "start_time": "2020-11-17T17:36:22.794670Z" - } - }, - "outputs": [], - "source": [ - "# 将最终的召回的df数据转换成字典的形式做排序特征\n", - "def make_tuple_func(group_df):\n", - " row_data = []\n", - " for name, row_df in group_df.iterrows():\n", - " row_data.append((row_df['sim_item'], row_df['score'], row_df['label']))\n", - " \n", - " return row_data" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T17:40:05.991819Z", - "start_time": "2020-11-17T17:36:26.536429Z" - } - }, - "outputs": [], - "source": [ - "trn_user_item_label_tuples = trn_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", - "trn_user_item_label_tuples_dict = dict(zip(trn_user_item_label_tuples['user_id'], trn_user_item_label_tuples[0]))\n", - "\n", - "if val_user_item_label_df is not None:\n", - " val_user_item_label_tuples = val_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", - " val_user_item_label_tuples_dict = dict(zip(val_user_item_label_tuples['user_id'], val_user_item_label_tuples[0]))\n", - "else:\n", - " val_user_item_label_tuples_dict = None\n", - " \n", - "tst_user_item_label_tuples = tst_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", - "tst_user_item_label_tuples_dict = dict(zip(tst_user_item_label_tuples['user_id'], tst_user_item_label_tuples[0]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T07:59:53.141560Z", - "start_time": 
"2020-11-17T07:59:53.133599Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 特征工程" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 制作与用户历史行为相关特征\n", - "对于每个用户召回的每个商品, 做特征。 具体步骤如下:\n", - "* 对于每个用户, 获取最后点击的N个商品的item_id, \n", - " * 对于该用户的每个召回商品, 计算与上面最后N次点击商品的相似度的和(最大, 最小,均值), 时间差特征,相似性特征,字数差特征,与该用户的相似性特征" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T01:07:47.268035Z", - "start_time": "2020-11-18T01:07:47.250449Z" - } - }, - "outputs": [], - "source": [ - "# 下面基于data做历史相关的特征\n", - "def create_feature(users_id, recall_list, click_hist_df, articles_info, articles_emb, user_emb=None, N=1):\n", - " \"\"\"\n", - " 基于用户的历史行为做相关特征\n", - " :param users_id: 用户id\n", - " :param recall_list: 对于每个用户召回的候选文章列表\n", - " :param click_hist_df: 用户的历史点击信息\n", - " :param articles_info: 文章信息\n", - " :param articles_emb: 文章的embedding向量, 这个可以用item_content_emb, item_w2v_emb, item_youtube_emb\n", - " :param user_emb: 用户的embedding向量, 这个是user_youtube_emb, 如果没有也可以不用, 但要注意如果要用的话, articles_emb就要用item_youtube_emb的形式, 这样维度才一样\n", - " :param N: 最近的N次点击 由于testA日志里面很多用户只存在一次历史点击, 所以为了不产生空值,默认是1\n", - " \"\"\"\n", - " \n", - " # 建立一个二维列表保存结果, 后面要转成DataFrame\n", - " all_user_feas = []\n", - " i = 0\n", - " for user_id in tqdm(users_id):\n", - " # 该用户的最后N次点击\n", - " hist_user_items = click_hist_df[click_hist_df['user_id']==user_id]['click_article_id'][-N:]\n", - " \n", - " # 遍历该用户的召回列表\n", - " for rank, (article_id, score, label) in enumerate(recall_list[user_id]):\n", - " # 该文章建立时间, 字数\n", - " a_create_time = articles_info[articles_info['article_id']==article_id]['created_at_ts'].values[0]\n", - " a_words_count = articles_info[articles_info['article_id']==article_id]['words_count'].values[0]\n", - " single_user_fea = [user_id, article_id]\n", - " # 计算与最后点击的商品的相似度的和, 最大值和最小值, 均值\n", - " sim_fea = []\n", - " time_fea = []\n", - " 
word_fea = []\n", - " # 遍历用户的最后N次点击文章\n", - " for hist_item in hist_user_items:\n", - " b_create_time = articles_info[articles_info['article_id']==hist_item]['created_at_ts'].values[0]\n", - " b_words_count = articles_info[articles_info['article_id']==hist_item]['words_count'].values[0]\n", - " \n", - " sim_fea.append(np.dot(articles_emb[hist_item], articles_emb[article_id]))\n", - " time_fea.append(abs(a_create_time-b_create_time))\n", - " word_fea.append(abs(a_words_count-b_words_count))\n", - " \n", - " single_user_fea.extend(sim_fea) # 相似性特征\n", - " single_user_fea.extend(time_fea) # 时间差特征\n", - " single_user_fea.extend(word_fea) # 字数差特征\n", - " single_user_fea.extend([max(sim_fea), min(sim_fea), sum(sim_fea), sum(sim_fea) / len(sim_fea)]) # 相似性的统计特征\n", - " \n", - " if user_emb: # 如果用户向量有的话, 这里计算该召回文章与用户的相似性特征 \n", - " single_user_fea.append(np.dot(user_emb[user_id], articles_emb[article_id]))\n", - " \n", - " single_user_fea.extend([score, rank, label]) \n", - " # 加入到总的表中\n", - " all_user_feas.append(single_user_fea)\n", - " \n", - " # 定义列名\n", - " id_cols = ['user_id', 'click_article_id']\n", - " sim_cols = ['sim' + str(i) for i in range(N)]\n", - " time_cols = ['time_diff' + str(i) for i in range(N)]\n", - " word_cols = ['word_diff' + str(i) for i in range(N)]\n", - " sat_cols = ['sim_max', 'sim_min', 'sim_sum', 'sim_mean']\n", - " user_item_sim_cols = ['user_item_sim'] if user_emb else []\n", - " user_score_rank_label = ['score', 'rank', 'label']\n", - " cols = id_cols + sim_cols + time_cols + word_cols + sat_cols + user_item_sim_cols + user_score_rank_label\n", - " \n", - " # 转成DataFrame\n", - " df = pd.DataFrame( all_user_feas, columns=cols)\n", - " \n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T01:08:17.531694Z", - "start_time": "2020-11-18T01:08:10.754702Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-- Mem. 
usage decreased to 5.56 Mb (50.0% reduction),time spend:0.00 min\n" - ] - } - ], - "source": [ - "article_info_df = get_article_info_df()\n", - "all_click = click_trn.append(click_tst)\n", - "item_content_emb_dict, item_w2v_emb_dict, item_youtube_emb_dict, user_youtube_emb_dict = get_embedding(save_path, all_click)" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:06:22.709350Z", - "start_time": "2020-11-18T01:08:39.923811Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 200000/200000 [50:16<00:00, 66.31it/s] \n", - "100%|██████████| 50000/50000 [1:07:21<00:00, 12.37it/s]\n" - ] - } - ], - "source": [ - "# 获取训练验证及测试数据中召回列文章相关特征\n", - "trn_user_item_feats_df = create_feature(trn_user_item_label_tuples_dict.keys(), trn_user_item_label_tuples_dict, \\\n", - " click_trn_hist, article_info_df, item_content_emb_dict)\n", - "\n", - "if val_user_item_label_tuples_dict is not None:\n", - " val_user_item_feats_df = create_feature(val_user_item_label_tuples_dict.keys(), val_user_item_label_tuples_dict, \\\n", - " click_val_hist, article_info_df, item_content_emb_dict)\n", - "else:\n", - " val_user_item_feats_df = None\n", - " \n", - "tst_user_item_feats_df = create_feature(tst_user_item_label_tuples_dict.keys(), tst_user_item_label_tuples_dict, \\\n", - " click_tst_hist, article_info_df, item_content_emb_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:13:58.573422Z", - "start_time": "2020-11-18T03:13:40.157228Z" - } - }, - "outputs": [], - "source": [ - "# 保存一份省的每次都要重新跑,每次跑的时间都比较长\n", - "trn_user_item_feats_df.to_csv(save_path + 'trn_user_item_feats_df.csv', index=False)\n", - "\n", - "if val_user_item_feats_df is not None:\n", - " val_user_item_feats_df.to_csv(save_path + 'val_user_item_feats_df.csv', index=False)\n", - "\n", - 
"tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=False) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:14:22.838154Z", - "start_time": "2020-11-18T03:14:22.828212Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 用户和文章特征\n", - "### 用户相关特征\n", - "这一块,正式进行特征工程,既要拼接上已有的特征, 也会做更多的特征出来,我们来梳理一下已有的特征和可构造特征:\n", - "1. 文章自身的特征, 文章字数,文章创建时间, 文章的embedding (articles表中)\n", - "2. 用户点击环境特征, 那些设备的特征(这个在df中)\n", - "3. 对于用户和商品还可以构造的特征:\n", - " * 基于用户的点击文章次数和点击时间构造可以表现用户活跃度的特征\n", - " * 基于文章被点击次数和时间构造可以反映文章热度的特征\n", - " * 用户的时间统计特征: 根据其点击的历史文章列表的点击时间和文章的创建时间做统计特征,比如求均值, 这个可以反映用户对于文章时效的偏好\n", - " * 用户的主题爱好特征, 对于用户点击的历史文章主题进行一个统计, 然后对于当前文章看看是否属于用户已经点击过的主题\n", - " * 用户的字数爱好特征, 对于用户点击的历史文章的字数统计, 求一个均值" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-14T03:16:37.637495Z", - "start_time": "2020-11-14T03:16:37.618229Z" - } - }, - "outputs": [], - "source": [ - "click_tst.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:09:11.675550Z", - "start_time": "2020-11-17T02:09:10.265134Z" - } - }, - "outputs": [], - "source": [ - "# 读取文章特征\n", - "articles = pd.read_csv(data_path+'articles.csv')\n", - "articles = reduce_mem(articles)\n", - "\n", - "# 日志数据,就是前面的所有数据\n", - "if click_val is not None:\n", - " all_data = click_trn.append(click_val)\n", - "all_data = click_trn.append(click_tst)\n", - "all_data = reduce_mem(all_data)\n", - "\n", - "# 拼上文章信息\n", - "all_data = all_data.merge(articles, left_on='click_article_id', right_on='article_id')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-14T03:17:12.256244Z", - "start_time": "2020-11-14T03:17:12.250452Z" - } - }, - "outputs": [], - "source": [ 
- "all_data.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 分析一下点击时间和点击文章的次数,区分用户活跃度\n", - "如果某个用户点击文章之间的时间间隔比较小, 同时点击的文章次数很多的话, 那么我们认为这种用户一般就是活跃用户, 当然衡量用户活跃度的方式可能多种多样, 这里我们只提供其中一种,我们写一个函数, 得到可以衡量用户活跃度的特征,逻辑如下:\n", - "1. 首先根据用户user_id分组, 对于每个用户,计算点击文章的次数, 两两点击文章时间间隔的均值\n", - "2. 把点击次数取倒数和时间间隔的均值统一归一化,然后两者相加合并,该值越小, 说明用户越活跃\n", - "3. 注意, 上面两两点击文章的时间间隔均值, 会出现如果用户只点击了一次的情况,这时候时间间隔均值那里会出现空值, 对于这种情况最后特征那里给个大数进行区分\n", - "\n", - "这个的衡量标准就是先把点击的次数取到数然后归一化, 然后点击的时间差归一化, 然后两者相加进行合并, 该值越小, 说明被点击的次数越多, 且间隔时间短。 " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:28:55.336058Z", - "start_time": "2020-11-17T02:28:55.324332Z" - } - }, - "outputs": [], - "source": [ - " def active_level(all_data, cols):\n", - " \"\"\"\n", - " 制作区分用户活跃度的特征\n", - " :param all_data: 数据集\n", - " :param cols: 用到的特征列\n", - " \"\"\"\n", - " data = all_data[cols]\n", - " data.sort_values(['user_id', 'click_timestamp'], inplace=True)\n", - " user_act = pd.DataFrame(data.groupby('user_id', as_index=False)[['click_article_id', 'click_timestamp']].\\\n", - " agg({'click_article_id':np.size, 'click_timestamp': {list}}).values, columns=['user_id', 'click_size', 'click_timestamp'])\n", - " \n", - " # 计算时间间隔的均值\n", - " def time_diff_mean(l):\n", - " if len(l) == 1:\n", - " return 1\n", - " else:\n", - " return np.mean([j-i for i, j in list(zip(l[:-1], l[1:]))])\n", - " \n", - " user_act['time_diff_mean'] = user_act['click_timestamp'].apply(lambda x: time_diff_mean(x))\n", - " \n", - " # 点击次数取倒数\n", - " user_act['click_size'] = 1 / user_act['click_size']\n", - " \n", - " # 两者归一化\n", - " user_act['click_size'] = (user_act['click_size'] - user_act['click_size'].min()) / (user_act['click_size'].max() - user_act['click_size'].min())\n", - " user_act['time_diff_mean'] = (user_act['time_diff_mean'] - user_act['time_diff_mean'].min()) / (user_act['time_diff_mean'].max() - user_act['time_diff_mean'].min()) 
\n", - " user_act['active_level'] = user_act['click_size'] + user_act['time_diff_mean']\n", - " \n", - " user_act['user_id'] = user_act['user_id'].astype('int')\n", - " del user_act['click_timestamp']\n", - " \n", - " return user_act" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:30:12.696060Z", - "start_time": "2020-11-17T02:29:01.523837Z" - } - }, - "outputs": [], - "source": [ - "user_act_fea = active_level(all_data, ['user_id', 'click_article_id', 'click_timestamp'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:28:53.996742Z", - "start_time": "2020-11-17T02:09:18.374Z" - } - }, - "outputs": [], - "source": [ - "user_act_fea.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 分析一下点击时间和被点击文章的次数, 衡量文章热度特征\n", - "和上面同样的思路, 如果一篇文章在很短的时间间隔之内被点击了很多次, 说明文章比较热门,实现的逻辑和上面的基本一致, 只不过这里是按照点击的文章进行分组:\n", - "1. 根据文章进行分组, 对于每篇文章的用户, 计算点击的时间间隔\n", - "2. 
将用户的数量取倒数, 然后用户的数量和时间间隔归一化, 然后相加得到热度特征, 该值越小, 说明被点击的次数越大且时间间隔越短, 文章比较热\n", - "\n", - "当然, 这只是给出一种判断文章热度的一种方法, 这里大家也可以头脑风暴一下" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T02:41:26.398567Z", - "start_time": "2020-11-17T02:41:26.386668Z" - } - }, - "outputs": [], - "source": [ - " def hot_level(all_data, cols):\n", - " \"\"\"\n", - " 制作衡量文章热度的特征\n", - " :param all_data: 数据集\n", - " :param cols: 用到的特征列\n", - " \"\"\"\n", - " data = all_data[cols]\n", - " data.sort_values(['click_article_id', 'click_timestamp'], inplace=True)\n", - " article_hot = pd.DataFrame(data.groupby('click_article_id', as_index=False)[['user_id', 'click_timestamp']].\\\n", - " agg({'user_id':np.size, 'click_timestamp': {list}}).values, columns=['click_article_id', 'user_num', 'click_timestamp'])\n", - " \n", - " # 计算被点击时间间隔的均值\n", - " def time_diff_mean(l):\n", - " if len(l) == 1:\n", - " return 1\n", - " else:\n", - " return np.mean([j-i for i, j in list(zip(l[:-1], l[1:]))])\n", - " \n", - " article_hot['time_diff_mean'] = article_hot['click_timestamp'].apply(lambda x: time_diff_mean(x))\n", - " \n", - " # 点击次数取倒数\n", - " article_hot['user_num'] = 1 / article_hot['user_num']\n", - " \n", - " # 两者归一化\n", - " article_hot['user_num'] = (article_hot['user_num'] - article_hot['user_num'].min()) / (article_hot['user_num'].max() - article_hot['user_num'].min())\n", - " article_hot['time_diff_mean'] = (article_hot['time_diff_mean'] - article_hot['time_diff_mean'].min()) / (article_hot['time_diff_mean'].max() - article_hot['time_diff_mean'].min()) \n", - " article_hot['hot_level'] = article_hot['user_num'] + article_hot['time_diff_mean']\n", - " \n", - " article_hot['click_article_id'] = article_hot['click_article_id'].astype('int')\n", - " \n", - " del article_hot['click_timestamp']\n", - " \n", - " return article_hot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - 
"end_time": "2020-11-17T02:41:44.635900Z", - "start_time": "2020-11-17T02:41:31.473032Z" - } - }, - "outputs": [], - "source": [ - "article_hot_fea = hot_level(all_data, ['user_id', 'click_article_id', 'click_timestamp']) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-14T03:19:54.775290Z", - "start_time": "2020-11-14T03:19:54.763699Z" - } - }, - "outputs": [], - "source": [ - "article_hot_fea.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的系列习惯\n", - "这个基于原来的日志表做一个类似于article的那种DataFrame, 存放用户特有的信息, 主要包括点击习惯, 爱好特征之类的\n", - "* 用户的设备习惯, 这里取最常用的设备(众数)\n", - "* 用户的时间习惯: 根据其点击过得历史文章的时间来做一个统计(这个感觉最好是把时间戳里的时间特征的h特征提出来,看看用户习惯一天的啥时候点击文章), 但这里先用转换的时间吧, 求个均值\n", - "* 用户的爱好特征, 对于用户点击的历史文章主题进行用户的爱好判别, 更偏向于哪几个主题, 这个最好是multi-hot进行编码, 先试试行不\n", - "* 用户文章的字数差特征, 用户的爱好文章的字数习惯\n", - "\n", - "这些就是对用户进行分组, 然后统计即可" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的设备习惯" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T04:22:48.877978Z", - "start_time": "2020-11-17T04:22:48.872049Z" - } - }, - "outputs": [], - "source": [ - "def device_fea(all_data, cols):\n", - " \"\"\"\n", - " 制作用户的设备特征\n", - " :param all_data: 数据集\n", - " :param cols: 用到的特征列\n", - " \"\"\"\n", - " user_device_info = all_data[cols]\n", - " \n", - " # 用众数来表示每个用户的设备信息\n", - " user_device_info = user_device_info.groupby('user_id').agg(lambda x: x.value_counts().index[0]).reset_index()\n", - " \n", - " return user_device_info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T05:27:10.897473Z", - "start_time": "2020-11-17T04:49:33.214865Z" - } - }, - "outputs": [], - "source": [ - "# 设备特征(这里时间会比较长)\n", - "device_cols = ['user_id', 'click_environment', 'click_deviceGroup', 'click_os', 'click_country', 'click_region', 
'click_referrer_type']\n", - "user_device_info = device_fea(all_data, device_cols)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-14T04:20:39.765842Z", - "start_time": "2020-11-14T04:20:39.747087Z" - } - }, - "outputs": [], - "source": [ - "user_device_info.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的时间习惯" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:11:50.889905Z", - "start_time": "2020-11-17T06:11:50.882653Z" - } - }, - "outputs": [], - "source": [ - "def user_time_hob_fea(all_data, cols):\n", - " \"\"\"\n", - " 制作用户的时间习惯特征\n", - " :param all_data: 数据集\n", - " :param cols: 用到的特征列\n", - " \"\"\"\n", - " user_time_hob_info = all_data[cols]\n", - " \n", - " # 先把时间戳进行归一化\n", - " mm = MinMaxScaler()\n", - " user_time_hob_info['click_timestamp'] = mm.fit_transform(user_time_hob_info[['click_timestamp']])\n", - " user_time_hob_info['created_at_ts'] = mm.fit_transform(user_time_hob_info[['created_at_ts']])\n", - "\n", - " user_time_hob_info = user_time_hob_info.groupby('user_id').agg('mean').reset_index()\n", - " \n", - " user_time_hob_info.rename(columns={'click_timestamp': 'user_time_hob1', 'created_at_ts': 'user_time_hob2'}, inplace=True)\n", - " return user_time_hob_info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:31:51.646110Z", - "start_time": "2020-11-17T06:31:51.171431Z" - } - }, - "outputs": [], - "source": [ - "user_time_hob_cols = ['user_id', 'click_timestamp', 'created_at_ts']\n", - "user_time_hob_info = user_time_hob_fea(all_data, user_time_hob_cols)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的主题爱好\n", - "这里先把用户点击的文章属于的主题转成一个列表, 后面再总的汇总的时候单独制作一个特征, 就是文章的主题如果属于这里面, 就是1, 否则就是0。" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:31:56.571088Z", - "start_time": "2020-11-17T06:31:56.565304Z" - } - }, - "outputs": [], - "source": [ - "def user_cat_hob_fea(all_data, cols):\n", - " \"\"\"\n", - " 用户的主题爱好\n", - " :param all_data: 数据集\n", - " :param cols: 用到的特征列\n", - " \"\"\"\n", - " user_category_hob_info = all_data[cols]\n", - " user_category_hob_info = user_category_hob_info.groupby('user_id').agg({list}).reset_index()\n", - " \n", - " user_cat_hob_info = pd.DataFrame()\n", - " user_cat_hob_info['user_id'] = user_category_hob_info['user_id']\n", - " user_cat_hob_info['cate_list'] = user_category_hob_info['category_id']\n", - " \n", - " return user_cat_hob_info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:32:55.150800Z", - "start_time": "2020-11-17T06:32:00.740046Z" - } - }, - "outputs": [], - "source": [ - "user_category_hob_cols = ['user_id', 'category_id']\n", - "user_cat_hob_info = user_cat_hob_fea(all_data, user_category_hob_cols)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的字数偏好特征" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:48:12.988460Z", - "start_time": "2020-11-17T06:48:12.547000Z" - } - }, - "outputs": [], - "source": [ - "user_wcou_info = all_data.groupby('user_id')['words_count'].agg('mean').reset_index()\n", - "user_wcou_info.rename(columns={'words_count': 'words_hbo'}, inplace=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 用户的信息特征合并保存" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:48:18.289591Z", - "start_time": "2020-11-17T06:48:17.084408Z" - } - }, - "outputs": [], - "source": [ - "# 所有表进行合并\n", - "user_info = pd.merge(user_act_fea, user_device_info, on='user_id')\n", - "user_info = 
user_info.merge(user_time_hob_info, on='user_id')\n", - "user_info = user_info.merge(user_cat_hob_info, on='user_id')\n", - "user_info = user_info.merge(user_wcou_info, on='user_id')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-17T06:48:26.907785Z", - "start_time": "2020-11-17T06:48:21.457597Z" - } - }, - "outputs": [], - "source": [ - "# 这样用户特征以后就可以直接读取了\n", - "user_info.to_csv(save_path + 'user_info.csv', index=False) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户特征直接读入\n", - "如果前面关于用户的特征工程已经给做完了,后面可以直接读取" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:15:49.502826Z", - "start_time": "2020-11-18T03:15:48.062243Z" - } - }, - "outputs": [], - "source": [ - "# 把用户信息直接读入进来\n", - "user_info = pd.read_csv(save_path + 'user_info.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:15:56.899635Z", - "start_time": "2020-11-18T03:15:53.701818Z" - } - }, - "outputs": [], - "source": [ - "if os.path.exists(save_path + 'trn_user_item_feats_df.csv'):\n", - " trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')\n", - " \n", - "if os.path.exists(save_path + 'tst_user_item_feats_df.csv'):\n", - " tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')\n", - "\n", - "if os.path.exists(save_path + 'val_user_item_feats_df.csv'):\n", - " val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')\n", - "else:\n", - " val_user_item_feats_df = None" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:16:02.739197Z", - "start_time": "2020-11-18T03:16:01.725028Z" - } - }, - 
"outputs": [], - "source": [ - "# 拼上用户特征\n", - "# 下面是线下验证的\n", - "trn_user_item_feats_df = trn_user_item_feats_df.merge(user_info, on='user_id', how='left')\n", - "\n", - "if val_user_item_feats_df is not None:\n", - " val_user_item_feats_df = val_user_item_feats_df.merge(user_info, on='user_id', how='left')\n", - "else:\n", - " val_user_item_feats_df = None\n", - " \n", - "tst_user_item_feats_df = tst_user_item_feats_df.merge(user_info, on='user_id',how='left')" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:16:06.989877Z", - "start_time": "2020-11-18T03:16:06.983327Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['user_id', 'click_article_id', 'sim0', 'time_diff0', 'word_diff0',\n", - " 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score', 'rank', 'label',\n", - " 'click_size', 'time_diff_mean', 'active_level', 'click_environment',\n", - " 'click_deviceGroup', 'click_os', 'click_country', 'click_region',\n", - " 'click_referrer_type', 'user_time_hob1', 'user_time_hob2', 'cate_list',\n", - " 'words_hbo'],\n", - " dtype='object')" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "trn_user_item_feats_df.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-14T03:13:36.071236Z", - "start_time": "2020-11-14T03:13:36.050188Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 文章的特征直接读入" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:16:12.793070Z", - "start_time": "2020-11-18T03:16:12.425380Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-- Mem. 
usage decreased to 5.56 Mb (50.0% reduction),time spend:0.00 min\n" - ] - } - ], - "source": [ - "articles = pd.read_csv(data_path+'articles.csv')\n", - "articles = reduce_mem(articles)" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:16:18.118507Z", - "start_time": "2020-11-18T03:16:16.344338Z" - } - }, - "outputs": [], - "source": [ - "# 拼上文章特征\n", - "trn_user_item_feats_df = trn_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')\n", - "\n", - "if val_user_item_feats_df is not None:\n", - " val_user_item_feats_df = val_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')\n", - "else:\n", - " val_user_item_feats_df = None\n", - "\n", - "tst_user_item_feats_df = tst_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 召回文章的主题是否在用户的爱好里面" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:17:40.251797Z", - "start_time": "2020-11-18T03:16:28.130012Z" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 制作特征和标签, 转成监督学习问题\n", + "我们先捋一下基于原始的给定数据, 有哪些特征可以直接利用:\n", + "1. 文章的自身特征, category_id表示这文章的类型, created_at_ts表示文章建立的时间, 这个关系着文章的时效性, words_count是文章的字数, 一般字数太长我们不太喜欢点击, 也不排除有人就喜欢读长文。\n", + "2. 文章的内容embedding特征, 这个召回的时候用过, 这里可以选择使用, 也可以选择不用, 也可以尝试其他类型的embedding特征, 比如W2V等\n", + "3. 用户的设备特征信息\n", + "\n", + "上面这些直接可以用的特征, 待做完特征工程之后, 直接就可以根据article_id或者是user_id把这些特征加入进去。 但是我们需要先基于召回的结果, 构造一些特征,然后制作标签,形成一个监督学习的数据集。

\n", + "构造监督数据集的思路, 根据召回结果, 我们会得到一个{user_id: [可能点击的文章列表]}形式的字典。 那么我们就可以对于每个用户, 每篇可能点击的文章构造一个监督数据集, 比如对于用户user1, 假设得到他的召回列表{user1: [item1, item2, item3]}, 我们就可以得到三行数据(user1, item1), (user1, item2), (user1, item3)的形式, 这就是监督数据集的前两列特征。

\n", + "\n", + "构造特征的思路是这样, 我们知道每个用户的点击文章是与其历史点击的文章信息是有很大关联的, 比如同一个主题, 相似等等。 所以特征构造这块很重要的一系列特征**是要结合用户的历史点击文章信息**。我们已经得到了每个用户及点击候选文章的两列的一个数据集, 而我们的目的是要预测最后一次点击的文章, 比较自然的一个思路就是和其最后几次点击的文章产生关系, 这样既考虑了其历史点击文章信息, 又得离最后一次点击较近,因为新闻很大的一个特点就是注重时效性。 往往用户的最后一次点击会和其最后几次点击有很大的关联。 所以我们就可以对于每个候选文章, 做出与最后几次点击相关的特征如下:\n", + "1. 候选item与最后几次点击的相似性特征(embedding内积) --- 这个直接关联用户历史行为\n", + "2. 候选item与最后几次点击的相似性特征的统计特征 --- 统计特征可以减少一些波动和异常\n", + "3. 候选item与最后几次点击文章的字数差的特征 --- 可以通过字数看用户偏好\n", + "4. 候选item与最后几次点击的文章建立的时间差特征 --- 时间差特征可以看出该用户对于文章的实时性的偏好 \n", + "\n", + "\n", + "还需要考虑一下\n", + "**5. 如果使用了youtube召回的话, 我们还可以制作用户与候选item的相似特征**\n", + "\n", + "\n", + "\n", + "当然, 上面只是提供了一种基于用户历史行为做特征工程的思路, 大家也可以思维风暴一下,尝试一些其他的特征。 下面我们就实现上面的这些特征的制作, 下面的逻辑是这样:\n", + "1. 我们首先获得用户的最后一次点击操作和用户的历史点击, 这个基于我们的日志数据集做\n", + "2. 基于用户的历史行为制作特征, 这个会用到用户的历史点击表, 最后的召回列表, 文章的信息表和embedding向量\n", + "3. 制作标签, 形成最后的监督学习数据集" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 导包" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:00.341709Z", + "start_time": "2020-11-17T09:06:58.723900Z" + }, + "cell_style": "center", + "scrolled": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pickle\n", + "from tqdm import tqdm\n", + "import gc, os\n", + "import logging\n", + "import time\n", + "import lightgbm as lgb\n", + "from gensim.models import Word2Vec\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# df节省内存函数" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:02.411005Z", + "start_time": "2020-11-17T09:07:02.397830Z" + } + }, + "outputs": [], + "source": [ + "# 节省内存的一个函数\n", + "# 减少内存\n", + "def reduce_mem(df):\n", + " starttime = time.time()\n", + " 
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", + " start_mem = df.memory_usage().sum() / 1024**2\n", + " for col in df.columns:\n", + " col_type = df[col].dtypes\n", + " if col_type in numerics:\n", + " c_min = df[col].min()\n", + " c_max = df[col].max()\n", + " if pd.isnull(c_min) or pd.isnull(c_max):\n", + " continue\n", + " if str(col_type)[:3] == 'int':\n", + " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", + " df[col] = df[col].astype(np.int8)\n", + " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", + " df[col] = df[col].astype(np.int16)\n", + " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", + " df[col] = df[col].astype(np.int32)\n", + " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", + " df[col] = df[col].astype(np.int64)\n", + " else:\n", + " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", + " df[col] = df[col].astype(np.float16)\n", + " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", + " df[col] = df[col].astype(np.float32)\n", + " else:\n", + " df[col] = df[col].astype(np.float64)\n", + " end_mem = df.memory_usage().sum() / 1024**2\n", + " print('-- Mem. 
usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,\n", + " 100*(start_mem-end_mem)/start_mem,\n", + " (time.time()-starttime)/60))\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:05.031436Z", + "start_time": "2020-11-17T09:07:05.026822Z" + } + }, + "outputs": [], + "source": [ + "data_path = './data_raw/'\n", + "save_path = './temp_results/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 数据读取\n", + "\n", + "## 训练和验证集的划分\n", + "\n", + "划分训练和验证集的原因是为了在线下验证模型参数的好坏,为了完全模拟测试集,我们这里就在训练集中抽取部分用户的所有信息来作为验证集。提前做训练验证集划分的好处就是可以分解制作排序特征时的压力,一次性做整个数据集的排序特征可能时间会比较长。" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:07.230308Z", + "start_time": "2020-11-17T09:07:07.221081Z" + } + }, + "outputs": [], + "source": [ + "# all_click_df指的是训练集\n", + "# sample_user_nums 采样作为验证集的用户数量\n", + "def trn_val_split(all_click_df, sample_user_nums):\n", + " all_click = all_click_df\n", + " all_user_ids = all_click.user_id.unique()\n", + " \n", + " # replace=True表示可以重复抽样,反之不可以\n", + " sample_user_ids = np.random.choice(all_user_ids, size=sample_user_nums, replace=False) \n", + " \n", + " click_val = all_click[all_click['user_id'].isin(sample_user_ids)]\n", + " click_trn = all_click[~all_click['user_id'].isin(sample_user_ids)]\n", + " \n", + " # 将验证集中的最后一次点击给抽取出来作为答案\n", + " click_val = click_val.sort_values(['user_id', 'click_timestamp'])\n", + " val_ans = click_val.groupby('user_id').tail(1)\n", + " \n", + " click_val = click_val.groupby('user_id').apply(lambda x: x[:-1]).reset_index(drop=True)\n", + " \n", + " # 去除val_ans中某些用户只有一个点击数据的情况,如果该用户只有一个点击数据,又被分到ans中,\n", + " # 那么训练集中就没有这个用户的点击数据,出现用户冷启动问题,给自己模型验证带来麻烦\n", + " val_ans = val_ans[val_ans.user_id.isin(click_val.user_id.unique())] # 保证答案中出现的用户再验证集中还有\n", + " click_val = 
click_val[click_val.user_id.isin(val_ans.user_id.unique())]\n", + " \n", + " return click_trn, click_val, val_ans" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 获取历史点击和最后一次点击" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:19.202550Z", + "start_time": "2020-11-17T09:07:19.195766Z" + } + }, + "outputs": [], + "source": [ + "# 获取当前数据的历史点击和最后一次点击\n", + "def get_hist_and_last_click(all_click):\n", + " all_click = all_click.sort_values(by=['user_id', 'click_timestamp'])\n", + " click_last_df = all_click.groupby('user_id').tail(1)\n", + "\n", + " # 如果用户只有一个点击,hist为空了,会导致训练的时候这个用户不可见,此时默认泄露一下\n", + " def hist_func(user_df):\n", + " if len(user_df) == 1:\n", + " return user_df\n", + " else:\n", + " return user_df[:-1]\n", + "\n", + " click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)\n", + "\n", + " return click_hist_df, click_last_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取训练、验证及测试集" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:21.181211Z", + "start_time": "2020-11-17T09:07:21.171338Z" + } + }, + "outputs": [], + "source": [ + "def get_trn_val_tst_data(data_path, offline=True):\n", + " if offline:\n", + " click_trn_data = pd.read_csv(data_path+'train_click_log.csv') # 训练集用户点击日志\n", + " click_trn_data = reduce_mem(click_trn_data)\n", + " click_trn, click_val, val_ans = trn_val_split(click_trn_data, sample_user_nums)\n", + " else:\n", + " click_trn = pd.read_csv(data_path+'train_click_log.csv')\n", + " click_trn = reduce_mem(click_trn)\n", + " click_val = None\n", + " val_ans = None\n", + " \n", + " click_tst = pd.read_csv(data_path+'testA_click_log.csv')\n", + " \n", + " return click_trn, click_val, click_tst, val_ans" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取召回列表" + ] + 
}, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:23.210604Z", + "start_time": "2020-11-17T09:07:23.203652Z" + } + }, + "outputs": [], + "source": [ + "# 返回多路召回列表或者单路召回\n", + "def get_recall_list(save_path, single_recall_model=None, multi_recall=False):\n", + " if multi_recall:\n", + " return pickle.load(open(save_path + 'final_recall_items_dict.pkl', 'rb'))\n", + " \n", + " if single_recall_model == 'i2i_itemcf':\n", + " return pickle.load(open(save_path + 'itemcf_recall_dict.pkl', 'rb'))\n", + " elif single_recall_model == 'i2i_emb_itemcf':\n", + " return pickle.load(open(save_path + 'itemcf_emb_dict.pkl', 'rb'))\n", + " elif single_recall_model == 'user_cf':\n", + " return pickle.load(open(save_path + 'youtubednn_usercf_dict.pkl', 'rb'))\n", + " elif single_recall_model == 'youtubednn':\n", + " return pickle.load(open(save_path + 'youtube_u2i_dict.pkl', 'rb'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取各种Embedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Word2Vec训练及gensim的使用\n", + "\n", + "Word2Vec主要思想是:一个词的上下文可以很好的表达出词的语义。通过无监督学习产生词向量的方式。word2vec中有两个非常经典的模型:skip-gram和cbow。\n", + "\n", + "- skip-gram:已知中心词预测周围词。\n", + "- cbow:已知周围词预测中心词。\n", + "![image-20201106225233086](https://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20201106225233086.png)\n", + "\n", + "在使用gensim训练word2vec的时候,有几个比较重要的参数\n", + "- size: 表示词向量的维度。\n", + "- window:决定了目标词会与多远距离的上下文产生关系。\n", + "- sg: 如果是0,则是CBOW模型,是1则是Skip-Gram模型。\n", + "- workers: 表示训练时候的线程数量\n", + "- min_count: 设置最小的\n", + "- iter: 训练时遍历整个数据集的次数\n", + "\n", + "**注意**\n", + "1. 训练的时候输入的语料库一定要是字符组成的二维数组,如:[['北', '京', '你', '好'], ['上', '海', '你', '好']]\n", + "2. 
使用模型的时候有一些默认值,可以通过在Jupyter里面通过`Word2Vec??`查看\n", + "\n", + "\n", + "下面是个简单的测试样例:\n", + "```\n", + "from gensim.models import Word2Vec\n", + "docs = [['30760', '157507'],\n", + " ['289197', '63746'],\n", + " ['36162', '168401'],\n", + " ['50644', '36162']]\n", + "w2v = Word2Vec(docs, size=12, sg=1, window=2, seed=2020, workers=2, min_count=1, iter=1)\n", + "\n", + "# 查看'30760'表示的词向量\n", + "w2v['30760']\n", + "```\n", + "\n", + "skip-gram和cbow的详细原理可以参考下面的博客:\n", + "- [word2vec原理(一) CBOW与Skip-Gram模型基础](https://www.cnblogs.com/pinard/p/7160330.html) \n", + "- [word2vec原理(二) 基于Hierarchical Softmax的模型](https://www.cnblogs.com/pinard/p/7243513.html) \n", + "- [word2vec原理(三) 基于Negative Sampling的模型](https://www.cnblogs.com/pinard/p/7249903.html) " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:26.676173Z", + "start_time": "2020-11-17T09:07:26.667926Z" + } + }, + "outputs": [], + "source": [ + "def trian_item_word2vec(click_df, embed_size=64, save_name='item_w2v_emb.pkl', split_char=' '):\n", + " click_df = click_df.sort_values('click_timestamp')\n", + " # 只有转换成字符串才可以进行训练\n", + " click_df['click_article_id'] = click_df['click_article_id'].astype(str)\n", + " # 转换成句子的形式\n", + " docs = click_df.groupby(['user_id'])['click_article_id'].apply(lambda x: list(x)).reset_index()\n", + " docs = docs['click_article_id'].values.tolist()\n", + "\n", + " # 为了方便查看训练的进度,这里设定一个log信息\n", + " logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)\n", + "\n", + " # 这里的参数对训练得到的向量影响也很大,默认负采样为5\n", + " w2v = Word2Vec(docs, size=16, sg=1, window=5, seed=2020, workers=24, min_count=1, iter=1)\n", + " \n", + " # 保存成字典的形式\n", + " item_w2v_emb_dict = {k: w2v[k] for k in click_df['click_article_id']}\n", + " pickle.dump(item_w2v_emb_dict, open(save_path + 'item_w2v_emb.pkl', 'wb'))\n", + " \n", + " return item_w2v_emb_dict" + ] + }, + { + "cell_type": "code", + "execution_count": 9, +
"metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:27.285690Z", + "start_time": "2020-11-17T09:07:27.276646Z" + } + }, + "outputs": [], + "source": [ + "# 可以通过字典查询对应的item的Embedding\n", + "def get_embedding(save_path, all_click_df):\n", + " if os.path.exists(save_path + 'item_content_emb.pkl'):\n", + " item_content_emb_dict = pickle.load(open(save_path + 'item_content_emb.pkl', 'rb'))\n", + " else:\n", + " print('item_content_emb.pkl 文件不存在...')\n", + " \n", + " # w2v Embedding是需要提前训练好的\n", + " if os.path.exists(save_path + 'item_w2v_emb.pkl'):\n", + " item_w2v_emb_dict = pickle.load(open(save_path + 'item_w2v_emb.pkl', 'rb'))\n", + " else:\n", + " item_w2v_emb_dict = trian_item_word2vec(all_click_df)\n", + " \n", + " if os.path.exists(save_path + 'item_youtube_emb.pkl'):\n", + " item_youtube_emb_dict = pickle.load(open(save_path + 'item_youtube_emb.pkl', 'rb'))\n", + " else:\n", + " print('item_youtube_emb.pkl 文件不存在...')\n", + " \n", + " if os.path.exists(save_path + 'user_youtube_emb.pkl'):\n", + " user_youtube_emb_dict = pickle.load(open(save_path + 'user_youtube_emb.pkl', 'rb'))\n", + " else:\n", + " print('user_youtube_emb.pkl 文件不存在...')\n", + " \n", + " return item_content_emb_dict, item_w2v_emb_dict, item_youtube_emb_dict, user_youtube_emb_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取文章信息" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:07:28.391797Z", + "start_time": "2020-11-17T09:07:28.386650Z" + } + }, + "outputs": [], + "source": [ + "def get_article_info_df():\n", + " article_info_df = pd.read_csv(data_path + 'articles.csv')\n", + " article_info_df = reduce_mem(article_info_df)\n", + " \n", + " return article_info_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取数据" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": 
"2020-11-17T09:07:32.362045Z", + "start_time": "2020-11-17T09:07:29.490413Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- Mem. usage decreased to 23.34 Mb (69.4% reduction),time spend:0.00 min\n" + ] + } + ], + "source": [ + "# 这里offline的online的区别就是验证集是否为空\n", + "click_trn, click_val, click_tst, val_ans = get_trn_val_tst_data(data_path, offline=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:11:10.378966Z", + "start_time": "2020-11-17T09:07:32.468580Z" + } + }, + "outputs": [], + "source": [ + "click_trn_hist, click_trn_last = get_hist_and_last_click(click_trn)\n", + "\n", + "if click_val is not None:\n", + " click_val_hist, click_val_last = click_val, val_ans\n", + "else:\n", + " click_val_hist, click_val_last = None, None\n", + " \n", + "click_tst_hist = click_tst" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 对训练数据做负采样\n", + "\n", + "通过召回我们将数据转换成三元组的形式(user1, item1, label)的形式,观察发现正负样本差距极度不平衡,我们可以先对负样本进行下采样,下采样的目的一方面缓解了正负样本比例的问题,另一方面也减小了我们做排序特征的压力,我们在做负采样的时候又有哪些东西是需要注意的呢?\n", + "\n", + "1. 只对负样本进行下采样(如果有比较好的正样本扩充的方法其实也是可以考虑的)\n", + "2. 负采样之后,保证所有的用户和文章仍然出现在采样之后的数据中\n", + "3. 下采样的比例可以根据实际情况人为的控制\n", + "4. 
做完负采样之后,更新此时新的用户召回文章列表,因为后续做特征的时候可能用到相对位置的信息。\n", + "\n", + "其实负采样也可以留在后面做完特征在进行,这里由于做排序特征太慢了,所以把负采样的环节提到前面了。" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:11:36.096678Z", + "start_time": "2020-11-17T09:11:36.090911Z" + } + }, + "outputs": [], + "source": [ + "# 将召回列表转换成df的形式\n", + "def recall_dict_2_df(recall_list_dict):\n", + " df_row_list = [] # [user, item, score]\n", + " for user, recall_list in tqdm(recall_list_dict.items()):\n", + " for item, score in recall_list:\n", + " df_row_list.append([user, item, score])\n", + " \n", + " col_names = ['user_id', 'sim_item', 'score']\n", + " recall_list_df = pd.DataFrame(df_row_list, columns=col_names)\n", + " \n", + " return recall_list_df" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:11:37.668844Z", + "start_time": "2020-11-17T09:11:37.659774Z" + } + }, + "outputs": [], + "source": [ + "# 负采样函数,这里可以控制负采样时的比例, 这里给了一个默认的值\n", + "def neg_sample_recall_data(recall_items_df, sample_rate=0.001):\n", + " pos_data = recall_items_df[recall_items_df['label'] == 1]\n", + " neg_data = recall_items_df[recall_items_df['label'] == 0]\n", + " \n", + " print('pos_data_num:', len(pos_data), 'neg_data_num:', len(neg_data), 'pos/neg:', len(pos_data)/len(neg_data))\n", + " \n", + " # 分组采样函数\n", + " def neg_sample_func(group_df):\n", + " neg_num = len(group_df)\n", + " sample_num = max(int(neg_num * sample_rate), 1) # 保证最少有一个\n", + " sample_num = min(sample_num, 5) # 保证最多不超过5个,这里可以根据实际情况进行选择\n", + " return group_df.sample(n=sample_num, replace=True)\n", + " \n", + " # 对用户进行负采样,保证所有用户都在采样后的数据中\n", + " neg_data_user_sample = neg_data.groupby('user_id', group_keys=False).apply(neg_sample_func)\n", + " # 对文章进行负采样,保证所有文章都在采样后的数据中\n", + " neg_data_item_sample = neg_data.groupby('sim_item', group_keys=False).apply(neg_sample_func)\n", + " \n", + " # 将上述两种情况下的采样数据合并\n", + " neg_data_new = 
neg_data_user_sample.append(neg_data_item_sample)\n", + " # 由于上述两个操作是分开的,可能将两个相同的数据给重复选择了,所以需要对合并后的数据进行去重\n", + " neg_data_new = neg_data_new.sort_values(['user_id', 'score']).drop_duplicates(['user_id', 'sim_item'], keep='last')\n", + " \n", + " # 将正样本数据合并\n", + " data_new = pd.concat([pos_data, neg_data_new], ignore_index=True)\n", + " \n", + " return data_new" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:11:39.481715Z", + "start_time": "2020-11-17T09:11:39.475144Z" + } + }, + "outputs": [], + "source": [ + "# 召回数据打标签\n", + "def get_rank_label_df(recall_list_df, label_df, is_test=False):\n", + " # 测试集是没有标签了,为了后面代码同一一些,这里直接给一个负数替代\n", + " if is_test:\n", + " recall_list_df['label'] = -1\n", + " return recall_list_df\n", + " \n", + " label_df = label_df.rename(columns={'click_article_id': 'sim_item'})\n", + " recall_list_df_ = recall_list_df.merge(label_df[['user_id', 'sim_item', 'click_timestamp']], \\\n", + " how='left', on=['user_id', 'sim_item'])\n", + " recall_list_df_['label'] = recall_list_df_['click_timestamp'].apply(lambda x: 0.0 if np.isnan(x) else 1.0)\n", + " del recall_list_df_['click_timestamp']\n", + " \n", + " return recall_list_df_" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T09:11:41.555566Z", + "start_time": "2020-11-17T09:11:41.546766Z" + } + }, + "outputs": [], + "source": [ + "def get_user_recall_item_label_df(click_trn_hist, click_val_hist, click_tst_hist,click_trn_last, click_val_last, recall_list_df):\n", + " # 获取训练数据的召回列表\n", + " trn_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_trn_hist['user_id'].unique())]\n", + " # 训练数据打标签\n", + " trn_user_item_label_df = get_rank_label_df(trn_user_items_df, click_trn_last, is_test=False)\n", + " # 训练数据负采样\n", + " trn_user_item_label_df = neg_sample_recall_data(trn_user_item_label_df)\n", + " \n", + " if click_val is not 
None:\n", + " val_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_val_hist['user_id'].unique())]\n", + " val_user_item_label_df = get_rank_label_df(val_user_items_df, click_val_last, is_test=False)\n", + " val_user_item_label_df = neg_sample_recall_data(val_user_item_label_df)\n", + " else:\n", + " val_user_item_label_df = None\n", + " \n", + " # 测试数据不需要进行负采样,直接对所有的召回商品进行打-1标签\n", + " tst_user_items_df = recall_list_df[recall_list_df['user_id'].isin(click_tst_hist['user_id'].unique())]\n", + " tst_user_item_label_df = get_rank_label_df(tst_user_items_df, None, is_test=True)\n", + " \n", + " return trn_user_item_label_df, val_user_item_label_df, tst_user_item_label_df" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T17:23:35.357045Z", + "start_time": "2020-11-17T17:23:12.378284Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 250000/250000 [00:12<00:00, 20689.39it/s]\n" + ] + } + ], + "source": [ + "# 读取召回列表\n", + "recall_list_dict = get_recall_list(save_path, single_recall_model='i2i_itemcf') # 这里只选择了单路召回的结果,也可以选择多路召回结果\n", + "# 将召回数据转换成df\n", + "recall_list_df = recall_dict_2_df(recall_list_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T17:29:04.598214Z", + "start_time": "2020-11-17T17:23:40.001052Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pos_data_num: 64190 neg_data_num: 1935810 pos/neg: 0.03315924600038227\n" + ] + } + ], + "source": [ + "# 给训练验证数据打标签,并负采样(这一部分时间比较久)\n", + "trn_user_item_label_df, val_user_item_label_df, tst_user_item_label_df = get_user_recall_item_label_df(click_trn_hist, \n", + " click_val_hist, \n", + " click_tst_hist,\n", + " click_trn_last, \n", + " click_val_last, \n", + " recall_list_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T17:23:11.642944Z", + "start_time": "2020-11-17T17:23:08.475Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "trn_user_item_label_df.label" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 将召回数据转换成字典" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T17:36:22.800449Z", + "start_time": "2020-11-17T17:36:22.794670Z" + } + }, + "outputs": [], + "source": [ + "# 将最终的召回的df数据转换成字典的形式做排序特征\n", + "def make_tuple_func(group_df):\n", + " row_data = []\n", + " for name, row_df in group_df.iterrows():\n", + " row_data.append((row_df['sim_item'], row_df['score'], row_df['label']))\n", + " \n", + " return row_data" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T17:40:05.991819Z", + "start_time": "2020-11-17T17:36:26.536429Z" + } + }, + "outputs": [], + "source": [ + "trn_user_item_label_tuples = trn_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", + "trn_user_item_label_tuples_dict = dict(zip(trn_user_item_label_tuples['user_id'], trn_user_item_label_tuples[0]))\n", + "\n", + "if val_user_item_label_df is not None:\n", + " val_user_item_label_tuples = val_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", + " val_user_item_label_tuples_dict = dict(zip(val_user_item_label_tuples['user_id'], val_user_item_label_tuples[0]))\n", + "else:\n", + " val_user_item_label_tuples_dict = None\n", + " \n", + "tst_user_item_label_tuples = tst_user_item_label_df.groupby('user_id').apply(make_tuple_func).reset_index()\n", + "tst_user_item_label_tuples_dict = dict(zip(tst_user_item_label_tuples['user_id'], tst_user_item_label_tuples[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T07:59:53.141560Z", + "start_time": 
"2020-11-17T07:59:53.133599Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 特征工程" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 制作与用户历史行为相关特征\n", + "对于每个用户召回的每个商品, 做特征。 具体步骤如下:\n", + "* 对于每个用户, 获取最后点击的N个商品的item_id, \n", + " * 对于该用户的每个召回商品, 计算与上面最后N次点击商品的相似度的和(最大, 最小,均值), 时间差特征,相似性特征,字数差特征,与该用户的相似性特征" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T01:07:47.268035Z", + "start_time": "2020-11-18T01:07:47.250449Z" + } + }, + "outputs": [], + "source": [ + "# 下面基于data做历史相关的特征\n", + "def create_feature(users_id, recall_list, click_hist_df, articles_info, articles_emb, user_emb=None, N=1):\n", + " \"\"\"\n", + " 基于用户的历史行为做相关特征\n", + " :param users_id: 用户id\n", + " :param recall_list: 对于每个用户召回的候选文章列表\n", + " :param click_hist_df: 用户的历史点击信息\n", + " :param articles_info: 文章信息\n", + " :param articles_emb: 文章的embedding向量, 这个可以用item_content_emb, item_w2v_emb, item_youtube_emb\n", + " :param user_emb: 用户的embedding向量, 这个是user_youtube_emb, 如果没有也可以不用, 但要注意如果要用的话, articles_emb就要用item_youtube_emb的形式, 这样维度才一样\n", + " :param N: 最近的N次点击 由于testA日志里面很多用户只存在一次历史点击, 所以为了不产生空值,默认是1\n", + " \"\"\"\n", + " \n", + " # 建立一个二维列表保存结果, 后面要转成DataFrame\n", + " all_user_feas = []\n", + " i = 0\n", + " for user_id in tqdm(users_id):\n", + " # 该用户的最后N次点击\n", + " hist_user_items = click_hist_df[click_hist_df['user_id']==user_id]['click_article_id'][-N:]\n", + " \n", + " # 遍历该用户的召回列表\n", + " for rank, (article_id, score, label) in enumerate(recall_list[user_id]):\n", + " # 该文章建立时间, 字数\n", + " a_create_time = articles_info[articles_info['article_id']==article_id]['created_at_ts'].values[0]\n", + " a_words_count = articles_info[articles_info['article_id']==article_id]['words_count'].values[0]\n", + " single_user_fea = [user_id, article_id]\n", + " # 计算与最后点击的商品的相似度的和, 最大值和最小值, 均值\n", + " sim_fea = []\n", + " time_fea = []\n", + " 
word_fea = []\n", + " # 遍历用户的最后N次点击文章\n", + " for hist_item in hist_user_items:\n", + " b_create_time = articles_info[articles_info['article_id']==hist_item]['created_at_ts'].values[0]\n", + " b_words_count = articles_info[articles_info['article_id']==hist_item]['words_count'].values[0]\n", + " \n", + " sim_fea.append(np.dot(articles_emb[hist_item], articles_emb[article_id]))\n", + " time_fea.append(abs(a_create_time-b_create_time))\n", + " word_fea.append(abs(a_words_count-b_words_count))\n", + " \n", + " single_user_fea.extend(sim_fea) # 相似性特征\n", + " single_user_fea.extend(time_fea) # 时间差特征\n", + " single_user_fea.extend(word_fea) # 字数差特征\n", + " single_user_fea.extend([max(sim_fea), min(sim_fea), sum(sim_fea), sum(sim_fea) / len(sim_fea)]) # 相似性的统计特征\n", + " \n", + " if user_emb: # 如果用户向量有的话, 这里计算该召回文章与用户的相似性特征 \n", + " single_user_fea.append(np.dot(user_emb[user_id], articles_emb[article_id]))\n", + " \n", + " single_user_fea.extend([score, rank, label]) \n", + " # 加入到总的表中\n", + " all_user_feas.append(single_user_fea)\n", + " \n", + " # 定义列名\n", + " id_cols = ['user_id', 'click_article_id']\n", + " sim_cols = ['sim' + str(i) for i in range(N)]\n", + " time_cols = ['time_diff' + str(i) for i in range(N)]\n", + " word_cols = ['word_diff' + str(i) for i in range(N)]\n", + " sat_cols = ['sim_max', 'sim_min', 'sim_sum', 'sim_mean']\n", + " user_item_sim_cols = ['user_item_sim'] if user_emb else []\n", + " user_score_rank_label = ['score', 'rank', 'label']\n", + " cols = id_cols + sim_cols + time_cols + word_cols + sat_cols + user_item_sim_cols + user_score_rank_label\n", + " \n", + " # 转成DataFrame\n", + " df = pd.DataFrame( all_user_feas, columns=cols)\n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T01:08:17.531694Z", + "start_time": "2020-11-18T01:08:10.754702Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- Mem. 
usage decreased to 5.56 Mb (50.0% reduction),time spend:0.00 min\n" + ] + } + ], + "source": [ + "article_info_df = get_article_info_df()\n", + "all_click = click_trn.append(click_tst)\n", + "item_content_emb_dict, item_w2v_emb_dict, item_youtube_emb_dict, user_youtube_emb_dict = get_embedding(save_path, all_click)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:06:22.709350Z", + "start_time": "2020-11-18T01:08:39.923811Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 200000/200000 [50:16<00:00, 66.31it/s] \n", + "100%|██████████| 50000/50000 [1:07:21<00:00, 12.37it/s]\n" + ] + } + ], + "source": [ + "# 获取训练验证及测试数据中召回列文章相关特征\n", + "trn_user_item_feats_df = create_feature(trn_user_item_label_tuples_dict.keys(), trn_user_item_label_tuples_dict, \\\n", + " click_trn_hist, article_info_df, item_content_emb_dict)\n", + "\n", + "if val_user_item_label_tuples_dict is not None:\n", + " val_user_item_feats_df = create_feature(val_user_item_label_tuples_dict.keys(), val_user_item_label_tuples_dict, \\\n", + " click_val_hist, article_info_df, item_content_emb_dict)\n", + "else:\n", + " val_user_item_feats_df = None\n", + " \n", + "tst_user_item_feats_df = create_feature(tst_user_item_label_tuples_dict.keys(), tst_user_item_label_tuples_dict, \\\n", + " click_tst_hist, article_info_df, item_content_emb_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:13:58.573422Z", + "start_time": "2020-11-18T03:13:40.157228Z" + } + }, + "outputs": [], + "source": [ + "# 保存一份省的每次都要重新跑,每次跑的时间都比较长\n", + "trn_user_item_feats_df.to_csv(save_path + 'trn_user_item_feats_df.csv', index=False)\n", + "\n", + "if val_user_item_feats_df is not None:\n", + " val_user_item_feats_df.to_csv(save_path + 'val_user_item_feats_df.csv', index=False)\n", + "\n", + 
"tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=False) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:14:22.838154Z", + "start_time": "2020-11-18T03:14:22.828212Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 用户和文章特征\n", + "### 用户相关特征\n", + "这一块,正式进行特征工程,既要拼接上已有的特征, 也会做更多的特征出来,我们来梳理一下已有的特征和可构造特征:\n", + "1. 文章自身的特征, 文章字数,文章创建时间, 文章的embedding (articles表中)\n", + "2. 用户点击环境特征, 那些设备的特征(这个在df中)\n", + "3. 对于用户和商品还可以构造的特征:\n", + " * 基于用户的点击文章次数和点击时间构造可以表现用户活跃度的特征\n", + " * 基于文章被点击次数和时间构造可以反映文章热度的特征\n", + " * 用户的时间统计特征: 根据其点击的历史文章列表的点击时间和文章的创建时间做统计特征,比如求均值, 这个可以反映用户对于文章时效的偏好\n", + " * 用户的主题爱好特征, 对于用户点击的历史文章主题进行一个统计, 然后对于当前文章看看是否属于用户已经点击过的主题\n", + " * 用户的字数爱好特征, 对于用户点击的历史文章的字数统计, 求一个均值" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-14T03:16:37.637495Z", + "start_time": "2020-11-14T03:16:37.618229Z" + } + }, + "outputs": [], + "source": [ + "click_tst.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:09:11.675550Z", + "start_time": "2020-11-17T02:09:10.265134Z" + } + }, + "outputs": [], + "source": [ + "# 读取文章特征\n", + "articles = pd.read_csv(data_path+'articles.csv')\n", + "articles = reduce_mem(articles)\n", + "\n", + "# 日志数据,就是前面的所有数据\n", + "if click_val is not None:\n", + " all_data = click_trn.append(click_val)\n", + "all_data = click_trn.append(click_tst)\n", + "all_data = reduce_mem(all_data)\n", + "\n", + "# 拼上文章信息\n", + "all_data = all_data.merge(articles, left_on='click_article_id', right_on='article_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-14T03:17:12.256244Z", + "start_time": "2020-11-14T03:17:12.250452Z" + } + }, + "outputs": [], + "source": [ 
+ "all_data.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 分析一下点击时间和点击文章的次数,区分用户活跃度\n", + "如果某个用户点击文章之间的时间间隔比较小, 同时点击的文章次数很多的话, 那么我们认为这种用户一般就是活跃用户, 当然衡量用户活跃度的方式可能多种多样, 这里我们只提供其中一种,我们写一个函数, 得到可以衡量用户活跃度的特征,逻辑如下:\n", + "1. 首先根据用户user_id分组, 对于每个用户,计算点击文章的次数, 两两点击文章时间间隔的均值\n", + "2. 把点击次数取倒数和时间间隔的均值统一归一化,然后两者相加合并,该值越小, 说明用户越活跃\n", + "3. 注意, 上面两两点击文章的时间间隔均值, 会出现如果用户只点击了一次的情况,这时候时间间隔均值那里会出现空值, 对于这种情况最后特征那里给个大数进行区分\n", + "\n", + "这个的衡量标准就是先把点击的次数取到数然后归一化, 然后点击的时间差归一化, 然后两者相加进行合并, 该值越小, 说明被点击的次数越多, 且间隔时间短。 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:28:55.336058Z", + "start_time": "2020-11-17T02:28:55.324332Z" + } + }, + "outputs": [], + "source": [ + " def active_level(all_data, cols):\n", + " \"\"\"\n", + " 制作区分用户活跃度的特征\n", + " :param all_data: 数据集\n", + " :param cols: 用到的特征列\n", + " \"\"\"\n", + " data = all_data[cols]\n", + " data.sort_values(['user_id', 'click_timestamp'], inplace=True)\n", + " user_act = pd.DataFrame(data.groupby('user_id', as_index=False)[['click_article_id', 'click_timestamp']].\\\n", + " agg({'click_article_id':np.size, 'click_timestamp': {list}}).values, columns=['user_id', 'click_size', 'click_timestamp'])\n", + " \n", + " # 计算时间间隔的均值\n", + " def time_diff_mean(l):\n", + " if len(l) == 1:\n", + " return 1\n", + " else:\n", + " return np.mean([j-i for i, j in list(zip(l[:-1], l[1:]))])\n", + " \n", + " user_act['time_diff_mean'] = user_act['click_timestamp'].apply(lambda x: time_diff_mean(x))\n", + " \n", + " # 点击次数取倒数\n", + " user_act['click_size'] = 1 / user_act['click_size']\n", + " \n", + " # 两者归一化\n", + " user_act['click_size'] = (user_act['click_size'] - user_act['click_size'].min()) / (user_act['click_size'].max() - user_act['click_size'].min())\n", + " user_act['time_diff_mean'] = (user_act['time_diff_mean'] - user_act['time_diff_mean'].min()) / (user_act['time_diff_mean'].max() - user_act['time_diff_mean'].min()) 
\n", + " user_act['active_level'] = user_act['click_size'] + user_act['time_diff_mean']\n", + " \n", + " user_act['user_id'] = user_act['user_id'].astype('int')\n", + " del user_act['click_timestamp']\n", + " \n", + " return user_act" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:30:12.696060Z", + "start_time": "2020-11-17T02:29:01.523837Z" + } + }, + "outputs": [], + "source": [ + "user_act_fea = active_level(all_data, ['user_id', 'click_article_id', 'click_timestamp'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:28:53.996742Z", + "start_time": "2020-11-17T02:09:18.374Z" + } + }, + "outputs": [], + "source": [ + "user_act_fea.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 分析一下点击时间和被点击文章的次数, 衡量文章热度特征\n", + "和上面同样的思路, 如果一篇文章在很短的时间间隔之内被点击了很多次, 说明文章比较热门,实现的逻辑和上面的基本一致, 只不过这里是按照点击的文章进行分组:\n", + "1. 根据文章进行分组, 对于每篇文章的用户, 计算点击的时间间隔\n", + "2. 
将用户的数量取倒数, 然后用户的数量和时间间隔归一化, 然后相加得到热度特征, 该值越小, 说明被点击的次数越大且时间间隔越短, 文章比较热\n", + "\n", + "当然, 这只是给出一种判断文章热度的一种方法, 这里大家也可以头脑风暴一下" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T02:41:26.398567Z", + "start_time": "2020-11-17T02:41:26.386668Z" + } + }, + "outputs": [], + "source": [ + " def hot_level(all_data, cols):\n", + " \"\"\"\n", + " 制作衡量文章热度的特征\n", + " :param all_data: 数据集\n", + " :param cols: 用到的特征列\n", + " \"\"\"\n", + " data = all_data[cols]\n", + " data.sort_values(['click_article_id', 'click_timestamp'], inplace=True)\n", + " article_hot = pd.DataFrame(data.groupby('click_article_id', as_index=False)[['user_id', 'click_timestamp']].\\\n", + " agg({'user_id':np.size, 'click_timestamp': {list}}).values, columns=['click_article_id', 'user_num', 'click_timestamp'])\n", + " \n", + " # 计算被点击时间间隔的均值\n", + " def time_diff_mean(l):\n", + " if len(l) == 1:\n", + " return 1\n", + " else:\n", + " return np.mean([j-i for i, j in list(zip(l[:-1], l[1:]))])\n", + " \n", + " article_hot['time_diff_mean'] = article_hot['click_timestamp'].apply(lambda x: time_diff_mean(x))\n", + " \n", + " # 点击次数取倒数\n", + " article_hot['user_num'] = 1 / article_hot['user_num']\n", + " \n", + " # 两者归一化\n", + " article_hot['user_num'] = (article_hot['user_num'] - article_hot['user_num'].min()) / (article_hot['user_num'].max() - article_hot['user_num'].min())\n", + " article_hot['time_diff_mean'] = (article_hot['time_diff_mean'] - article_hot['time_diff_mean'].min()) / (article_hot['time_diff_mean'].max() - article_hot['time_diff_mean'].min()) \n", + " article_hot['hot_level'] = article_hot['user_num'] + article_hot['time_diff_mean']\n", + " \n", + " article_hot['click_article_id'] = article_hot['click_article_id'].astype('int')\n", + " \n", + " del article_hot['click_timestamp']\n", + " \n", + " return article_hot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + 
"end_time": "2020-11-17T02:41:44.635900Z", + "start_time": "2020-11-17T02:41:31.473032Z" + } + }, + "outputs": [], + "source": [ + "article_hot_fea = hot_level(all_data, ['user_id', 'click_article_id', 'click_timestamp']) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-14T03:19:54.775290Z", + "start_time": "2020-11-14T03:19:54.763699Z" + } + }, + "outputs": [], + "source": [ + "article_hot_fea.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的系列习惯\n", + "这个基于原来的日志表做一个类似于article的那种DataFrame, 存放用户特有的信息, 主要包括点击习惯, 爱好特征之类的\n", + "* 用户的设备习惯, 这里取最常用的设备(众数)\n", + "* 用户的时间习惯: 根据其点击过得历史文章的时间来做一个统计(这个感觉最好是把时间戳里的时间特征的h特征提出来,看看用户习惯一天的啥时候点击文章), 但这里先用转换的时间吧, 求个均值\n", + "* 用户的爱好特征, 对于用户点击的历史文章主题进行用户的爱好判别, 更偏向于哪几个主题, 这个最好是multi-hot进行编码, 先试试行不\n", + "* 用户文章的字数差特征, 用户的爱好文章的字数习惯\n", + "\n", + "这些就是对用户进行分组, 然后统计即可" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的设备习惯" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T04:22:48.877978Z", + "start_time": "2020-11-17T04:22:48.872049Z" + } + }, + "outputs": [], + "source": [ + "def device_fea(all_data, cols):\n", + " \"\"\"\n", + " 制作用户的设备特征\n", + " :param all_data: 数据集\n", + " :param cols: 用到的特征列\n", + " \"\"\"\n", + " user_device_info = all_data[cols]\n", + " \n", + " # 用众数来表示每个用户的设备信息\n", + " user_device_info = user_device_info.groupby('user_id').agg(lambda x: x.value_counts().index[0]).reset_index()\n", + " \n", + " return user_device_info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T05:27:10.897473Z", + "start_time": "2020-11-17T04:49:33.214865Z" + } + }, + "outputs": [], + "source": [ + "# 设备特征(这里时间会比较长)\n", + "device_cols = ['user_id', 'click_environment', 'click_deviceGroup', 'click_os', 'click_country', 'click_region', 
'click_referrer_type']\n", + "user_device_info = device_fea(all_data, device_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-14T04:20:39.765842Z", + "start_time": "2020-11-14T04:20:39.747087Z" + } + }, + "outputs": [], + "source": [ + "user_device_info.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的时间习惯" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:11:50.889905Z", + "start_time": "2020-11-17T06:11:50.882653Z" + } + }, + "outputs": [], + "source": [ + "def user_time_hob_fea(all_data, cols):\n", + " \"\"\"\n", + " 制作用户的时间习惯特征\n", + " :param all_data: 数据集\n", + " :param cols: 用到的特征列\n", + " \"\"\"\n", + " user_time_hob_info = all_data[cols]\n", + " \n", + " # 先把时间戳进行归一化\n", + " mm = MinMaxScaler()\n", + " user_time_hob_info['click_timestamp'] = mm.fit_transform(user_time_hob_info[['click_timestamp']])\n", + " user_time_hob_info['created_at_ts'] = mm.fit_transform(user_time_hob_info[['created_at_ts']])\n", + "\n", + " user_time_hob_info = user_time_hob_info.groupby('user_id').agg('mean').reset_index()\n", + " \n", + " user_time_hob_info.rename(columns={'click_timestamp': 'user_time_hob1', 'created_at_ts': 'user_time_hob2'}, inplace=True)\n", + " return user_time_hob_info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:31:51.646110Z", + "start_time": "2020-11-17T06:31:51.171431Z" + } + }, + "outputs": [], + "source": [ + "user_time_hob_cols = ['user_id', 'click_timestamp', 'created_at_ts']\n", + "user_time_hob_info = user_time_hob_fea(all_data, user_time_hob_cols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的主题爱好\n", + "这里先把用户点击的文章属于的主题转成一个列表, 后面再总的汇总的时候单独制作一个特征, 就是文章的主题如果属于这里面, 就是1, 否则就是0。" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:31:56.571088Z", + "start_time": "2020-11-17T06:31:56.565304Z" + } + }, + "outputs": [], + "source": [ + "def user_cat_hob_fea(all_data, cols):\n", + " \"\"\"\n", + " 用户的主题爱好\n", + " :param all_data: 数据集\n", + " :param cols: 用到的特征列\n", + " \"\"\"\n", + " user_category_hob_info = all_data[cols]\n", + " user_category_hob_info = user_category_hob_info.groupby('user_id').agg({list}).reset_index()\n", + " \n", + " user_cat_hob_info = pd.DataFrame()\n", + " user_cat_hob_info['user_id'] = user_category_hob_info['user_id']\n", + " user_cat_hob_info['cate_list'] = user_category_hob_info['category_id']\n", + " \n", + " return user_cat_hob_info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:32:55.150800Z", + "start_time": "2020-11-17T06:32:00.740046Z" + } + }, + "outputs": [], + "source": [ + "user_category_hob_cols = ['user_id', 'category_id']\n", + "user_cat_hob_info = user_cat_hob_fea(all_data, user_category_hob_cols)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的字数偏好特征" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:48:12.988460Z", + "start_time": "2020-11-17T06:48:12.547000Z" + } + }, + "outputs": [], + "source": [ + "user_wcou_info = all_data.groupby('user_id')['words_count'].agg('mean').reset_index()\n", + "user_wcou_info.rename(columns={'words_count': 'words_hbo'}, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 用户的信息特征合并保存" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:48:18.289591Z", + "start_time": "2020-11-17T06:48:17.084408Z" + } + }, + "outputs": [], + "source": [ + "# 所有表进行合并\n", + "user_info = pd.merge(user_act_fea, user_device_info, on='user_id')\n", + "user_info = 
user_info.merge(user_time_hob_info, on='user_id')\n", + "user_info = user_info.merge(user_cat_hob_info, on='user_id')\n", + "user_info = user_info.merge(user_wcou_info, on='user_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-17T06:48:26.907785Z", + "start_time": "2020-11-17T06:48:21.457597Z" + } + }, + "outputs": [], + "source": [ + "# 这样用户特征以后就可以直接读取了\n", + "user_info.to_csv(save_path + 'user_info.csv', index=False) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户特征直接读入\n", + "如果前面关于用户的特征工程已经给做完了,后面可以直接读取" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:15:49.502826Z", + "start_time": "2020-11-18T03:15:48.062243Z" + } + }, + "outputs": [], + "source": [ + "# 把用户信息直接读入进来\n", + "user_info = pd.read_csv(save_path + 'user_info.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:15:56.899635Z", + "start_time": "2020-11-18T03:15:53.701818Z" + } + }, + "outputs": [], + "source": [ + "if os.path.exists(save_path + 'trn_user_item_feats_df.csv'):\n", + " trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')\n", + " \n", + "if os.path.exists(save_path + 'tst_user_item_feats_df.csv'):\n", + " tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')\n", + "\n", + "if os.path.exists(save_path + 'val_user_item_feats_df.csv'):\n", + " val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')\n", + "else:\n", + " val_user_item_feats_df = None" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:16:02.739197Z", + "start_time": "2020-11-18T03:16:01.725028Z" + } + }, + 
"outputs": [], + "source": [ + "# 拼上用户特征\n", + "# 下面是线下验证的\n", + "trn_user_item_feats_df = trn_user_item_feats_df.merge(user_info, on='user_id', how='left')\n", + "\n", + "if val_user_item_feats_df is not None:\n", + " val_user_item_feats_df = val_user_item_feats_df.merge(user_info, on='user_id', how='left')\n", + "else:\n", + " val_user_item_feats_df = None\n", + " \n", + "tst_user_item_feats_df = tst_user_item_feats_df.merge(user_info, on='user_id',how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:16:06.989877Z", + "start_time": "2020-11-18T03:16:06.983327Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['user_id', 'click_article_id', 'sim0', 'time_diff0', 'word_diff0',\n", + " 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score', 'rank', 'label',\n", + " 'click_size', 'time_diff_mean', 'active_level', 'click_environment',\n", + " 'click_deviceGroup', 'click_os', 'click_country', 'click_region',\n", + " 'click_referrer_type', 'user_time_hob1', 'user_time_hob2', 'cate_list',\n", + " 'words_hbo'],\n", + " dtype='object')" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trn_user_item_feats_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-14T03:13:36.071236Z", + "start_time": "2020-11-14T03:13:36.050188Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 文章的特征直接读入" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:16:12.793070Z", + "start_time": "2020-11-18T03:16:12.425380Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- Mem. 
usage decreased to 5.56 Mb (50.0% reduction),time spend:0.00 min\n" + ] + } + ], + "source": [ + "articles = pd.read_csv(data_path+'articles.csv')\n", + "articles = reduce_mem(articles)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:16:18.118507Z", + "start_time": "2020-11-18T03:16:16.344338Z" + } + }, + "outputs": [], + "source": [ + "# 拼上文章特征\n", + "trn_user_item_feats_df = trn_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')\n", + "\n", + "if val_user_item_feats_df is not None:\n", + " val_user_item_feats_df = val_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')\n", + "else:\n", + " val_user_item_feats_df = None\n", + "\n", + "tst_user_item_feats_df = tst_user_item_feats_df.merge(articles, left_on='click_article_id', right_on='article_id')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 召回文章的主题是否在用户的爱好里面" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:17:40.251797Z", + "start_time": "2020-11-18T03:16:28.130012Z" + } + }, + "outputs": [], + "source": [ + "trn_user_item_feats_df['is_cat_hab'] = trn_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)\n", + "if val_user_item_feats_df is not None:\n", + " val_user_item_feats_df['is_cat_hab'] = val_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)\n", + "else:\n", + " val_user_item_feats_df = None\n", + "tst_user_item_feats_df['is_cat_hab'] = tst_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:19:30.451200Z", + "start_time": "2020-11-18T03:19:30.411225Z" + } + }, + "outputs": [], + "source": [ + "# 线下验证\n", + "del 
trn_user_item_feats_df['cate_list']\n", + "\n", + "if val_user_item_feats_df is not None:\n", + " del val_user_item_feats_df['cate_list']\n", + "else:\n", + " val_user_item_feats_df = None\n", + " \n", + "del tst_user_item_feats_df['cate_list']\n", + "\n", + "del trn_user_item_feats_df['article_id']\n", + "\n", + "if val_user_item_feats_df is not None:\n", + " del val_user_item_feats_df['article_id']\n", + "else:\n", + " val_user_item_feats_df = None\n", + " \n", + "del tst_user_item_feats_df['article_id']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 保存特征" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T03:20:08.560942Z", + "start_time": "2020-11-18T03:19:35.601095Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "# 训练验证特征\n", + "trn_user_item_feats_df.to_csv(save_path + 'trn_user_item_feats_df.csv', index=False)\n", + "if val_user_item_feats_df is not None:\n", + " val_user_item_feats_df.to_csv(save_path + 'val_user_item_feats_df.csv', index=False)\n", + "tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 总结\n", + "特征工程和数据清洗转换是比赛中至关重要的一块, 因为**数据和特征决定了机器学习的上限,而算法和模型只是逼近这个上限而已**,所以特征工程的好坏往往决定着最后的结果,**特征工程**可以一步增强数据的表达能力,通过构造新特征,我们可以挖掘出数据的更多信息,使得数据的表达能力进一步放大。 在本节内容中,我们主要是先通过制作特征和标签把预测问题转成了监督学习问题,然后围绕着用户画像和文章画像进行一系列特征的制作, 此外,为了保证正负样本的数据均衡,我们还学习了负采样就技术等。当然本节内容只是对构造特征提供了一些思路,也请学习者们在学习过程中开启头脑风暴,尝试更多的构造特征的方法,也欢迎我们一块探讨和交流。\n", + "\n", + "**关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", + "\n", + "![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" + ] } - }, - 
"outputs": [], - "source": [ - "trn_user_item_feats_df['is_cat_hab'] = trn_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)\n", - "if val_user_item_feats_df is not None:\n", - " val_user_item_feats_df['is_cat_hab'] = val_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)\n", - "else:\n", - " val_user_item_feats_df = None\n", - "tst_user_item_feats_df['is_cat_hab'] = tst_user_item_feats_df.apply(lambda x: 1 if x.category_id in set(x.cate_list) else 0, axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:19:30.451200Z", - "start_time": "2020-11-18T03:19:30.411225Z" + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + }, + "tianchi_metadata": { + "competitions": [], + "datasets": [], + "description": "", + "notebookId": "130010", + "source": "dsw" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "218px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + 
"varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } - }, - "outputs": [], - "source": [ - "# 线下验证\n", - "del trn_user_item_feats_df['cate_list']\n", - "\n", - "if val_user_item_feats_df is not None:\n", - " del val_user_item_feats_df['cate_list']\n", - "else:\n", - " val_user_item_feats_df = None\n", - " \n", - "del tst_user_item_feats_df['cate_list']\n", - "\n", - "del trn_user_item_feats_df['article_id']\n", - "\n", - "if val_user_item_feats_df is not None:\n", - " del val_user_item_feats_df['article_id']\n", - "else:\n", - " val_user_item_feats_df = None\n", - " \n", - "del tst_user_item_feats_df['article_id']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 保存特征" - ] }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T03:20:08.560942Z", - "start_time": "2020-11-18T03:19:35.601095Z" - }, - "scrolled": true - }, - "outputs": [], - "source": [ - "# 训练验证特征\n", - "trn_user_item_feats_df.to_csv(save_path + 'trn_user_item_feats_df.csv', index=False)\n", - "if val_user_item_feats_df is not None:\n", - " val_user_item_feats_df.to_csv(save_path + 'val_user_item_feats_df.csv', index=False)\n", - "tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 总结\n", - "特征工程和数据清洗转换是比赛中至关重要的一块, 因为**数据和特征决定了机器学习的上限,而算法和模型只是逼近这个上限而已**,所以特征工程的好坏往往决定着最后的结果,**特征工程**可以一步增强数据的表达能力,通过构造新特征,我们可以挖掘出数据的更多信息,使得数据的表达能力进一步放大。 在本节内容中,我们主要是先通过制作特征和标签把预测问题转成了监督学习问题,然后围绕着用户画像和文章画像进行一系列特征的制作, 此外,为了保证正负样本的数据均衡,我们还学习了负采样就技术等。当然本节内容只是对构造特征提供了一些思路,也请学习者们在学习过程中开启头脑风暴,尝试更多的构造特征的方法,也欢迎我们一块探讨和交流。\n", - "\n", - "**关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 
Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", - "\n", - "![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - }, - "tianchi_metadata": { - "competitions": [], - "datasets": [], - "description": "", - "notebookId": "130010", - "source": "dsw" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "218px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" - }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " - } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.5 
\346\216\222\345\272\217\346\250\241\345\236\213+\346\250\241\345\236\213\350\236\215\345\220\210.ipynb" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.5 \346\216\222\345\272\217\346\250\241\345\236\213+\346\250\241\345\236\213\350\236\215\345\220\210.ipynb" index 5f96e246b..3af0aa71f 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.5 \346\216\222\345\272\217\346\250\241\345\236\213+\346\250\241\345\236\213\350\236\215\345\220\210.ipynb" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/jupyter/2.5 \346\216\222\345\272\217\346\250\241\345\236\213+\346\250\241\345\236\213\350\236\215\345\220\210.ipynb" @@ -1,2689 +1,2689 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 排序模型\n", - "通过召回的操作, 我们已经进行了问题规模的缩减, 对于每个用户, 选择出了N篇文章作为了候选集,并基于召回的候选集构建了与用户历史相关的特征,以及用户本身的属性特征,文章本省的属性特征,以及用户与文章之间的特征,下面就是使用机器学习模型来对构造好的特征进行学习,然后对测试集进行预测,得到测试集中的每个候选集用户点击的概率,返回点击概率最大的topk个文章,作为最终的结果。\n", - "\n", - "排序阶段选择了三个比较有代表性的排序模型,它们分别是:\n", - "\n", - "1. LGB的排序模型\n", - "2. LGB的分类模型\n", - "3. 深度学习的分类模型DIN\n", - "\n", - "得到了最终的排序模型输出的结果之后,还选择了两种比较经典的模型集成的方法:\n", - "\n", - "1. 输出结果加权融合\n", - "2. 
Staking(将模型的输出结果再使用一个简单模型进行预测)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:20:39.770642Z", - "start_time": "2020-11-18T04:20:38.500875Z" - } - }, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import pickle\n", - "from tqdm import tqdm\n", - "import gc, os\n", - "import time\n", - "from datetime import datetime\n", - "import lightgbm as lgb\n", - "from sklearn.preprocessing import MinMaxScaler\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 读取排序特征" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:20:41.843180Z", - "start_time": "2020-11-18T04:20:41.837287Z" - } - }, - "outputs": [], - "source": [ - "data_path = './data_raw/'\n", - "save_path = './temp_results/'\n", - "offline = False" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:20:53.358138Z", - "start_time": "2020-11-18T04:20:44.232944Z" - } - }, - "outputs": [], - "source": [ - "# 重新读取数据的时候,发现click_article_id是一个浮点数,所以将其转换成int类型\n", - "trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')\n", - "trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)\n", - "\n", - "if offline:\n", - " val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')\n", - " val_user_item_feats_df['click_article_id'] = val_user_item_feats_df['click_article_id'].astype(int)\n", - "else:\n", - " val_user_item_feats_df = None\n", - " \n", - "tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')\n", - "tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)\n", - "\n", - "# 做特征的时候为了方便,给测试集也打上了一个无效的标签,这里直接删掉就行\n", - "del 
tst_user_item_feats_df['label']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 返回排序后的结果" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:01.809368Z", - "start_time": "2020-11-18T04:21:01.799641Z" - } - }, - "outputs": [], - "source": [ - "def submit(recall_df, topk=5, model_name=None):\n", - " recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])\n", - " recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - " \n", - " # 判断是不是每个用户都有5篇文章及以上\n", - " tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())\n", - " assert tmp.min() >= topk\n", - " \n", - " del recall_df['pred_score']\n", - " submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()\n", - " \n", - " submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]\n", - " # 按照提交格式定义列名\n", - " submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', \n", - " 3: 'article_3', 4: 'article_4', 5: 'article_5'})\n", - " \n", - " save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'\n", - " submit.to_csv(save_name, index=False, header=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:04.332198Z", - "start_time": "2020-11-18T04:21:04.325020Z" - } - }, - "outputs": [], - "source": [ - "# 排序结果归一化\n", - "def norm_sim(sim_df, weight=0.0):\n", - " # print(sim_df.head())\n", - " min_sim = sim_df.min()\n", - " max_sim = sim_df.max()\n", - " if max_sim == min_sim:\n", - " sim_df = sim_df.apply(lambda sim: 1.0)\n", - " else:\n", - " sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))\n", - "\n", - " sim_df = sim_df.apply(lambda sim: sim + weight) # plus one\n", - " return sim_df" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "## LGB排序模型" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:07.787698Z", - "start_time": "2020-11-18T04:21:07.536514Z" - } - }, - "outputs": [], - "source": [ - "# 防止中间出错之后重新读取数据\n", - "trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()\n", - "\n", - "if offline:\n", - " val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()\n", - " \n", - "tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:10.839656Z", - "start_time": "2020-11-18T04:21:10.833109Z" - } - }, - "outputs": [], - "source": [ - "# 定义特征列\n", - "lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', \n", - " 'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',\n", - " 'click_environment','click_deviceGroup', 'click_os', 'click_country', \n", - " 'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',\n", - " 'words_hbo', 'category_id', 'created_at_ts','words_count']" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:14.126608Z", - "start_time": "2020-11-18T04:21:13.493653Z" - } - }, - "outputs": [], - "source": [ - "# 排序模型分组\n", - "trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)\n", - "g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", - "\n", - "if offline:\n", - " val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)\n", - " g_val = val_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()[\"label\"].values" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-11-18T04:21:16.136151Z", - "start_time": "2020-11-18T04:21:16.124444Z" - } - }, - "outputs": [], - "source": [ - "# 排序模型定义\n", - "lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", - " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", - " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16) " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:22.965433Z", - "start_time": "2020-11-18T04:21:17.799127Z" - } - }, - "outputs": [], - "source": [ - "# 排序模型训练\n", - "if offline:\n", - " lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,\n", - " eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], \n", - " eval_group= [g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )\n", - "else:\n", - " lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:28.616665Z", - "start_time": "2020-11-18T04:21:24.672280Z" - } - }, - "outputs": [], - "source": [ - "# 模型预测\n", - "tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)\n", - "\n", - "# 将这里的排序结果保存一份,用户后面的模型融合\n", - "tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_ranker_score.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:21:40.253692Z", - "start_time": "2020-11-18T04:21:30.546587Z" + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 排序模型\n", + "通过召回的操作, 我们已经进行了问题规模的缩减, 对于每个用户, 
选择出了N篇文章作为了候选集,并基于召回的候选集构建了与用户历史相关的特征,以及用户本身的属性特征,文章本省的属性特征,以及用户与文章之间的特征,下面就是使用机器学习模型来对构造好的特征进行学习,然后对测试集进行预测,得到测试集中的每个候选集用户点击的概率,返回点击概率最大的topk个文章,作为最终的结果。\n", + "\n", + "排序阶段选择了三个比较有代表性的排序模型,它们分别是:\n", + "\n", + "1. LGB的排序模型\n", + "2. LGB的分类模型\n", + "3. 深度学习的分类模型DIN\n", + "\n", + "得到了最终的排序模型输出的结果之后,还选择了两种比较经典的模型集成的方法:\n", + "\n", + "1. 输出结果加权融合\n", + "2. Staking(将模型的输出结果再使用一个简单模型进行预测)" + ] }, - "scrolled": true - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]\n", - "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", - "submit(rank_results, topk=5, model_name='lgb_ranker')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:22:26.195838Z", - "start_time": "2020-11-18T04:21:46.115002Z" + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:20:39.770642Z", + "start_time": "2020-11-18T04:20:38.500875Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pickle\n", + "from tqdm import tqdm\n", + "import gc, os\n", + "import time\n", + "from datetime import datetime\n", + "import lightgbm as lgb\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] }, - "scrolled": true - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1]\tvalid_0's ndcg@1: 0.909975\tvalid_0's ndcg@2: 0.963068\tvalid_0's ndcg@3: 0.96533\tvalid_0's ndcg@4: 0.965729\tvalid_0's ndcg@5: 0.965864\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's ndcg@1: 0.9143\tvalid_0's ndcg@2: 0.964711\tvalid_0's ndcg@3: 0.966961\tvalid_0's ndcg@4: 0.967338\tvalid_0's ndcg@5: 0.967483\n", - "[3]\tvalid_0's ndcg@1: 0.9181\tvalid_0's ndcg@2: 0.966114\tvalid_0's ndcg@3: 
0.968289\tvalid_0's ndcg@4: 0.968773\tvalid_0's ndcg@5: 0.96887\n", - "[4]\tvalid_0's ndcg@1: 0.925575\tvalid_0's ndcg@2: 0.969093\tvalid_0's ndcg@3: 0.971193\tvalid_0's ndcg@4: 0.971603\tvalid_0's ndcg@5: 0.97169\n", - "[5]\tvalid_0's ndcg@1: 0.9267\tvalid_0's ndcg@2: 0.969635\tvalid_0's ndcg@3: 0.97166\tvalid_0's ndcg@4: 0.972037\tvalid_0's ndcg@5: 0.972133\n", - "[6]\tvalid_0's ndcg@1: 0.927\tvalid_0's ndcg@2: 0.969682\tvalid_0's ndcg@3: 0.971757\tvalid_0's ndcg@4: 0.972134\tvalid_0's ndcg@5: 0.972231\n", - "[7]\tvalid_0's ndcg@1: 0.928825\tvalid_0's ndcg@2: 0.970451\tvalid_0's ndcg@3: 0.972476\tvalid_0's ndcg@4: 0.97282\tvalid_0's ndcg@5: 0.972927\n", - "[8]\tvalid_0's ndcg@1: 0.930025\tvalid_0's ndcg@2: 0.970988\tvalid_0's ndcg@3: 0.972951\tvalid_0's ndcg@4: 0.973295\tvalid_0's ndcg@5: 0.973402\n", - "[9]\tvalid_0's ndcg@1: 0.931125\tvalid_0's ndcg@2: 0.971347\tvalid_0's ndcg@3: 0.973384\tvalid_0's ndcg@4: 0.973707\tvalid_0's ndcg@5: 0.973794\n", - "[10]\tvalid_0's ndcg@1: 0.9311\tvalid_0's ndcg@2: 0.971385\tvalid_0's ndcg@3: 0.973372\tvalid_0's ndcg@4: 0.973717\tvalid_0's ndcg@5: 0.973794\n", - "[11]\tvalid_0's ndcg@1: 0.930975\tvalid_0's ndcg@2: 0.971433\tvalid_0's ndcg@3: 0.973333\tvalid_0's ndcg@4: 0.973699\tvalid_0's ndcg@5: 0.973767\n", - "[12]\tvalid_0's ndcg@1: 0.93145\tvalid_0's ndcg@2: 0.971656\tvalid_0's ndcg@3: 0.973493\tvalid_0's ndcg@4: 0.973881\tvalid_0's ndcg@5: 0.973949\n", - "[13]\tvalid_0's ndcg@1: 0.932525\tvalid_0's ndcg@2: 0.971927\tvalid_0's ndcg@3: 0.973839\tvalid_0's ndcg@4: 0.974227\tvalid_0's ndcg@5: 0.974304\n", - "[14]\tvalid_0's ndcg@1: 0.932575\tvalid_0's ndcg@2: 0.971898\tvalid_0's ndcg@3: 0.973823\tvalid_0's ndcg@4: 0.974243\tvalid_0's ndcg@5: 0.97432\n", - "[15]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972239\tvalid_0's ndcg@3: 0.974189\tvalid_0's ndcg@4: 0.974587\tvalid_0's ndcg@5: 0.974665\n", - "[16]\tvalid_0's ndcg@1: 0.933475\tvalid_0's ndcg@2: 0.972309\tvalid_0's ndcg@3: 0.974209\tvalid_0's ndcg@4: 
0.974596\tvalid_0's ndcg@5: 0.974674\n", - "[17]\tvalid_0's ndcg@1: 0.933725\tvalid_0's ndcg@2: 0.972369\tvalid_0's ndcg@3: 0.974307\tvalid_0's ndcg@4: 0.974684\tvalid_0's ndcg@5: 0.974761\n", - "[18]\tvalid_0's ndcg@1: 0.9339\tvalid_0's ndcg@2: 0.972497\tvalid_0's ndcg@3: 0.974372\tvalid_0's ndcg@4: 0.974749\tvalid_0's ndcg@5: 0.974836\n", - "[19]\tvalid_0's ndcg@1: 0.9345\tvalid_0's ndcg@2: 0.972845\tvalid_0's ndcg@3: 0.974645\tvalid_0's ndcg@4: 0.974979\tvalid_0's ndcg@5: 0.975085\n", - "[20]\tvalid_0's ndcg@1: 0.9349\tvalid_0's ndcg@2: 0.973103\tvalid_0's ndcg@3: 0.97484\tvalid_0's ndcg@4: 0.975174\tvalid_0's ndcg@5: 0.975271\n", - "[21]\tvalid_0's ndcg@1: 0.935\tvalid_0's ndcg@2: 0.973092\tvalid_0's ndcg@3: 0.97488\tvalid_0's ndcg@4: 0.975192\tvalid_0's ndcg@5: 0.975289\n", - "[22]\tvalid_0's ndcg@1: 0.93525\tvalid_0's ndcg@2: 0.9732\tvalid_0's ndcg@3: 0.974988\tvalid_0's ndcg@4: 0.975289\tvalid_0's ndcg@5: 0.975386\n", - "[23]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 0.972949\tvalid_0's ndcg@3: 0.974824\tvalid_0's ndcg@4: 0.975136\tvalid_0's ndcg@5: 0.975223\n", - "[24]\tvalid_0's ndcg@1: 0.93545\tvalid_0's ndcg@2: 0.973274\tvalid_0's ndcg@3: 0.975087\tvalid_0's ndcg@4: 0.975388\tvalid_0's ndcg@5: 0.975475\n", - "[25]\tvalid_0's ndcg@1: 0.9356\tvalid_0's ndcg@2: 0.973345\tvalid_0's ndcg@3: 0.97512\tvalid_0's ndcg@4: 0.975443\tvalid_0's ndcg@5: 0.97553\n", - "[26]\tvalid_0's ndcg@1: 0.93525\tvalid_0's ndcg@2: 0.9732\tvalid_0's ndcg@3: 0.975\tvalid_0's ndcg@4: 0.975313\tvalid_0's ndcg@5: 0.9754\n", - "[27]\tvalid_0's ndcg@1: 0.935175\tvalid_0's ndcg@2: 0.97322\tvalid_0's ndcg@3: 0.974983\tvalid_0's ndcg@4: 0.975295\tvalid_0's ndcg@5: 0.975382\n", - "[28]\tvalid_0's ndcg@1: 0.935425\tvalid_0's ndcg@2: 0.973328\tvalid_0's ndcg@3: 0.975041\tvalid_0's ndcg@4: 0.975374\tvalid_0's ndcg@5: 0.975471\n", - "[29]\tvalid_0's ndcg@1: 0.935275\tvalid_0's ndcg@2: 0.973225\tvalid_0's ndcg@3: 0.974963\tvalid_0's ndcg@4: 0.975297\tvalid_0's ndcg@5: 0.975403\n", - 
"[30]\tvalid_0's ndcg@1: 0.9353\tvalid_0's ndcg@2: 0.973235\tvalid_0's ndcg@3: 0.97501\tvalid_0's ndcg@4: 0.975311\tvalid_0's ndcg@5: 0.975418\n", - "[31]\tvalid_0's ndcg@1: 0.9356\tvalid_0's ndcg@2: 0.973361\tvalid_0's ndcg@3: 0.975099\tvalid_0's ndcg@4: 0.975422\tvalid_0's ndcg@5: 0.975528\n", - "[32]\tvalid_0's ndcg@1: 0.9364\tvalid_0's ndcg@2: 0.973641\tvalid_0's ndcg@3: 0.975391\tvalid_0's ndcg@4: 0.975714\tvalid_0's ndcg@5: 0.97582\n", - "[33]\tvalid_0's ndcg@1: 0.9367\tvalid_0's ndcg@2: 0.973751\tvalid_0's ndcg@3: 0.975501\tvalid_0's ndcg@4: 0.975824\tvalid_0's ndcg@5: 0.975931\n", - "[34]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.973902\tvalid_0's ndcg@3: 0.975677\tvalid_0's ndcg@4: 0.975989\tvalid_0's ndcg@5: 0.976095\n", - "[35]\tvalid_0's ndcg@1: 0.9377\tvalid_0's ndcg@2: 0.974105\tvalid_0's ndcg@3: 0.975892\tvalid_0's ndcg@4: 0.976194\tvalid_0's ndcg@5: 0.9763\n", - "[36]\tvalid_0's ndcg@1: 0.938\tvalid_0's ndcg@2: 0.974184\tvalid_0's ndcg@3: 0.975984\tvalid_0's ndcg@4: 0.976296\tvalid_0's ndcg@5: 0.976402\n", - "[37]\tvalid_0's ndcg@1: 0.93845\tvalid_0's ndcg@2: 0.974366\tvalid_0's ndcg@3: 0.976166\tvalid_0's ndcg@4: 0.976467\tvalid_0's ndcg@5: 0.976574\n", - "[38]\tvalid_0's ndcg@1: 0.938925\tvalid_0's ndcg@2: 0.974557\tvalid_0's ndcg@3: 0.976332\tvalid_0's ndcg@4: 0.976655\tvalid_0's ndcg@5: 0.976751\n", - "[39]\tvalid_0's ndcg@1: 0.93865\tvalid_0's ndcg@2: 0.974471\tvalid_0's ndcg@3: 0.976234\tvalid_0's ndcg@4: 0.976557\tvalid_0's ndcg@5: 0.976653\n", - "[40]\tvalid_0's ndcg@1: 0.938325\tvalid_0's ndcg@2: 0.974335\tvalid_0's ndcg@3: 0.97611\tvalid_0's ndcg@4: 0.976433\tvalid_0's ndcg@5: 0.97653\n", - "[41]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.974669\tvalid_0's ndcg@3: 0.976431\tvalid_0's ndcg@4: 0.976743\tvalid_0's ndcg@5: 0.97683\n", - "[42]\tvalid_0's ndcg@1: 0.939375\tvalid_0's ndcg@2: 0.974833\tvalid_0's ndcg@3: 0.976546\tvalid_0's ndcg@4: 0.976858\tvalid_0's ndcg@5: 0.976945\n", - "[43]\tvalid_0's ndcg@1: 0.939625\tvalid_0's 
ndcg@2: 0.974878\tvalid_0's ndcg@3: 0.976628\tvalid_0's ndcg@4: 0.97694\tvalid_0's ndcg@5: 0.977027\n", - "[44]\tvalid_0's ndcg@1: 0.9395\tvalid_0's ndcg@2: 0.974832\tvalid_0's ndcg@3: 0.97657\tvalid_0's ndcg@4: 0.976893\tvalid_0's ndcg@5: 0.97698\n", - "[45]\tvalid_0's ndcg@1: 0.939775\tvalid_0's ndcg@2: 0.974949\tvalid_0's ndcg@3: 0.976674\tvalid_0's ndcg@4: 0.976997\tvalid_0's ndcg@5: 0.977084\n", - "[46]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.974945\tvalid_0's ndcg@3: 0.976708\tvalid_0's ndcg@4: 0.97702\tvalid_0's ndcg@5: 0.977107\n", - "[47]\tvalid_0's ndcg@1: 0.94005\tvalid_0's ndcg@2: 0.975004\tvalid_0's ndcg@3: 0.976766\tvalid_0's ndcg@4: 0.977078\tvalid_0's ndcg@5: 0.977175\n", - "[48]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975189\tvalid_0's ndcg@3: 0.976939\tvalid_0's ndcg@4: 0.97723\tvalid_0's ndcg@5: 0.977327\n", - "[49]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975189\tvalid_0's ndcg@3: 0.976939\tvalid_0's ndcg@4: 0.97723\tvalid_0's ndcg@5: 0.977327\n", - "[50]\tvalid_0's ndcg@1: 0.9405\tvalid_0's ndcg@2: 0.975264\tvalid_0's ndcg@3: 0.976989\tvalid_0's ndcg@4: 0.977291\tvalid_0's ndcg@5: 0.977368\n", - "[51]\tvalid_0's ndcg@1: 0.941125\tvalid_0's ndcg@2: 0.975526\tvalid_0's ndcg@3: 0.977226\tvalid_0's ndcg@4: 0.977528\tvalid_0's ndcg@5: 0.977605\n", - "[52]\tvalid_0's ndcg@1: 0.941\tvalid_0's ndcg@2: 0.97548\tvalid_0's ndcg@3: 0.977193\tvalid_0's ndcg@4: 0.977484\tvalid_0's ndcg@5: 0.977561\n", - "[53]\tvalid_0's ndcg@1: 0.9411\tvalid_0's ndcg@2: 0.975596\tvalid_0's ndcg@3: 0.977259\tvalid_0's ndcg@4: 0.977539\tvalid_0's ndcg@5: 0.977616\n", - "[54]\tvalid_0's ndcg@1: 0.9412\tvalid_0's ndcg@2: 0.975712\tvalid_0's ndcg@3: 0.977299\tvalid_0's ndcg@4: 0.97759\tvalid_0's ndcg@5: 0.977667\n", - "[55]\tvalid_0's ndcg@1: 0.94155\tvalid_0's ndcg@2: 0.975841\tvalid_0's ndcg@3: 0.977429\tvalid_0's ndcg@4: 0.977719\tvalid_0's ndcg@5: 0.977797\n", - "[56]\tvalid_0's ndcg@1: 0.941825\tvalid_0's ndcg@2: 0.975943\tvalid_0's ndcg@3: 
0.97753\tvalid_0's ndcg@4: 0.977821\tvalid_0's ndcg@5: 0.977898\n", - "[57]\tvalid_0's ndcg@1: 0.9416\tvalid_0's ndcg@2: 0.975891\tvalid_0's ndcg@3: 0.977429\tvalid_0's ndcg@4: 0.977741\tvalid_0's ndcg@5: 0.977818\n", - "[58]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.975969\tvalid_0's ndcg@3: 0.977494\tvalid_0's ndcg@4: 0.977795\tvalid_0's ndcg@5: 0.977873\n", - "[59]\tvalid_0's ndcg@1: 0.942025\tvalid_0's ndcg@2: 0.975985\tvalid_0's ndcg@3: 0.977547\tvalid_0's ndcg@4: 0.977881\tvalid_0's ndcg@5: 0.977958\n", - "[60]\tvalid_0's ndcg@1: 0.94205\tvalid_0's ndcg@2: 0.975994\tvalid_0's ndcg@3: 0.977569\tvalid_0's ndcg@4: 0.977892\tvalid_0's ndcg@5: 0.977969\n", - "[61]\tvalid_0's ndcg@1: 0.94205\tvalid_0's ndcg@2: 0.975947\tvalid_0's ndcg@3: 0.977559\tvalid_0's ndcg@4: 0.977882\tvalid_0's ndcg@5: 0.97796\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取排序特征" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[62]\tvalid_0's ndcg@1: 0.942225\tvalid_0's ndcg@2: 0.976027\tvalid_0's ndcg@3: 0.97764\tvalid_0's ndcg@4: 0.977941\tvalid_0's ndcg@5: 0.978028\n", - "[63]\tvalid_0's ndcg@1: 0.942125\tvalid_0's ndcg@2: 0.976022\tvalid_0's ndcg@3: 0.977622\tvalid_0's ndcg@4: 0.977912\tvalid_0's ndcg@5: 0.977999\n", - "[64]\tvalid_0's ndcg@1: 0.942675\tvalid_0's ndcg@2: 0.976193\tvalid_0's ndcg@3: 0.977793\tvalid_0's ndcg@4: 0.978105\tvalid_0's ndcg@5: 0.978192\n", - "[65]\tvalid_0's ndcg@1: 0.942725\tvalid_0's ndcg@2: 0.976227\tvalid_0's ndcg@3: 0.977802\tvalid_0's ndcg@4: 0.978125\tvalid_0's ndcg@5: 0.978212\n", - "[66]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976132\tvalid_0's ndcg@3: 0.977695\tvalid_0's ndcg@4: 0.978018\tvalid_0's ndcg@5: 0.978105\n", - "[67]\tvalid_0's ndcg@1: 0.9424\tvalid_0's ndcg@2: 0.976092\tvalid_0's ndcg@3: 0.977679\tvalid_0's ndcg@4: 0.978002\tvalid_0's ndcg@5: 0.978089\n", - "[68]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976148\tvalid_0's ndcg@3: 0.977698\tvalid_0's ndcg@4: 
0.978021\tvalid_0's ndcg@5: 0.978108\n", - "[69]\tvalid_0's ndcg@1: 0.9424\tvalid_0's ndcg@2: 0.976123\tvalid_0's ndcg@3: 0.977686\tvalid_0's ndcg@4: 0.978009\tvalid_0's ndcg@5: 0.978096\n", - "[70]\tvalid_0's ndcg@1: 0.942625\tvalid_0's ndcg@2: 0.976222\tvalid_0's ndcg@3: 0.977785\tvalid_0's ndcg@4: 0.978097\tvalid_0's ndcg@5: 0.978184\n", - "[71]\tvalid_0's ndcg@1: 0.942575\tvalid_0's ndcg@2: 0.976188\tvalid_0's ndcg@3: 0.977763\tvalid_0's ndcg@4: 0.978075\tvalid_0's ndcg@5: 0.978162\n", - "[72]\tvalid_0's ndcg@1: 0.9427\tvalid_0's ndcg@2: 0.976234\tvalid_0's ndcg@3: 0.977809\tvalid_0's ndcg@4: 0.978121\tvalid_0's ndcg@5: 0.978208\n", - "[73]\tvalid_0's ndcg@1: 0.9428\tvalid_0's ndcg@2: 0.976255\tvalid_0's ndcg@3: 0.977843\tvalid_0's ndcg@4: 0.978155\tvalid_0's ndcg@5: 0.978242\n", - "[74]\tvalid_0's ndcg@1: 0.94295\tvalid_0's ndcg@2: 0.97631\tvalid_0's ndcg@3: 0.977898\tvalid_0's ndcg@4: 0.97821\tvalid_0's ndcg@5: 0.978297\n", - "[75]\tvalid_0's ndcg@1: 0.943\tvalid_0's ndcg@2: 0.976329\tvalid_0's ndcg@3: 0.977941\tvalid_0's ndcg@4: 0.978232\tvalid_0's ndcg@5: 0.978319\n", - "[76]\tvalid_0's ndcg@1: 0.9433\tvalid_0's ndcg@2: 0.976471\tvalid_0's ndcg@3: 0.978059\tvalid_0's ndcg@4: 0.97836\tvalid_0's ndcg@5: 0.978437\n", - "[77]\tvalid_0's ndcg@1: 0.94315\tvalid_0's ndcg@2: 0.976416\tvalid_0's ndcg@3: 0.977991\tvalid_0's ndcg@4: 0.978314\tvalid_0's ndcg@5: 0.978381\n", - "[78]\tvalid_0's ndcg@1: 0.943675\tvalid_0's ndcg@2: 0.976657\tvalid_0's ndcg@3: 0.978194\tvalid_0's ndcg@4: 0.978517\tvalid_0's ndcg@5: 0.978585\n", - "[79]\tvalid_0's ndcg@1: 0.94365\tvalid_0's ndcg@2: 0.976663\tvalid_0's ndcg@3: 0.978188\tvalid_0's ndcg@4: 0.978501\tvalid_0's ndcg@5: 0.978578\n", - "[80]\tvalid_0's ndcg@1: 0.943725\tvalid_0's ndcg@2: 0.976628\tvalid_0's ndcg@3: 0.978203\tvalid_0's ndcg@4: 0.978515\tvalid_0's ndcg@5: 0.978593\n", - "[81]\tvalid_0's ndcg@1: 0.943975\tvalid_0's ndcg@2: 0.97672\tvalid_0's ndcg@3: 0.978295\tvalid_0's ndcg@4: 0.978607\tvalid_0's ndcg@5: 0.978685\n", 
- "[82]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.976822\tvalid_0's ndcg@3: 0.978397\tvalid_0's ndcg@4: 0.97872\tvalid_0's ndcg@5: 0.978787\n", - "[83]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.976788\tvalid_0's ndcg@3: 0.978375\tvalid_0's ndcg@4: 0.978698\tvalid_0's ndcg@5: 0.978766\n", - "[84]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.97679\tvalid_0's ndcg@3: 0.97839\tvalid_0's ndcg@4: 0.978702\tvalid_0's ndcg@5: 0.97878\n", - "[85]\tvalid_0's ndcg@1: 0.9443\tvalid_0's ndcg@2: 0.976809\tvalid_0's ndcg@3: 0.978421\tvalid_0's ndcg@4: 0.978723\tvalid_0's ndcg@5: 0.9788\n", - "[86]\tvalid_0's ndcg@1: 0.944525\tvalid_0's ndcg@2: 0.976939\tvalid_0's ndcg@3: 0.978502\tvalid_0's ndcg@4: 0.978814\tvalid_0's ndcg@5: 0.978891\n", - "[87]\tvalid_0's ndcg@1: 0.944625\tvalid_0's ndcg@2: 0.976976\tvalid_0's ndcg@3: 0.978551\tvalid_0's ndcg@4: 0.978852\tvalid_0's ndcg@5: 0.97893\n", - "[88]\tvalid_0's ndcg@1: 0.944925\tvalid_0's ndcg@2: 0.977102\tvalid_0's ndcg@3: 0.978677\tvalid_0's ndcg@4: 0.978968\tvalid_0's ndcg@5: 0.979045\n", - "[89]\tvalid_0's ndcg@1: 0.945125\tvalid_0's ndcg@2: 0.977208\tvalid_0's ndcg@3: 0.978758\tvalid_0's ndcg@4: 0.979048\tvalid_0's ndcg@5: 0.979126\n", - "[90]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977135\tvalid_0's ndcg@3: 0.978735\tvalid_0's ndcg@4: 0.979026\tvalid_0's ndcg@5: 0.979104\n", - "[91]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977208\tvalid_0's ndcg@3: 0.978858\tvalid_0's ndcg@4: 0.979138\tvalid_0's ndcg@5: 0.979215\n", - "[92]\tvalid_0's ndcg@1: 0.9455\tvalid_0's ndcg@2: 0.977267\tvalid_0's ndcg@3: 0.978905\tvalid_0's ndcg@4: 0.979174\tvalid_0's ndcg@5: 0.979251\n", - "[93]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977193\tvalid_0's ndcg@3: 0.978818\tvalid_0's ndcg@4: 0.979098\tvalid_0's ndcg@5: 0.979176\n", - "[94]\tvalid_0's ndcg@1: 0.94545\tvalid_0's ndcg@2: 0.97728\tvalid_0's ndcg@3: 0.97888\tvalid_0's ndcg@4: 0.97916\tvalid_0's ndcg@5: 0.979238\n", - "[95]\tvalid_0's ndcg@1: 0.9458\tvalid_0's 
ndcg@2: 0.977394\tvalid_0's ndcg@3: 0.979006\tvalid_0's ndcg@4: 0.979286\tvalid_0's ndcg@5: 0.979364\n", - "[96]\tvalid_0's ndcg@1: 0.946075\tvalid_0's ndcg@2: 0.977527\tvalid_0's ndcg@3: 0.979114\tvalid_0's ndcg@4: 0.979394\tvalid_0's ndcg@5: 0.979472\n", - "[97]\tvalid_0's ndcg@1: 0.946475\tvalid_0's ndcg@2: 0.977659\tvalid_0's ndcg@3: 0.979259\tvalid_0's ndcg@4: 0.979539\tvalid_0's ndcg@5: 0.979616\n", - "[98]\tvalid_0's ndcg@1: 0.94675\tvalid_0's ndcg@2: 0.97776\tvalid_0's ndcg@3: 0.97936\tvalid_0's ndcg@4: 0.979651\tvalid_0's ndcg@5: 0.979719\n", - "[99]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.977831\tvalid_0's ndcg@3: 0.979419\tvalid_0's ndcg@4: 0.97971\tvalid_0's ndcg@5: 0.979777\n", - "[100]\tvalid_0's ndcg@1: 0.9468\tvalid_0's ndcg@2: 0.977794\tvalid_0's ndcg@3: 0.979369\tvalid_0's ndcg@4: 0.979671\tvalid_0's ndcg@5: 0.979739\n", - "Did not meet early stopping. Best iteration is:\n", - "[99]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.977831\tvalid_0's ndcg@3: 0.979419\tvalid_0's ndcg@4: 0.97971\tvalid_0's ndcg@5: 0.979777\n", - "[1]\tvalid_0's ndcg@1: 0.909075\tvalid_0's ndcg@2: 0.963019\tvalid_0's ndcg@3: 0.965069\tvalid_0's ndcg@4: 0.965543\tvalid_0's ndcg@5: 0.965601\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's ndcg@1: 0.9123\tvalid_0's ndcg@2: 0.964273\tvalid_0's ndcg@3: 0.966248\tvalid_0's ndcg@4: 0.966722\tvalid_0's ndcg@5: 0.966789\n", - "[3]\tvalid_0's ndcg@1: 0.915075\tvalid_0's ndcg@2: 0.965691\tvalid_0's ndcg@3: 0.967466\tvalid_0's ndcg@4: 0.967854\tvalid_0's ndcg@5: 0.967922\n", - "[4]\tvalid_0's ndcg@1: 0.91845\tvalid_0's ndcg@2: 0.967047\tvalid_0's ndcg@3: 0.968735\tvalid_0's ndcg@4: 0.969133\tvalid_0's ndcg@5: 0.969201\n", - "[5]\tvalid_0's ndcg@1: 0.92355\tvalid_0's ndcg@2: 0.968961\tvalid_0's ndcg@3: 0.970674\tvalid_0's ndcg@4: 0.97104\tvalid_0's ndcg@5: 0.971098\n", - "[6]\tvalid_0's ndcg@1: 0.9253\tvalid_0's ndcg@2: 0.969607\tvalid_0's ndcg@3: 0.971345\tvalid_0's ndcg@4: 
0.971689\tvalid_0's ndcg@5: 0.971747\n", - "[7]\tvalid_0's ndcg@1: 0.926225\tvalid_0's ndcg@2: 0.969933\tvalid_0's ndcg@3: 0.971708\tvalid_0's ndcg@4: 0.972031\tvalid_0's ndcg@5: 0.972079\n", - "[8]\tvalid_0's ndcg@1: 0.926475\tvalid_0's ndcg@2: 0.970104\tvalid_0's ndcg@3: 0.971804\tvalid_0's ndcg@4: 0.972116\tvalid_0's ndcg@5: 0.972184\n", - "[9]\tvalid_0's ndcg@1: 0.9277\tvalid_0's ndcg@2: 0.970682\tvalid_0's ndcg@3: 0.972307\tvalid_0's ndcg@4: 0.972598\tvalid_0's ndcg@5: 0.972675\n", - "[10]\tvalid_0's ndcg@1: 0.92775\tvalid_0's ndcg@2: 0.970653\tvalid_0's ndcg@3: 0.972316\tvalid_0's ndcg@4: 0.972617\tvalid_0's ndcg@5: 0.972685\n", - "[11]\tvalid_0's ndcg@1: 0.9283\tvalid_0's ndcg@2: 0.97084\tvalid_0's ndcg@3: 0.97254\tvalid_0's ndcg@4: 0.97281\tvalid_0's ndcg@5: 0.972887\n", - "[12]\tvalid_0's ndcg@1: 0.9287\tvalid_0's ndcg@2: 0.971051\tvalid_0's ndcg@3: 0.972701\tvalid_0's ndcg@4: 0.97297\tvalid_0's ndcg@5: 0.973048\n", - "[13]\tvalid_0's ndcg@1: 0.9297\tvalid_0's ndcg@2: 0.971389\tvalid_0's ndcg@3: 0.973001\tvalid_0's ndcg@4: 0.973313\tvalid_0's ndcg@5: 0.9734\n", - "[14]\tvalid_0's ndcg@1: 0.92955\tvalid_0's ndcg@2: 0.971444\tvalid_0's ndcg@3: 0.972994\tvalid_0's ndcg@4: 0.973284\tvalid_0's ndcg@5: 0.973371\n", - "[15]\tvalid_0's ndcg@1: 0.930225\tvalid_0's ndcg@2: 0.97174\tvalid_0's ndcg@3: 0.973253\tvalid_0's ndcg@4: 0.973543\tvalid_0's ndcg@5: 0.97363\n", - "[16]\tvalid_0's ndcg@1: 0.930425\tvalid_0's ndcg@2: 0.971798\tvalid_0's ndcg@3: 0.973298\tvalid_0's ndcg@4: 0.97361\tvalid_0's ndcg@5: 0.973698\n", - "[17]\tvalid_0's ndcg@1: 0.93125\tvalid_0's ndcg@2: 0.971992\tvalid_0's ndcg@3: 0.97358\tvalid_0's ndcg@4: 0.973903\tvalid_0's ndcg@5: 0.97398\n", - "[18]\tvalid_0's ndcg@1: 0.931925\tvalid_0's ndcg@2: 0.972257\tvalid_0's ndcg@3: 0.973845\tvalid_0's ndcg@4: 0.974146\tvalid_0's ndcg@5: 0.974224\n", - "[19]\tvalid_0's ndcg@1: 0.932375\tvalid_0's ndcg@2: 0.972376\tvalid_0's ndcg@3: 0.974038\tvalid_0's ndcg@4: 0.974318\tvalid_0's ndcg@5: 0.974376\n", - 
"[20]\tvalid_0's ndcg@1: 0.932\tvalid_0's ndcg@2: 0.972269\tvalid_0's ndcg@3: 0.973907\tvalid_0's ndcg@4: 0.974187\tvalid_0's ndcg@5: 0.974245\n", - "[21]\tvalid_0's ndcg@1: 0.932725\tvalid_0's ndcg@2: 0.972568\tvalid_0's ndcg@3: 0.974181\tvalid_0's ndcg@4: 0.974471\tvalid_0's ndcg@5: 0.974529\n", - "[22]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.972735\tvalid_0's ndcg@3: 0.974298\tvalid_0's ndcg@4: 0.974599\tvalid_0's ndcg@5: 0.974657\n", - "[23]\tvalid_0's ndcg@1: 0.932925\tvalid_0's ndcg@2: 0.972642\tvalid_0's ndcg@3: 0.974255\tvalid_0's ndcg@4: 0.974545\tvalid_0's ndcg@5: 0.974594\n", - "[24]\tvalid_0's ndcg@1: 0.933175\tvalid_0's ndcg@2: 0.972734\tvalid_0's ndcg@3: 0.974347\tvalid_0's ndcg@4: 0.974638\tvalid_0's ndcg@5: 0.974686\n", - "[25]\tvalid_0's ndcg@1: 0.9331\tvalid_0's ndcg@2: 0.972754\tvalid_0's ndcg@3: 0.974366\tvalid_0's ndcg@4: 0.974636\tvalid_0's ndcg@5: 0.974674\n" - ] + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:20:41.843180Z", + "start_time": "2020-11-18T04:20:41.837287Z" + } + }, + "outputs": [], + "source": [ + "data_path = './data_raw/'\n", + "save_path = './temp_results/'\n", + "offline = False" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[26]\tvalid_0's ndcg@1: 0.933275\tvalid_0's ndcg@2: 0.972787\tvalid_0's ndcg@3: 0.974424\tvalid_0's ndcg@4: 0.974694\tvalid_0's ndcg@5: 0.974732\n", - "[27]\tvalid_0's ndcg@1: 0.93325\tvalid_0's ndcg@2: 0.972809\tvalid_0's ndcg@3: 0.974434\tvalid_0's ndcg@4: 0.974703\tvalid_0's ndcg@5: 0.974732\n", - "[28]\tvalid_0's ndcg@1: 0.933625\tvalid_0's ndcg@2: 0.972932\tvalid_0's ndcg@3: 0.974557\tvalid_0's ndcg@4: 0.974826\tvalid_0's ndcg@5: 0.974855\n", - "[29]\tvalid_0's ndcg@1: 0.933725\tvalid_0's ndcg@2: 0.972937\tvalid_0's ndcg@3: 0.974587\tvalid_0's ndcg@4: 0.974856\tvalid_0's ndcg@5: 0.974885\n", - "[30]\tvalid_0's ndcg@1: 0.93355\tvalid_0's ndcg@2: 0.972873\tvalid_0's ndcg@3: 0.974523\tvalid_0's ndcg@4: 
0.974792\tvalid_0's ndcg@5: 0.974821\n", - "[31]\tvalid_0's ndcg@1: 0.9342\tvalid_0's ndcg@2: 0.973065\tvalid_0's ndcg@3: 0.974753\tvalid_0's ndcg@4: 0.975022\tvalid_0's ndcg@5: 0.975051\n", - "[32]\tvalid_0's ndcg@1: 0.93435\tvalid_0's ndcg@2: 0.973152\tvalid_0's ndcg@3: 0.974815\tvalid_0's ndcg@4: 0.975084\tvalid_0's ndcg@5: 0.975113\n", - "[33]\tvalid_0's ndcg@1: 0.934475\tvalid_0's ndcg@2: 0.97323\tvalid_0's ndcg@3: 0.974855\tvalid_0's ndcg@4: 0.975135\tvalid_0's ndcg@5: 0.975164\n", - "[34]\tvalid_0's ndcg@1: 0.9342\tvalid_0's ndcg@2: 0.973113\tvalid_0's ndcg@3: 0.974738\tvalid_0's ndcg@4: 0.975028\tvalid_0's ndcg@5: 0.975057\n", - "[35]\tvalid_0's ndcg@1: 0.93455\tvalid_0's ndcg@2: 0.973258\tvalid_0's ndcg@3: 0.97487\tvalid_0's ndcg@4: 0.975172\tvalid_0's ndcg@5: 0.975201\n", - "[36]\tvalid_0's ndcg@1: 0.9344\tvalid_0's ndcg@2: 0.973265\tvalid_0's ndcg@3: 0.974828\tvalid_0's ndcg@4: 0.975129\tvalid_0's ndcg@5: 0.975158\n", - "[37]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 0.973438\tvalid_0's ndcg@3: 0.975013\tvalid_0's ndcg@4: 0.975304\tvalid_0's ndcg@5: 0.975323\n", - "[38]\tvalid_0's ndcg@1: 0.934975\tvalid_0's ndcg@2: 0.973541\tvalid_0's ndcg@3: 0.975066\tvalid_0's ndcg@4: 0.975367\tvalid_0's ndcg@5: 0.975386\n", - "[39]\tvalid_0's ndcg@1: 0.935275\tvalid_0's ndcg@2: 0.973667\tvalid_0's ndcg@3: 0.975192\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975502\n", - "[40]\tvalid_0's ndcg@1: 0.9352\tvalid_0's ndcg@2: 0.973624\tvalid_0's ndcg@3: 0.975174\tvalid_0's ndcg@4: 0.975454\tvalid_0's ndcg@5: 0.975473\n", - "[41]\tvalid_0's ndcg@1: 0.935325\tvalid_0's ndcg@2: 0.973686\tvalid_0's ndcg@3: 0.975223\tvalid_0's ndcg@4: 0.975503\tvalid_0's ndcg@5: 0.975522\n", - "[42]\tvalid_0's ndcg@1: 0.93545\tvalid_0's ndcg@2: 0.973716\tvalid_0's ndcg@3: 0.975266\tvalid_0's ndcg@4: 0.975546\tvalid_0's ndcg@5: 0.975565\n", - "[43]\tvalid_0's ndcg@1: 0.93615\tvalid_0's ndcg@2: 0.974022\tvalid_0's ndcg@3: 0.975534\tvalid_0's ndcg@4: 0.975814\tvalid_0's ndcg@5: 
0.975843\n", - "[44]\tvalid_0's ndcg@1: 0.936225\tvalid_0's ndcg@2: 0.974112\tvalid_0's ndcg@3: 0.975562\tvalid_0's ndcg@4: 0.975853\tvalid_0's ndcg@5: 0.975882\n", - "[45]\tvalid_0's ndcg@1: 0.9365\tvalid_0's ndcg@2: 0.974167\tvalid_0's ndcg@3: 0.975654\tvalid_0's ndcg@4: 0.975945\tvalid_0's ndcg@5: 0.975974\n", - "[46]\tvalid_0's ndcg@1: 0.93665\tvalid_0's ndcg@2: 0.974206\tvalid_0's ndcg@3: 0.975694\tvalid_0's ndcg@4: 0.975995\tvalid_0's ndcg@5: 0.976024\n", - "[47]\tvalid_0's ndcg@1: 0.93685\tvalid_0's ndcg@2: 0.974311\tvalid_0's ndcg@3: 0.975786\tvalid_0's ndcg@4: 0.976077\tvalid_0's ndcg@5: 0.976106\n", - "[48]\tvalid_0's ndcg@1: 0.937025\tvalid_0's ndcg@2: 0.974408\tvalid_0's ndcg@3: 0.975845\tvalid_0's ndcg@4: 0.976147\tvalid_0's ndcg@5: 0.976185\n", - "[49]\tvalid_0's ndcg@1: 0.936975\tvalid_0's ndcg@2: 0.974342\tvalid_0's ndcg@3: 0.975829\tvalid_0's ndcg@4: 0.97612\tvalid_0's ndcg@5: 0.976159\n", - "[50]\tvalid_0's ndcg@1: 0.9371\tvalid_0's ndcg@2: 0.974388\tvalid_0's ndcg@3: 0.97585\tvalid_0's ndcg@4: 0.976152\tvalid_0's ndcg@5: 0.976191\n", - "[51]\tvalid_0's ndcg@1: 0.937025\tvalid_0's ndcg@2: 0.974329\tvalid_0's ndcg@3: 0.975841\tvalid_0's ndcg@4: 0.976121\tvalid_0's ndcg@5: 0.97616\n", - "[52]\tvalid_0's ndcg@1: 0.9377\tvalid_0's ndcg@2: 0.974578\tvalid_0's ndcg@3: 0.976078\tvalid_0's ndcg@4: 0.976369\tvalid_0's ndcg@5: 0.976407\n", - "[53]\tvalid_0's ndcg@1: 0.9378\tvalid_0's ndcg@2: 0.974615\tvalid_0's ndcg@3: 0.976115\tvalid_0's ndcg@4: 0.976405\tvalid_0's ndcg@5: 0.976444\n", - "[54]\tvalid_0's ndcg@1: 0.938\tvalid_0's ndcg@2: 0.974689\tvalid_0's ndcg@3: 0.976214\tvalid_0's ndcg@4: 0.976483\tvalid_0's ndcg@5: 0.976521\n", - "[55]\tvalid_0's ndcg@1: 0.938225\tvalid_0's ndcg@2: 0.974803\tvalid_0's ndcg@3: 0.976303\tvalid_0's ndcg@4: 0.976572\tvalid_0's ndcg@5: 0.976611\n", - "[56]\tvalid_0's ndcg@1: 0.938175\tvalid_0's ndcg@2: 0.9748\tvalid_0's ndcg@3: 0.976275\tvalid_0's ndcg@4: 0.976555\tvalid_0's ndcg@5: 0.976594\n", - "[57]\tvalid_0's ndcg@1: 
0.938525\tvalid_0's ndcg@2: 0.974914\tvalid_0's ndcg@3: 0.976414\tvalid_0's ndcg@4: 0.976683\tvalid_0's ndcg@5: 0.976722\n", - "[58]\tvalid_0's ndcg@1: 0.93875\tvalid_0's ndcg@2: 0.975028\tvalid_0's ndcg@3: 0.976503\tvalid_0's ndcg@4: 0.976773\tvalid_0's ndcg@5: 0.976811\n", - "[59]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975198\tvalid_0's ndcg@3: 0.976648\tvalid_0's ndcg@4: 0.976918\tvalid_0's ndcg@5: 0.976956\n", - "[60]\tvalid_0's ndcg@1: 0.939025\tvalid_0's ndcg@2: 0.975177\tvalid_0's ndcg@3: 0.976615\tvalid_0's ndcg@4: 0.976884\tvalid_0's ndcg@5: 0.976923\n", - "[61]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.975205\tvalid_0's ndcg@3: 0.976642\tvalid_0's ndcg@4: 0.976912\tvalid_0's ndcg@5: 0.97695\n", - "[62]\tvalid_0's ndcg@1: 0.93965\tvalid_0's ndcg@2: 0.975424\tvalid_0's ndcg@3: 0.976836\tvalid_0's ndcg@4: 0.977116\tvalid_0's ndcg@5: 0.977155\n", - "[63]\tvalid_0's ndcg@1: 0.940075\tvalid_0's ndcg@2: 0.975596\tvalid_0's ndcg@3: 0.976996\tvalid_0's ndcg@4: 0.977276\tvalid_0's ndcg@5: 0.977315\n", - "[64]\tvalid_0's ndcg@1: 0.940375\tvalid_0's ndcg@2: 0.975723\tvalid_0's ndcg@3: 0.977123\tvalid_0's ndcg@4: 0.977392\tvalid_0's ndcg@5: 0.977431\n", - "[65]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975766\tvalid_0's ndcg@3: 0.977154\tvalid_0's ndcg@4: 0.977423\tvalid_0's ndcg@5: 0.977462\n", - "[66]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.975744\tvalid_0's ndcg@3: 0.977156\tvalid_0's ndcg@4: 0.977426\tvalid_0's ndcg@5: 0.977464\n", - "[67]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.97576\tvalid_0's ndcg@3: 0.977172\tvalid_0's ndcg@4: 0.977431\tvalid_0's ndcg@5: 0.977469\n", - "[68]\tvalid_0's ndcg@1: 0.940675\tvalid_0's ndcg@2: 0.975849\tvalid_0's ndcg@3: 0.977249\tvalid_0's ndcg@4: 0.977508\tvalid_0's ndcg@5: 0.977546\n", - "[69]\tvalid_0's ndcg@1: 0.9413\tvalid_0's ndcg@2: 0.976017\tvalid_0's ndcg@3: 0.977454\tvalid_0's ndcg@4: 0.977724\tvalid_0's ndcg@5: 0.977762\n", - "[70]\tvalid_0's ndcg@1: 0.94105\tvalid_0's ndcg@2: 
0.975925\tvalid_0's ndcg@3: 0.977362\tvalid_0's ndcg@4: 0.977631\tvalid_0's ndcg@5: 0.97767\n", - "[71]\tvalid_0's ndcg@1: 0.94105\tvalid_0's ndcg@2: 0.975925\tvalid_0's ndcg@3: 0.97735\tvalid_0's ndcg@4: 0.97763\tvalid_0's ndcg@5: 0.977668\n", - "[72]\tvalid_0's ndcg@1: 0.941325\tvalid_0's ndcg@2: 0.976058\tvalid_0's ndcg@3: 0.97747\tvalid_0's ndcg@4: 0.977739\tvalid_0's ndcg@5: 0.977778\n", - "[73]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.976076\tvalid_0's ndcg@3: 0.977476\tvalid_0's ndcg@4: 0.977756\tvalid_0's ndcg@5: 0.977795\n", - "[74]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.97619\tvalid_0's ndcg@3: 0.97759\tvalid_0's ndcg@4: 0.97788\tvalid_0's ndcg@5: 0.977919\n", - "[75]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.97619\tvalid_0's ndcg@3: 0.977602\tvalid_0's ndcg@4: 0.977882\tvalid_0's ndcg@5: 0.977921\n", - "[76]\tvalid_0's ndcg@1: 0.94195\tvalid_0's ndcg@2: 0.976273\tvalid_0's ndcg@3: 0.977685\tvalid_0's ndcg@4: 0.977965\tvalid_0's ndcg@5: 0.978004\n", - "[77]\tvalid_0's ndcg@1: 0.9419\tvalid_0's ndcg@2: 0.97627\tvalid_0's ndcg@3: 0.97767\tvalid_0's ndcg@4: 0.97795\tvalid_0's ndcg@5: 0.977989\n", - "[78]\tvalid_0's ndcg@1: 0.94235\tvalid_0's ndcg@2: 0.976452\tvalid_0's ndcg@3: 0.977839\tvalid_0's ndcg@4: 0.978119\tvalid_0's ndcg@5: 0.978158\n", - "[79]\tvalid_0's ndcg@1: 0.94265\tvalid_0's ndcg@2: 0.976562\tvalid_0's ndcg@3: 0.977937\tvalid_0's ndcg@4: 0.978228\tvalid_0's ndcg@5: 0.978267\n", - "[80]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976667\tvalid_0's ndcg@3: 0.978067\tvalid_0's ndcg@4: 0.978347\tvalid_0's ndcg@5: 0.978385\n", - "[81]\tvalid_0's ndcg@1: 0.94305\tvalid_0's ndcg@2: 0.97671\tvalid_0's ndcg@3: 0.978098\tvalid_0's ndcg@4: 0.978378\tvalid_0's ndcg@5: 0.978416\n", - "[82]\tvalid_0's ndcg@1: 0.943175\tvalid_0's ndcg@2: 0.97674\tvalid_0's ndcg@3: 0.978115\tvalid_0's ndcg@4: 0.978417\tvalid_0's ndcg@5: 0.978456\n", - "[83]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976752\tvalid_0's ndcg@3: 
0.97814\tvalid_0's ndcg@4: 0.978441\tvalid_0's ndcg@5: 0.97848\n", - "[84]\tvalid_0's ndcg@1: 0.943375\tvalid_0's ndcg@2: 0.976767\tvalid_0's ndcg@3: 0.978179\tvalid_0's ndcg@4: 0.978481\tvalid_0's ndcg@5: 0.97852\n", - "[85]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976721\tvalid_0's ndcg@3: 0.978146\tvalid_0's ndcg@4: 0.978437\tvalid_0's ndcg@5: 0.978475\n", - "[86]\tvalid_0's ndcg@1: 0.9434\tvalid_0's ndcg@2: 0.976792\tvalid_0's ndcg@3: 0.978204\tvalid_0's ndcg@4: 0.978506\tvalid_0's ndcg@5: 0.978535\n", - "[87]\tvalid_0's ndcg@1: 0.943475\tvalid_0's ndcg@2: 0.976851\tvalid_0's ndcg@3: 0.978239\tvalid_0's ndcg@4: 0.97854\tvalid_0's ndcg@5: 0.978569\n", - "[88]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976882\tvalid_0's ndcg@3: 0.978282\tvalid_0's ndcg@4: 0.978572\tvalid_0's ndcg@5: 0.978611\n", - "[89]\tvalid_0's ndcg@1: 0.943775\tvalid_0's ndcg@2: 0.976915\tvalid_0's ndcg@3: 0.97834\tvalid_0's ndcg@4: 0.97863\tvalid_0's ndcg@5: 0.978669\n" - ] + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:20:53.358138Z", + "start_time": "2020-11-18T04:20:44.232944Z" + } + }, + "outputs": [], + "source": [ + "# 重新读取数据的时候,发现click_article_id是一个浮点数,所以将其转换成int类型\n", + "trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')\n", + "trn_user_item_feats_df['click_article_id'] = trn_user_item_feats_df['click_article_id'].astype(int)\n", + "\n", + "if offline:\n", + " val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')\n", + " val_user_item_feats_df['click_article_id'] = val_user_item_feats_df['click_article_id'].astype(int)\n", + "else:\n", + " val_user_item_feats_df = None\n", + " \n", + "tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')\n", + "tst_user_item_feats_df['click_article_id'] = tst_user_item_feats_df['click_article_id'].astype(int)\n", + "\n", + "# 做特征的时候为了方便,给测试集也打上了一个无效的标签,这里直接删掉就行\n", + "del 
tst_user_item_feats_df['label']" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[90]\tvalid_0's ndcg@1: 0.943925\tvalid_0's ndcg@2: 0.976986\tvalid_0's ndcg@3: 0.978398\tvalid_0's ndcg@4: 0.978689\tvalid_0's ndcg@5: 0.978728\n", - "[91]\tvalid_0's ndcg@1: 0.943875\tvalid_0's ndcg@2: 0.976999\tvalid_0's ndcg@3: 0.978399\tvalid_0's ndcg@4: 0.978679\tvalid_0's ndcg@5: 0.978717\n", - "[92]\tvalid_0's ndcg@1: 0.94395\tvalid_0's ndcg@2: 0.977058\tvalid_0's ndcg@3: 0.978421\tvalid_0's ndcg@4: 0.978711\tvalid_0's ndcg@5: 0.97876\n", - "[93]\tvalid_0's ndcg@1: 0.944075\tvalid_0's ndcg@2: 0.977104\tvalid_0's ndcg@3: 0.978479\tvalid_0's ndcg@4: 0.978759\tvalid_0's ndcg@5: 0.978807\n", - "[94]\tvalid_0's ndcg@1: 0.944175\tvalid_0's ndcg@2: 0.977125\tvalid_0's ndcg@3: 0.978513\tvalid_0's ndcg@4: 0.978793\tvalid_0's ndcg@5: 0.978841\n", - "[95]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.977153\tvalid_0's ndcg@3: 0.97854\tvalid_0's ndcg@4: 0.97882\tvalid_0's ndcg@5: 0.978869\n", - "[96]\tvalid_0's ndcg@1: 0.944225\tvalid_0's ndcg@2: 0.977144\tvalid_0's ndcg@3: 0.978531\tvalid_0's ndcg@4: 0.978811\tvalid_0's ndcg@5: 0.97886\n", - "[97]\tvalid_0's ndcg@1: 0.94435\tvalid_0's ndcg@2: 0.977221\tvalid_0's ndcg@3: 0.978584\tvalid_0's ndcg@4: 0.978864\tvalid_0's ndcg@5: 0.978912\n", - "[98]\tvalid_0's ndcg@1: 0.944575\tvalid_0's ndcg@2: 0.977289\tvalid_0's ndcg@3: 0.978651\tvalid_0's ndcg@4: 0.978942\tvalid_0's ndcg@5: 0.97899\n", - "[99]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.977341\tvalid_0's ndcg@3: 0.978691\tvalid_0's ndcg@4: 0.978993\tvalid_0's ndcg@5: 0.979032\n", - "[100]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977482\tvalid_0's ndcg@3: 0.978857\tvalid_0's ndcg@4: 0.979148\tvalid_0's ndcg@5: 0.979187\n", - "Did not meet early stopping. 
Best iteration is:\n", - "[100]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977482\tvalid_0's ndcg@3: 0.978857\tvalid_0's ndcg@4: 0.979148\tvalid_0's ndcg@5: 0.979187\n", - "[1]\tvalid_0's ndcg@1: 0.911575\tvalid_0's ndcg@2: 0.964384\tvalid_0's ndcg@3: 0.966321\tvalid_0's ndcg@4: 0.966623\tvalid_0's ndcg@5: 0.966671\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's ndcg@1: 0.9136\tvalid_0's ndcg@2: 0.965257\tvalid_0's ndcg@3: 0.967107\tvalid_0's ndcg@4: 0.967398\tvalid_0's ndcg@5: 0.967456\n", - "[3]\tvalid_0's ndcg@1: 0.917425\tvalid_0's ndcg@2: 0.966732\tvalid_0's ndcg@3: 0.968545\tvalid_0's ndcg@4: 0.968814\tvalid_0's ndcg@5: 0.968882\n", - "[4]\tvalid_0's ndcg@1: 0.9222\tvalid_0's ndcg@2: 0.968558\tvalid_0's ndcg@3: 0.970383\tvalid_0's ndcg@4: 0.970619\tvalid_0's ndcg@5: 0.970668\n", - "[5]\tvalid_0's ndcg@1: 0.925875\tvalid_0's ndcg@2: 0.969914\tvalid_0's ndcg@3: 0.971714\tvalid_0's ndcg@4: 0.971972\tvalid_0's ndcg@5: 0.972021\n", - "[6]\tvalid_0's ndcg@1: 0.926875\tvalid_0's ndcg@2: 0.970425\tvalid_0's ndcg@3: 0.972112\tvalid_0's ndcg@4: 0.972371\tvalid_0's ndcg@5: 0.972419\n", - "[7]\tvalid_0's ndcg@1: 0.927475\tvalid_0's ndcg@2: 0.970631\tvalid_0's ndcg@3: 0.972306\tvalid_0's ndcg@4: 0.972586\tvalid_0's ndcg@5: 0.972634\n", - "[8]\tvalid_0's ndcg@1: 0.93015\tvalid_0's ndcg@2: 0.971649\tvalid_0's ndcg@3: 0.973287\tvalid_0's ndcg@4: 0.973567\tvalid_0's ndcg@5: 0.973625\n", - "[9]\tvalid_0's ndcg@1: 0.9312\tvalid_0's ndcg@2: 0.972084\tvalid_0's ndcg@3: 0.973684\tvalid_0's ndcg@4: 0.973964\tvalid_0's ndcg@5: 0.974022\n", - "[10]\tvalid_0's ndcg@1: 0.93225\tvalid_0's ndcg@2: 0.972456\tvalid_0's ndcg@3: 0.974081\tvalid_0's ndcg@4: 0.974361\tvalid_0's ndcg@5: 0.974409\n", - "[11]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.972704\tvalid_0's ndcg@3: 0.974379\tvalid_0's ndcg@4: 0.974648\tvalid_0's ndcg@5: 0.974696\n", - "[12]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972949\tvalid_0's ndcg@3: 0.974574\tvalid_0's 
ndcg@4: 0.974832\tvalid_0's ndcg@5: 0.974881\n", - "[13]\tvalid_0's ndcg@1: 0.93415\tvalid_0's ndcg@2: 0.97322\tvalid_0's ndcg@3: 0.97482\tvalid_0's ndcg@4: 0.975079\tvalid_0's ndcg@5: 0.975127\n", - "[14]\tvalid_0's ndcg@1: 0.9352\tvalid_0's ndcg@2: 0.973671\tvalid_0's ndcg@3: 0.975246\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975531\n", - "[15]\tvalid_0's ndcg@1: 0.9358\tvalid_0's ndcg@2: 0.973877\tvalid_0's ndcg@3: 0.975452\tvalid_0's ndcg@4: 0.975699\tvalid_0's ndcg@5: 0.975748\n", - "[16]\tvalid_0's ndcg@1: 0.935825\tvalid_0's ndcg@2: 0.973917\tvalid_0's ndcg@3: 0.975442\tvalid_0's ndcg@4: 0.975712\tvalid_0's ndcg@5: 0.97576\n", - "[17]\tvalid_0's ndcg@1: 0.936475\tvalid_0's ndcg@2: 0.97411\tvalid_0's ndcg@3: 0.975697\tvalid_0's ndcg@4: 0.975956\tvalid_0's ndcg@5: 0.975995\n", - "[18]\tvalid_0's ndcg@1: 0.936925\tvalid_0's ndcg@2: 0.974292\tvalid_0's ndcg@3: 0.975867\tvalid_0's ndcg@4: 0.976114\tvalid_0's ndcg@5: 0.976163\n", - "[19]\tvalid_0's ndcg@1: 0.937525\tvalid_0's ndcg@2: 0.974545\tvalid_0's ndcg@3: 0.976095\tvalid_0's ndcg@4: 0.976342\tvalid_0's ndcg@5: 0.976391\n", - "[20]\tvalid_0's ndcg@1: 0.937775\tvalid_0's ndcg@2: 0.974653\tvalid_0's ndcg@3: 0.976203\tvalid_0's ndcg@4: 0.976429\tvalid_0's ndcg@5: 0.976487\n", - "[21]\tvalid_0's ndcg@1: 0.938825\tvalid_0's ndcg@2: 0.975072\tvalid_0's ndcg@3: 0.976597\tvalid_0's ndcg@4: 0.976823\tvalid_0's ndcg@5: 0.976881\n", - "[22]\tvalid_0's ndcg@1: 0.93885\tvalid_0's ndcg@2: 0.975097\tvalid_0's ndcg@3: 0.976609\tvalid_0's ndcg@4: 0.976846\tvalid_0's ndcg@5: 0.976895\n", - "[23]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975246\tvalid_0's ndcg@3: 0.976733\tvalid_0's ndcg@4: 0.976959\tvalid_0's ndcg@5: 0.977008\n", - "[24]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975246\tvalid_0's ndcg@3: 0.976721\tvalid_0's ndcg@4: 0.976947\tvalid_0's ndcg@5: 0.977005\n", - "[25]\tvalid_0's ndcg@1: 0.9396\tvalid_0's ndcg@2: 0.975421\tvalid_0's ndcg@3: 0.976909\tvalid_0's ndcg@4: 0.977124\tvalid_0's ndcg@5: 
0.977182\n", - "[26]\tvalid_0's ndcg@1: 0.9393\tvalid_0's ndcg@2: 0.975342\tvalid_0's ndcg@3: 0.976804\tvalid_0's ndcg@4: 0.97702\tvalid_0's ndcg@5: 0.977078\n", - "[27]\tvalid_0's ndcg@1: 0.93925\tvalid_0's ndcg@2: 0.975323\tvalid_0's ndcg@3: 0.976798\tvalid_0's ndcg@4: 0.977014\tvalid_0's ndcg@5: 0.977062\n", - "[28]\tvalid_0's ndcg@1: 0.93925\tvalid_0's ndcg@2: 0.975308\tvalid_0's ndcg@3: 0.976783\tvalid_0's ndcg@4: 0.977009\tvalid_0's ndcg@5: 0.977057\n", - "[29]\tvalid_0's ndcg@1: 0.94\tvalid_0's ndcg@2: 0.975569\tvalid_0's ndcg@3: 0.977056\tvalid_0's ndcg@4: 0.977282\tvalid_0's ndcg@5: 0.977331\n", - "[30]\tvalid_0's ndcg@1: 0.940325\tvalid_0's ndcg@2: 0.975673\tvalid_0's ndcg@3: 0.977173\tvalid_0's ndcg@4: 0.977399\tvalid_0's ndcg@5: 0.977447\n", - "[31]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975731\tvalid_0's ndcg@3: 0.977243\tvalid_0's ndcg@4: 0.977469\tvalid_0's ndcg@5: 0.977518\n", - "[32]\tvalid_0's ndcg@1: 0.940625\tvalid_0's ndcg@2: 0.975831\tvalid_0's ndcg@3: 0.977306\tvalid_0's ndcg@4: 0.977521\tvalid_0's ndcg@5: 0.97757\n", - "[33]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975766\tvalid_0's ndcg@3: 0.977241\tvalid_0's ndcg@4: 0.977457\tvalid_0's ndcg@5: 0.977505\n", - "[34]\tvalid_0's ndcg@1: 0.940625\tvalid_0's ndcg@2: 0.975831\tvalid_0's ndcg@3: 0.977306\tvalid_0's ndcg@4: 0.977521\tvalid_0's ndcg@5: 0.97757\n", - "[35]\tvalid_0's ndcg@1: 0.940725\tvalid_0's ndcg@2: 0.975868\tvalid_0's ndcg@3: 0.977343\tvalid_0's ndcg@4: 0.977558\tvalid_0's ndcg@5: 0.977606\n", - "[36]\tvalid_0's ndcg@1: 0.94115\tvalid_0's ndcg@2: 0.976056\tvalid_0's ndcg@3: 0.977506\tvalid_0's ndcg@4: 0.977722\tvalid_0's ndcg@5: 0.97777\n", - "[37]\tvalid_0's ndcg@1: 0.9414\tvalid_0's ndcg@2: 0.976133\tvalid_0's ndcg@3: 0.977595\tvalid_0's ndcg@4: 0.977811\tvalid_0's ndcg@5: 0.977859\n", - "[38]\tvalid_0's ndcg@1: 0.94175\tvalid_0's ndcg@2: 0.976278\tvalid_0's ndcg@3: 0.977715\tvalid_0's ndcg@4: 0.977941\tvalid_0's ndcg@5: 0.97799\n", - "[39]\tvalid_0's ndcg@1: 
0.942075\tvalid_0's ndcg@2: 0.976366\tvalid_0's ndcg@3: 0.977841\tvalid_0's ndcg@4: 0.978056\tvalid_0's ndcg@5: 0.978105\n", - "[40]\tvalid_0's ndcg@1: 0.94215\tvalid_0's ndcg@2: 0.976409\tvalid_0's ndcg@3: 0.977872\tvalid_0's ndcg@4: 0.978087\tvalid_0's ndcg@5: 0.978136\n", - "[41]\tvalid_0's ndcg@1: 0.94245\tvalid_0's ndcg@2: 0.97652\tvalid_0's ndcg@3: 0.977983\tvalid_0's ndcg@4: 0.978198\tvalid_0's ndcg@5: 0.978246\n", - "[42]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976682\tvalid_0's ndcg@3: 0.97817\tvalid_0's ndcg@4: 0.978385\tvalid_0's ndcg@5: 0.978434\n", - "[43]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976682\tvalid_0's ndcg@3: 0.97817\tvalid_0's ndcg@4: 0.978385\tvalid_0's ndcg@5: 0.978434\n", - "[44]\tvalid_0's ndcg@1: 0.94285\tvalid_0's ndcg@2: 0.976636\tvalid_0's ndcg@3: 0.978111\tvalid_0's ndcg@4: 0.978337\tvalid_0's ndcg@5: 0.978386\n", - "[45]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.9768\tvalid_0's ndcg@3: 0.978262\tvalid_0's ndcg@4: 0.978488\tvalid_0's ndcg@5: 0.978537\n", - "[46]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976913\tvalid_0's ndcg@3: 0.978388\tvalid_0's ndcg@4: 0.978614\tvalid_0's ndcg@5: 0.978663\n", - "[47]\tvalid_0's ndcg@1: 0.943525\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.97836\tvalid_0's ndcg@4: 0.978576\tvalid_0's ndcg@5: 0.978634\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 返回排序后的结果" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[48]\tvalid_0's ndcg@1: 0.943525\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.978373\tvalid_0's ndcg@4: 0.978577\tvalid_0's ndcg@5: 0.978636\n", - "[49]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976913\tvalid_0's ndcg@3: 0.978388\tvalid_0's ndcg@4: 0.978614\tvalid_0's ndcg@5: 0.978663\n", - "[50]\tvalid_0's ndcg@1: 0.943975\tvalid_0's ndcg@2: 0.97702\tvalid_0's ndcg@3: 0.97852\tvalid_0's ndcg@4: 0.978746\tvalid_0's ndcg@5: 0.978794\n", - "[51]\tvalid_0's ndcg@1: 0.9441\tvalid_0's ndcg@2: 0.97705\tvalid_0's 
ndcg@3: 0.97855\tvalid_0's ndcg@4: 0.978787\tvalid_0's ndcg@5: 0.978836\n", - "[52]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.977121\tvalid_0's ndcg@3: 0.978609\tvalid_0's ndcg@4: 0.978846\tvalid_0's ndcg@5: 0.978894\n", - "[53]\tvalid_0's ndcg@1: 0.944225\tvalid_0's ndcg@2: 0.977081\tvalid_0's ndcg@3: 0.978618\tvalid_0's ndcg@4: 0.978834\tvalid_0's ndcg@5: 0.978882\n", - "[54]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.977071\tvalid_0's ndcg@3: 0.978609\tvalid_0's ndcg@4: 0.978824\tvalid_0's ndcg@5: 0.978873\n", - "[55]\tvalid_0's ndcg@1: 0.94435\tvalid_0's ndcg@2: 0.977143\tvalid_0's ndcg@3: 0.978668\tvalid_0's ndcg@4: 0.978883\tvalid_0's ndcg@5: 0.978931\n", - "[56]\tvalid_0's ndcg@1: 0.9444\tvalid_0's ndcg@2: 0.977177\tvalid_0's ndcg@3: 0.978702\tvalid_0's ndcg@4: 0.978906\tvalid_0's ndcg@5: 0.978955\n", - "[57]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.977263\tvalid_0's ndcg@3: 0.978788\tvalid_0's ndcg@4: 0.979003\tvalid_0's ndcg@5: 0.979051\n", - "[58]\tvalid_0's ndcg@1: 0.9448\tvalid_0's ndcg@2: 0.977293\tvalid_0's ndcg@3: 0.978843\tvalid_0's ndcg@4: 0.979047\tvalid_0's ndcg@5: 0.979096\n", - "[59]\tvalid_0's ndcg@1: 0.9452\tvalid_0's ndcg@2: 0.977472\tvalid_0's ndcg@3: 0.978997\tvalid_0's ndcg@4: 0.979202\tvalid_0's ndcg@5: 0.97925\n", - "[60]\tvalid_0's ndcg@1: 0.9455\tvalid_0's ndcg@2: 0.97763\tvalid_0's ndcg@3: 0.979118\tvalid_0's ndcg@4: 0.979322\tvalid_0's ndcg@5: 0.979371\n", - "[61]\tvalid_0's ndcg@1: 0.945725\tvalid_0's ndcg@2: 0.977682\tvalid_0's ndcg@3: 0.979194\tvalid_0's ndcg@4: 0.979399\tvalid_0's ndcg@5: 0.979447\n", - "[62]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977812\tvalid_0's ndcg@3: 0.979312\tvalid_0's ndcg@4: 0.979495\tvalid_0's ndcg@5: 0.979543\n", - "[63]\tvalid_0's ndcg@1: 0.946\tvalid_0's ndcg@2: 0.977878\tvalid_0's ndcg@3: 0.97934\tvalid_0's ndcg@4: 0.979523\tvalid_0's ndcg@5: 0.979572\n", - "[64]\tvalid_0's ndcg@1: 0.946525\tvalid_0's ndcg@2: 0.978056\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 
0.979714\tvalid_0's ndcg@5: 0.979762\n", - "[65]\tvalid_0's ndcg@1: 0.9467\tvalid_0's ndcg@2: 0.978105\tvalid_0's ndcg@3: 0.979592\tvalid_0's ndcg@4: 0.979775\tvalid_0's ndcg@5: 0.979823\n", - "[66]\tvalid_0's ndcg@1: 0.9465\tvalid_0's ndcg@2: 0.978046\tvalid_0's ndcg@3: 0.979534\tvalid_0's ndcg@4: 0.979706\tvalid_0's ndcg@5: 0.979755\n", - "[67]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.978127\tvalid_0's ndcg@3: 0.979614\tvalid_0's ndcg@4: 0.979776\tvalid_0's ndcg@5: 0.979824\n", - "[68]\tvalid_0's ndcg@1: 0.9467\tvalid_0's ndcg@2: 0.97812\tvalid_0's ndcg@3: 0.979608\tvalid_0's ndcg@4: 0.97978\tvalid_0's ndcg@5: 0.979828\n", - "[69]\tvalid_0's ndcg@1: 0.946875\tvalid_0's ndcg@2: 0.978216\tvalid_0's ndcg@3: 0.979679\tvalid_0's ndcg@4: 0.979851\tvalid_0's ndcg@5: 0.9799\n", - "[70]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.978194\tvalid_0's ndcg@3: 0.979682\tvalid_0's ndcg@4: 0.979854\tvalid_0's ndcg@5: 0.979902\n", - "[71]\tvalid_0's ndcg@1: 0.947025\tvalid_0's ndcg@2: 0.978209\tvalid_0's ndcg@3: 0.979721\tvalid_0's ndcg@4: 0.979893\tvalid_0's ndcg@5: 0.979942\n", - "[72]\tvalid_0's ndcg@1: 0.9472\tvalid_0's ndcg@2: 0.978273\tvalid_0's ndcg@3: 0.979773\tvalid_0's ndcg@4: 0.979956\tvalid_0's ndcg@5: 0.980005\n", - "[73]\tvalid_0's ndcg@1: 0.947475\tvalid_0's ndcg@2: 0.978391\tvalid_0's ndcg@3: 0.979878\tvalid_0's ndcg@4: 0.980061\tvalid_0's ndcg@5: 0.980109\n", - "[74]\tvalid_0's ndcg@1: 0.94715\tvalid_0's ndcg@2: 0.978271\tvalid_0's ndcg@3: 0.979758\tvalid_0's ndcg@4: 0.979941\tvalid_0's ndcg@5: 0.97999\n", - "[75]\tvalid_0's ndcg@1: 0.947275\tvalid_0's ndcg@2: 0.978333\tvalid_0's ndcg@3: 0.979808\tvalid_0's ndcg@4: 0.979991\tvalid_0's ndcg@5: 0.980039\n", - "[76]\tvalid_0's ndcg@1: 0.9474\tvalid_0's ndcg@2: 0.97841\tvalid_0's ndcg@3: 0.979873\tvalid_0's ndcg@4: 0.980045\tvalid_0's ndcg@5: 0.980093\n", - "[77]\tvalid_0's ndcg@1: 0.94745\tvalid_0's ndcg@2: 0.97846\tvalid_0's ndcg@3: 0.979898\tvalid_0's ndcg@4: 0.98007\tvalid_0's ndcg@5: 0.980118\n", - 
"[78]\tvalid_0's ndcg@1: 0.94775\tvalid_0's ndcg@2: 0.978555\tvalid_0's ndcg@3: 0.980005\tvalid_0's ndcg@4: 0.980177\tvalid_0's ndcg@5: 0.980226\n", - "[79]\tvalid_0's ndcg@1: 0.947875\tvalid_0's ndcg@2: 0.978617\tvalid_0's ndcg@3: 0.980055\tvalid_0's ndcg@4: 0.980238\tvalid_0's ndcg@5: 0.980276\n", - "[80]\tvalid_0's ndcg@1: 0.947875\tvalid_0's ndcg@2: 0.978617\tvalid_0's ndcg@3: 0.980055\tvalid_0's ndcg@4: 0.980238\tvalid_0's ndcg@5: 0.980276\n", - "[81]\tvalid_0's ndcg@1: 0.948175\tvalid_0's ndcg@2: 0.978744\tvalid_0's ndcg@3: 0.980169\tvalid_0's ndcg@4: 0.980352\tvalid_0's ndcg@5: 0.98039\n", - "[82]\tvalid_0's ndcg@1: 0.948375\tvalid_0's ndcg@2: 0.97888\tvalid_0's ndcg@3: 0.980255\tvalid_0's ndcg@4: 0.980438\tvalid_0's ndcg@5: 0.980477\n", - "[83]\tvalid_0's ndcg@1: 0.94825\tvalid_0's ndcg@2: 0.978834\tvalid_0's ndcg@3: 0.980209\tvalid_0's ndcg@4: 0.980392\tvalid_0's ndcg@5: 0.980431\n", - "[84]\tvalid_0's ndcg@1: 0.948275\tvalid_0's ndcg@2: 0.978844\tvalid_0's ndcg@3: 0.980219\tvalid_0's ndcg@4: 0.980402\tvalid_0's ndcg@5: 0.98044\n", - "[85]\tvalid_0's ndcg@1: 0.948475\tvalid_0's ndcg@2: 0.978917\tvalid_0's ndcg@3: 0.980292\tvalid_0's ndcg@4: 0.980475\tvalid_0's ndcg@5: 0.980514\n", - "[86]\tvalid_0's ndcg@1: 0.948975\tvalid_0's ndcg@2: 0.979102\tvalid_0's ndcg@3: 0.980477\tvalid_0's ndcg@4: 0.98066\tvalid_0's ndcg@5: 0.980699\n", - "[87]\tvalid_0's ndcg@1: 0.948975\tvalid_0's ndcg@2: 0.979086\tvalid_0's ndcg@3: 0.980474\tvalid_0's ndcg@4: 0.980657\tvalid_0's ndcg@5: 0.980695\n", - "[88]\tvalid_0's ndcg@1: 0.949025\tvalid_0's ndcg@2: 0.979136\tvalid_0's ndcg@3: 0.980499\tvalid_0's ndcg@4: 0.980682\tvalid_0's ndcg@5: 0.98072\n", - "[89]\tvalid_0's ndcg@1: 0.9493\tvalid_0's ndcg@2: 0.979285\tvalid_0's ndcg@3: 0.98061\tvalid_0's ndcg@4: 0.980793\tvalid_0's ndcg@5: 0.980832\n", - "[90]\tvalid_0's ndcg@1: 0.9493\tvalid_0's ndcg@2: 0.979269\tvalid_0's ndcg@3: 0.980607\tvalid_0's ndcg@4: 0.98079\tvalid_0's ndcg@5: 0.980828\n", - "[91]\tvalid_0's ndcg@1: 
0.9493\tvalid_0's ndcg@2: 0.979269\tvalid_0's ndcg@3: 0.980607\tvalid_0's ndcg@4: 0.98079\tvalid_0's ndcg@5: 0.980828\n", - "[92]\tvalid_0's ndcg@1: 0.9494\tvalid_0's ndcg@2: 0.97929\tvalid_0's ndcg@3: 0.98064\tvalid_0's ndcg@4: 0.980823\tvalid_0's ndcg@5: 0.980862\n", - "[93]\tvalid_0's ndcg@1: 0.949375\tvalid_0's ndcg@2: 0.979297\tvalid_0's ndcg@3: 0.980634\tvalid_0's ndcg@4: 0.980817\tvalid_0's ndcg@5: 0.980856\n", - "[94]\tvalid_0's ndcg@1: 0.949525\tvalid_0's ndcg@2: 0.979336\tvalid_0's ndcg@3: 0.980686\tvalid_0's ndcg@4: 0.980869\tvalid_0's ndcg@5: 0.980908\n", - "[95]\tvalid_0's ndcg@1: 0.949825\tvalid_0's ndcg@2: 0.979416\tvalid_0's ndcg@3: 0.980791\tvalid_0's ndcg@4: 0.980974\tvalid_0's ndcg@5: 0.981012\n", - "[96]\tvalid_0's ndcg@1: 0.94975\tvalid_0's ndcg@2: 0.979404\tvalid_0's ndcg@3: 0.980779\tvalid_0's ndcg@4: 0.980951\tvalid_0's ndcg@5: 0.98099\n", - "[97]\tvalid_0's ndcg@1: 0.950025\tvalid_0's ndcg@2: 0.979537\tvalid_0's ndcg@3: 0.980874\tvalid_0's ndcg@4: 0.981057\tvalid_0's ndcg@5: 0.981096\n", - "[98]\tvalid_0's ndcg@1: 0.9501\tvalid_0's ndcg@2: 0.979564\tvalid_0's ndcg@3: 0.980889\tvalid_0's ndcg@4: 0.981083\tvalid_0's ndcg@5: 0.981122\n", - "[99]\tvalid_0's ndcg@1: 0.950275\tvalid_0's ndcg@2: 0.979629\tvalid_0's ndcg@3: 0.980967\tvalid_0's ndcg@4: 0.98115\tvalid_0's ndcg@5: 0.981188\n", - "[100]\tvalid_0's ndcg@1: 0.950325\tvalid_0's ndcg@2: 0.979647\tvalid_0's ndcg@3: 0.980985\tvalid_0's ndcg@4: 0.981168\tvalid_0's ndcg@5: 0.981207\n", - "Did not meet early stopping. 
Best iteration is:\n", - "[100]\tvalid_0's ndcg@1: 0.950325\tvalid_0's ndcg@2: 0.979647\tvalid_0's ndcg@3: 0.980985\tvalid_0's ndcg@4: 0.981168\tvalid_0's ndcg@5: 0.981207\n", - "[1]\tvalid_0's ndcg@1: 0.910175\tvalid_0's ndcg@2: 0.96382\tvalid_0's ndcg@3: 0.965707\tvalid_0's ndcg@4: 0.966009\tvalid_0's ndcg@5: 0.966086\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's ndcg@1: 0.91415\tvalid_0's ndcg@2: 0.965492\tvalid_0's ndcg@3: 0.967254\tvalid_0's ndcg@4: 0.967556\tvalid_0's ndcg@5: 0.967604\n", - "[3]\tvalid_0's ndcg@1: 0.916025\tvalid_0's ndcg@2: 0.966389\tvalid_0's ndcg@3: 0.967976\tvalid_0's ndcg@4: 0.968278\tvalid_0's ndcg@5: 0.968355\n", - "[4]\tvalid_0's ndcg@1: 0.919\tvalid_0's ndcg@2: 0.967392\tvalid_0's ndcg@3: 0.96903\tvalid_0's ndcg@4: 0.969364\tvalid_0's ndcg@5: 0.969431\n", - "[5]\tvalid_0's ndcg@1: 0.921125\tvalid_0's ndcg@2: 0.968192\tvalid_0's ndcg@3: 0.969855\tvalid_0's ndcg@4: 0.970156\tvalid_0's ndcg@5: 0.970224\n", - "[6]\tvalid_0's ndcg@1: 0.921675\tvalid_0's ndcg@2: 0.968411\tvalid_0's ndcg@3: 0.970111\tvalid_0's ndcg@4: 0.97037\tvalid_0's ndcg@5: 0.970437\n", - "[7]\tvalid_0's ndcg@1: 0.9237\tvalid_0's ndcg@2: 0.969332\tvalid_0's ndcg@3: 0.970882\tvalid_0's ndcg@4: 0.97113\tvalid_0's ndcg@5: 0.971217\n", - "[8]\tvalid_0's ndcg@1: 0.925775\tvalid_0's ndcg@2: 0.970129\tvalid_0's ndcg@3: 0.971642\tvalid_0's ndcg@4: 0.971922\tvalid_0's ndcg@5: 0.97199\n", - "[9]\tvalid_0's ndcg@1: 0.926775\tvalid_0's ndcg@2: 0.970435\tvalid_0's ndcg@3: 0.971985\tvalid_0's ndcg@4: 0.972276\tvalid_0's ndcg@5: 0.972334\n" - ] + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:01.809368Z", + "start_time": "2020-11-18T04:21:01.799641Z" + } + }, + "outputs": [], + "source": [ + "def submit(recall_df, topk=5, model_name=None):\n", + " recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])\n", + " recall_df['rank'] = 
recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + " \n", + " # 判断是不是每个用户都有5篇文章及以上\n", + " tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())\n", + " assert tmp.min() >= topk\n", + " \n", + " del recall_df['pred_score']\n", + " submit = recall_df[recall_df['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()\n", + " \n", + " submit.columns = [int(col) if isinstance(col, int) else col for col in submit.columns.droplevel(0)]\n", + " # 按照提交格式定义列名\n", + " submit = submit.rename(columns={'': 'user_id', 1: 'article_1', 2: 'article_2', \n", + " 3: 'article_3', 4: 'article_4', 5: 'article_5'})\n", + " \n", + " save_name = save_path + model_name + '_' + datetime.today().strftime('%m-%d') + '.csv'\n", + " submit.to_csv(save_name, index=False, header=True)" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[10]\tvalid_0's ndcg@1: 0.9277\tvalid_0's ndcg@2: 0.970761\tvalid_0's ndcg@3: 0.972311\tvalid_0's ndcg@4: 0.972612\tvalid_0's ndcg@5: 0.97267\n", - "[11]\tvalid_0's ndcg@1: 0.928975\tvalid_0's ndcg@2: 0.97131\tvalid_0's ndcg@3: 0.972798\tvalid_0's ndcg@4: 0.973089\tvalid_0's ndcg@5: 0.973166\n", - "[12]\tvalid_0's ndcg@1: 0.929375\tvalid_0's ndcg@2: 0.971505\tvalid_0's ndcg@3: 0.972968\tvalid_0's ndcg@4: 0.973259\tvalid_0's ndcg@5: 0.973326\n", - "[13]\tvalid_0's ndcg@1: 0.929375\tvalid_0's ndcg@2: 0.971426\tvalid_0's ndcg@3: 0.972939\tvalid_0's ndcg@4: 0.97324\tvalid_0's ndcg@5: 0.973318\n", - "[14]\tvalid_0's ndcg@1: 0.929775\tvalid_0's ndcg@2: 0.971621\tvalid_0's ndcg@3: 0.973121\tvalid_0's ndcg@4: 0.973412\tvalid_0's ndcg@5: 0.97348\n", - "[15]\tvalid_0's ndcg@1: 0.9304\tvalid_0's ndcg@2: 0.971868\tvalid_0's ndcg@3: 0.97338\tvalid_0's ndcg@4: 0.97365\tvalid_0's ndcg@5: 0.973717\n", - "[16]\tvalid_0's ndcg@1: 0.930975\tvalid_0's ndcg@2: 0.972096\tvalid_0's ndcg@3: 0.973558\tvalid_0's ndcg@4: 0.973849\tvalid_0's ndcg@5: 0.973926\n", - "[17]\tvalid_0's ndcg@1: 
0.93105\tvalid_0's ndcg@2: 0.972108\tvalid_0's ndcg@3: 0.973583\tvalid_0's ndcg@4: 0.973884\tvalid_0's ndcg@5: 0.973952\n", - "[18]\tvalid_0's ndcg@1: 0.931725\tvalid_0's ndcg@2: 0.972373\tvalid_0's ndcg@3: 0.97386\tvalid_0's ndcg@4: 0.974129\tvalid_0's ndcg@5: 0.974207\n", - "[19]\tvalid_0's ndcg@1: 0.932175\tvalid_0's ndcg@2: 0.972681\tvalid_0's ndcg@3: 0.974068\tvalid_0's ndcg@4: 0.974348\tvalid_0's ndcg@5: 0.974406\n", - "[20]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.973019\tvalid_0's ndcg@3: 0.974382\tvalid_0's ndcg@4: 0.974673\tvalid_0's ndcg@5: 0.974731\n", - "[21]\tvalid_0's ndcg@1: 0.933075\tvalid_0's ndcg@2: 0.97306\tvalid_0's ndcg@3: 0.974423\tvalid_0's ndcg@4: 0.974703\tvalid_0's ndcg@5: 0.97477\n", - "[22]\tvalid_0's ndcg@1: 0.93375\tvalid_0's ndcg@2: 0.973262\tvalid_0's ndcg@3: 0.974649\tvalid_0's ndcg@4: 0.974929\tvalid_0's ndcg@5: 0.975007\n", - "[23]\tvalid_0's ndcg@1: 0.933675\tvalid_0's ndcg@2: 0.973219\tvalid_0's ndcg@3: 0.974606\tvalid_0's ndcg@4: 0.974886\tvalid_0's ndcg@5: 0.974973\n", - "[24]\tvalid_0's ndcg@1: 0.934\tvalid_0's ndcg@2: 0.97337\tvalid_0's ndcg@3: 0.974745\tvalid_0's ndcg@4: 0.975014\tvalid_0's ndcg@5: 0.975101\n", - "[25]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 0.973674\tvalid_0's ndcg@3: 0.975062\tvalid_0's ndcg@4: 0.975342\tvalid_0's ndcg@5: 0.97541\n", - "[26]\tvalid_0's ndcg@1: 0.93495\tvalid_0's ndcg@2: 0.973721\tvalid_0's ndcg@3: 0.975096\tvalid_0's ndcg@4: 0.975365\tvalid_0's ndcg@5: 0.975452\n", - "[27]\tvalid_0's ndcg@1: 0.9358\tvalid_0's ndcg@2: 0.974082\tvalid_0's ndcg@3: 0.975444\tvalid_0's ndcg@4: 0.975713\tvalid_0's ndcg@5: 0.975781\n", - "[28]\tvalid_0's ndcg@1: 0.935325\tvalid_0's ndcg@2: 0.973875\tvalid_0's ndcg@3: 0.975275\tvalid_0's ndcg@4: 0.975512\tvalid_0's ndcg@5: 0.975599\n", - "[29]\tvalid_0's ndcg@1: 0.935925\tvalid_0's ndcg@2: 0.974159\tvalid_0's ndcg@3: 0.975522\tvalid_0's ndcg@4: 0.975759\tvalid_0's ndcg@5: 0.975836\n", - "[30]\tvalid_0's ndcg@1: 0.9362\tvalid_0's ndcg@2: 
0.974214\tvalid_0's ndcg@3: 0.975589\tvalid_0's ndcg@4: 0.975847\tvalid_0's ndcg@5: 0.975924\n", - "[31]\tvalid_0's ndcg@1: 0.93625\tvalid_0's ndcg@2: 0.974216\tvalid_0's ndcg@3: 0.975629\tvalid_0's ndcg@4: 0.975876\tvalid_0's ndcg@5: 0.975944\n", - "[32]\tvalid_0's ndcg@1: 0.93665\tvalid_0's ndcg@2: 0.974427\tvalid_0's ndcg@3: 0.975814\tvalid_0's ndcg@4: 0.97603\tvalid_0's ndcg@5: 0.976107\n", - "[33]\tvalid_0's ndcg@1: 0.936775\tvalid_0's ndcg@2: 0.974505\tvalid_0's ndcg@3: 0.975855\tvalid_0's ndcg@4: 0.976081\tvalid_0's ndcg@5: 0.976158\n", - "[34]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.974643\tvalid_0's ndcg@3: 0.975993\tvalid_0's ndcg@4: 0.976219\tvalid_0's ndcg@5: 0.976296\n", - "[35]\tvalid_0's ndcg@1: 0.937675\tvalid_0's ndcg@2: 0.974805\tvalid_0's ndcg@3: 0.97618\tvalid_0's ndcg@4: 0.976406\tvalid_0's ndcg@5: 0.976484\n", - "[36]\tvalid_0's ndcg@1: 0.9382\tvalid_0's ndcg@2: 0.974983\tvalid_0's ndcg@3: 0.976371\tvalid_0's ndcg@4: 0.976597\tvalid_0's ndcg@5: 0.976674\n", - "[37]\tvalid_0's ndcg@1: 0.938175\tvalid_0's ndcg@2: 0.974974\tvalid_0's ndcg@3: 0.976349\tvalid_0's ndcg@4: 0.976586\tvalid_0's ndcg@5: 0.976663\n", - "[38]\tvalid_0's ndcg@1: 0.938675\tvalid_0's ndcg@2: 0.975143\tvalid_0's ndcg@3: 0.976518\tvalid_0's ndcg@4: 0.976776\tvalid_0's ndcg@5: 0.976844\n", - "[39]\tvalid_0's ndcg@1: 0.938575\tvalid_0's ndcg@2: 0.975106\tvalid_0's ndcg@3: 0.976481\tvalid_0's ndcg@4: 0.976739\tvalid_0's ndcg@5: 0.976807\n", - "[40]\tvalid_0's ndcg@1: 0.938675\tvalid_0's ndcg@2: 0.97519\tvalid_0's ndcg@3: 0.976528\tvalid_0's ndcg@4: 0.976775\tvalid_0's ndcg@5: 0.976853\n", - "[41]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.975347\tvalid_0's ndcg@3: 0.976697\tvalid_0's ndcg@4: 0.976934\tvalid_0's ndcg@5: 0.977001\n", - "[42]\tvalid_0's ndcg@1: 0.939825\tvalid_0's ndcg@2: 0.975599\tvalid_0's ndcg@3: 0.976961\tvalid_0's ndcg@4: 0.977198\tvalid_0's ndcg@5: 0.977266\n", - "[43]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.975639\tvalid_0's ndcg@3: 
0.976977\tvalid_0's ndcg@4: 0.977214\tvalid_0's ndcg@5: 0.977282\n", - "[44]\tvalid_0's ndcg@1: 0.9398\tvalid_0's ndcg@2: 0.975605\tvalid_0's ndcg@3: 0.976955\tvalid_0's ndcg@4: 0.977192\tvalid_0's ndcg@5: 0.97726\n", - "[45]\tvalid_0's ndcg@1: 0.9401\tvalid_0's ndcg@2: 0.9757\tvalid_0's ndcg@3: 0.977075\tvalid_0's ndcg@4: 0.977291\tvalid_0's ndcg@5: 0.977368\n", - "[46]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975845\tvalid_0's ndcg@3: 0.977183\tvalid_0's ndcg@4: 0.97742\tvalid_0's ndcg@5: 0.977497\n", - "[47]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.975854\tvalid_0's ndcg@3: 0.977204\tvalid_0's ndcg@4: 0.97743\tvalid_0's ndcg@5: 0.977508\n", - "[48]\tvalid_0's ndcg@1: 0.940575\tvalid_0's ndcg@2: 0.975923\tvalid_0's ndcg@3: 0.977273\tvalid_0's ndcg@4: 0.977488\tvalid_0's ndcg@5: 0.977556\n", - "[49]\tvalid_0's ndcg@1: 0.9407\tvalid_0's ndcg@2: 0.975922\tvalid_0's ndcg@3: 0.977297\tvalid_0's ndcg@4: 0.977501\tvalid_0's ndcg@5: 0.977588\n", - "[50]\tvalid_0's ndcg@1: 0.940725\tvalid_0's ndcg@2: 0.975947\tvalid_0's ndcg@3: 0.977322\tvalid_0's ndcg@4: 0.977505\tvalid_0's ndcg@5: 0.977592\n", - "[51]\tvalid_0's ndcg@1: 0.9406\tvalid_0's ndcg@2: 0.975837\tvalid_0's ndcg@3: 0.97725\tvalid_0's ndcg@4: 0.977422\tvalid_0's ndcg@5: 0.977509\n", - "[52]\tvalid_0's ndcg@1: 0.941075\tvalid_0's ndcg@2: 0.975997\tvalid_0's ndcg@3: 0.977422\tvalid_0's ndcg@4: 0.977594\tvalid_0's ndcg@5: 0.977691\n", - "[53]\tvalid_0's ndcg@1: 0.940925\tvalid_0's ndcg@2: 0.975989\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.977538\tvalid_0's ndcg@5: 0.977644\n", - "[54]\tvalid_0's ndcg@1: 0.94125\tvalid_0's ndcg@2: 0.976062\tvalid_0's ndcg@3: 0.977487\tvalid_0's ndcg@4: 0.977659\tvalid_0's ndcg@5: 0.977756\n", - "[55]\tvalid_0's ndcg@1: 0.94145\tvalid_0's ndcg@2: 0.976183\tvalid_0's ndcg@3: 0.97757\tvalid_0's ndcg@4: 0.977742\tvalid_0's ndcg@5: 0.977839\n", - "[56]\tvalid_0's ndcg@1: 0.941475\tvalid_0's ndcg@2: 0.976176\tvalid_0's ndcg@3: 0.977576\tvalid_0's ndcg@4: 
0.977748\tvalid_0's ndcg@5: 0.977845\n", - "[57]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.976139\tvalid_0's ndcg@3: 0.977539\tvalid_0's ndcg@4: 0.977712\tvalid_0's ndcg@5: 0.977808\n", - "[58]\tvalid_0's ndcg@1: 0.941675\tvalid_0's ndcg@2: 0.97625\tvalid_0's ndcg@3: 0.97765\tvalid_0's ndcg@4: 0.977822\tvalid_0's ndcg@5: 0.977919\n", - "[59]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.976253\tvalid_0's ndcg@3: 0.977653\tvalid_0's ndcg@4: 0.977836\tvalid_0's ndcg@5: 0.977932\n", - "[60]\tvalid_0's ndcg@1: 0.941675\tvalid_0's ndcg@2: 0.976234\tvalid_0's ndcg@3: 0.977634\tvalid_0's ndcg@4: 0.977817\tvalid_0's ndcg@5: 0.977914\n", - "[61]\tvalid_0's ndcg@1: 0.9419\tvalid_0's ndcg@2: 0.976333\tvalid_0's ndcg@3: 0.977745\tvalid_0's ndcg@4: 0.977918\tvalid_0's ndcg@5: 0.978005\n", - "[62]\tvalid_0's ndcg@1: 0.941975\tvalid_0's ndcg@2: 0.976345\tvalid_0's ndcg@3: 0.977757\tvalid_0's ndcg@4: 0.97794\tvalid_0's ndcg@5: 0.978027\n", - "[63]\tvalid_0's ndcg@1: 0.9423\tvalid_0's ndcg@2: 0.976496\tvalid_0's ndcg@3: 0.977871\tvalid_0's ndcg@4: 0.978065\tvalid_0's ndcg@5: 0.978152\n", - "[64]\tvalid_0's ndcg@1: 0.942625\tvalid_0's ndcg@2: 0.976632\tvalid_0's ndcg@3: 0.977995\tvalid_0's ndcg@4: 0.978188\tvalid_0's ndcg@5: 0.978275\n", - "[65]\tvalid_0's ndcg@1: 0.942575\tvalid_0's ndcg@2: 0.976629\tvalid_0's ndcg@3: 0.977979\tvalid_0's ndcg@4: 0.978173\tvalid_0's ndcg@5: 0.97826\n", - "[66]\tvalid_0's ndcg@1: 0.942725\tvalid_0's ndcg@2: 0.976685\tvalid_0's ndcg@3: 0.978035\tvalid_0's ndcg@4: 0.978229\tvalid_0's ndcg@5: 0.978316\n", - "[67]\tvalid_0's ndcg@1: 0.94275\tvalid_0's ndcg@2: 0.976678\tvalid_0's ndcg@3: 0.978041\tvalid_0's ndcg@4: 0.978224\tvalid_0's ndcg@5: 0.97832\n", - "[68]\tvalid_0's ndcg@1: 0.94275\tvalid_0's ndcg@2: 0.976694\tvalid_0's ndcg@3: 0.978044\tvalid_0's ndcg@4: 0.978227\tvalid_0's ndcg@5: 0.978324\n", - "[69]\tvalid_0's ndcg@1: 0.943\tvalid_0's ndcg@2: 0.976834\tvalid_0's ndcg@3: 0.978146\tvalid_0's ndcg@4: 0.978329\tvalid_0's ndcg@5: 
0.978426\n", - "[70]\tvalid_0's ndcg@1: 0.943025\tvalid_0's ndcg@2: 0.976827\tvalid_0's ndcg@3: 0.978152\tvalid_0's ndcg@4: 0.978324\tvalid_0's ndcg@5: 0.978431\n", - "[71]\tvalid_0's ndcg@1: 0.9432\tvalid_0's ndcg@2: 0.976923\tvalid_0's ndcg@3: 0.978236\tvalid_0's ndcg@4: 0.978397\tvalid_0's ndcg@5: 0.978504\n", - "[72]\tvalid_0's ndcg@1: 0.943225\tvalid_0's ndcg@2: 0.976917\tvalid_0's ndcg@3: 0.978254\tvalid_0's ndcg@4: 0.978405\tvalid_0's ndcg@5: 0.978511\n", - "[73]\tvalid_0's ndcg@1: 0.94315\tvalid_0's ndcg@2: 0.976936\tvalid_0's ndcg@3: 0.978236\tvalid_0's ndcg@4: 0.978409\tvalid_0's ndcg@5: 0.978496\n" - ] + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:04.332198Z", + "start_time": "2020-11-18T04:21:04.325020Z" + } + }, + "outputs": [], + "source": [ + "# 排序结果归一化\n", + "def norm_sim(sim_df, weight=0.0):\n", + " # print(sim_df.head())\n", + " min_sim = sim_df.min()\n", + " max_sim = sim_df.max()\n", + " if max_sim == min_sim:\n", + " sim_df = sim_df.apply(lambda sim: 1.0)\n", + " else:\n", + " sim_df = sim_df.apply(lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))\n", + "\n", + " sim_df = sim_df.apply(lambda sim: sim + weight) # plus one\n", + " return sim_df" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[74]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976957\tvalid_0's ndcg@3: 0.97827\tvalid_0's ndcg@4: 0.978431\tvalid_0's ndcg@5: 0.978528\n", - "[75]\tvalid_0's ndcg@1: 0.943075\tvalid_0's ndcg@2: 0.976861\tvalid_0's ndcg@3: 0.978199\tvalid_0's ndcg@4: 0.97836\tvalid_0's ndcg@5: 0.978457\n", - "[76]\tvalid_0's ndcg@1: 0.94335\tvalid_0's ndcg@2: 0.976963\tvalid_0's ndcg@3: 0.978288\tvalid_0's ndcg@4: 0.978471\tvalid_0's ndcg@5: 0.978568\n", - "[77]\tvalid_0's ndcg@1: 0.94345\tvalid_0's ndcg@2: 0.977031\tvalid_0's ndcg@3: 0.978331\tvalid_0's ndcg@4: 0.978514\tvalid_0's ndcg@5: 0.978611\n", - "[78]\tvalid_0's ndcg@1: 0.943475\tvalid_0's ndcg@2: 
0.977088\tvalid_0's ndcg@3: 0.97835\tvalid_0's ndcg@4: 0.978533\tvalid_0's ndcg@5: 0.97863\n", - "[79]\tvalid_0's ndcg@1: 0.943625\tvalid_0's ndcg@2: 0.977096\tvalid_0's ndcg@3: 0.978396\tvalid_0's ndcg@4: 0.978579\tvalid_0's ndcg@5: 0.978676\n", - "[80]\tvalid_0's ndcg@1: 0.943825\tvalid_0's ndcg@2: 0.977154\tvalid_0's ndcg@3: 0.978479\tvalid_0's ndcg@4: 0.978651\tvalid_0's ndcg@5: 0.978748\n", - "[81]\tvalid_0's ndcg@1: 0.943775\tvalid_0's ndcg@2: 0.977135\tvalid_0's ndcg@3: 0.97846\tvalid_0's ndcg@4: 0.978633\tvalid_0's ndcg@5: 0.978729\n", - "[82]\tvalid_0's ndcg@1: 0.9443\tvalid_0's ndcg@2: 0.977361\tvalid_0's ndcg@3: 0.978673\tvalid_0's ndcg@4: 0.978845\tvalid_0's ndcg@5: 0.978933\n", - "[83]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.977324\tvalid_0's ndcg@3: 0.978624\tvalid_0's ndcg@4: 0.978796\tvalid_0's ndcg@5: 0.978893\n", - "[84]\tvalid_0's ndcg@1: 0.94405\tvalid_0's ndcg@2: 0.977253\tvalid_0's ndcg@3: 0.978565\tvalid_0's ndcg@4: 0.978737\tvalid_0's ndcg@5: 0.978834\n", - "[85]\tvalid_0's ndcg@1: 0.944175\tvalid_0's ndcg@2: 0.977283\tvalid_0's ndcg@3: 0.978633\tvalid_0's ndcg@4: 0.978795\tvalid_0's ndcg@5: 0.978882\n", - "[86]\tvalid_0's ndcg@1: 0.9445\tvalid_0's ndcg@2: 0.97745\tvalid_0's ndcg@3: 0.978763\tvalid_0's ndcg@4: 0.978924\tvalid_0's ndcg@5: 0.979011\n", - "[87]\tvalid_0's ndcg@1: 0.9445\tvalid_0's ndcg@2: 0.977419\tvalid_0's ndcg@3: 0.978756\tvalid_0's ndcg@4: 0.978918\tvalid_0's ndcg@5: 0.979005\n", - "[88]\tvalid_0's ndcg@1: 0.944825\tvalid_0's ndcg@2: 0.977554\tvalid_0's ndcg@3: 0.978867\tvalid_0's ndcg@4: 0.979039\tvalid_0's ndcg@5: 0.979126\n", - "[89]\tvalid_0's ndcg@1: 0.9454\tvalid_0's ndcg@2: 0.977767\tvalid_0's ndcg@3: 0.979079\tvalid_0's ndcg@4: 0.979262\tvalid_0's ndcg@5: 0.97934\n", - "[90]\tvalid_0's ndcg@1: 0.945375\tvalid_0's ndcg@2: 0.977773\tvalid_0's ndcg@3: 0.979073\tvalid_0's ndcg@4: 0.979256\tvalid_0's ndcg@5: 0.979334\n", - "[91]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977792\tvalid_0's ndcg@3: 
0.979092\tvalid_0's ndcg@4: 0.979275\tvalid_0's ndcg@5: 0.979352\n", - "[92]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977776\tvalid_0's ndcg@3: 0.979088\tvalid_0's ndcg@4: 0.979261\tvalid_0's ndcg@5: 0.979348\n", - "[93]\tvalid_0's ndcg@1: 0.945375\tvalid_0's ndcg@2: 0.977757\tvalid_0's ndcg@3: 0.979082\tvalid_0's ndcg@4: 0.979244\tvalid_0's ndcg@5: 0.979331\n", - "[94]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977761\tvalid_0's ndcg@3: 0.979061\tvalid_0's ndcg@4: 0.979223\tvalid_0's ndcg@5: 0.97931\n", - "[95]\tvalid_0's ndcg@1: 0.9454\tvalid_0's ndcg@2: 0.977798\tvalid_0's ndcg@3: 0.979086\tvalid_0's ndcg@4: 0.979258\tvalid_0's ndcg@5: 0.979345\n", - "[96]\tvalid_0's ndcg@1: 0.945825\tvalid_0's ndcg@2: 0.977955\tvalid_0's ndcg@3: 0.97923\tvalid_0's ndcg@4: 0.979413\tvalid_0's ndcg@5: 0.9795\n", - "[97]\tvalid_0's ndcg@1: 0.945925\tvalid_0's ndcg@2: 0.97796\tvalid_0's ndcg@3: 0.97926\tvalid_0's ndcg@4: 0.979443\tvalid_0's ndcg@5: 0.979531\n", - "[98]\tvalid_0's ndcg@1: 0.9464\tvalid_0's ndcg@2: 0.97812\tvalid_0's ndcg@3: 0.97942\tvalid_0's ndcg@4: 0.979625\tvalid_0's ndcg@5: 0.979702\n", - "[99]\tvalid_0's ndcg@1: 0.94655\tvalid_0's ndcg@2: 0.978191\tvalid_0's ndcg@3: 0.979479\tvalid_0's ndcg@4: 0.979683\tvalid_0's ndcg@5: 0.97977\n", - "[100]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.978244\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 0.979725\tvalid_0's ndcg@5: 0.979812\n", - "Did not meet early stopping. 
Best iteration is:\n", - "[100]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.978244\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 0.979725\tvalid_0's ndcg@5: 0.979812\n", - "[1]\tvalid_0's ndcg@1: 0.910175\tvalid_0's ndcg@2: 0.963031\tvalid_0's ndcg@3: 0.965281\tvalid_0's ndcg@4: 0.965819\tvalid_0's ndcg@5: 0.965887\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's ndcg@1: 0.9141\tvalid_0's ndcg@2: 0.964748\tvalid_0's ndcg@3: 0.96681\tvalid_0's ndcg@4: 0.967316\tvalid_0's ndcg@5: 0.967394\n", - "[3]\tvalid_0's ndcg@1: 0.915925\tvalid_0's ndcg@2: 0.9655\tvalid_0's ndcg@3: 0.967575\tvalid_0's ndcg@4: 0.968028\tvalid_0's ndcg@5: 0.968105\n", - "[4]\tvalid_0's ndcg@1: 0.91915\tvalid_0's ndcg@2: 0.966943\tvalid_0's ndcg@3: 0.968968\tvalid_0's ndcg@4: 0.969334\tvalid_0's ndcg@5: 0.969373\n", - "[5]\tvalid_0's ndcg@1: 0.920625\tvalid_0's ndcg@2: 0.967598\tvalid_0's ndcg@3: 0.969498\tvalid_0's ndcg@4: 0.969896\tvalid_0's ndcg@5: 0.969944\n", - "[6]\tvalid_0's ndcg@1: 0.922625\tvalid_0's ndcg@2: 0.968336\tvalid_0's ndcg@3: 0.970261\tvalid_0's ndcg@4: 0.970659\tvalid_0's ndcg@5: 0.970688\n", - "[7]\tvalid_0's ndcg@1: 0.923625\tvalid_0's ndcg@2: 0.968768\tvalid_0's ndcg@3: 0.970656\tvalid_0's ndcg@4: 0.971043\tvalid_0's ndcg@5: 0.971072\n", - "[8]\tvalid_0's ndcg@1: 0.925825\tvalid_0's ndcg@2: 0.969612\tvalid_0's ndcg@3: 0.971462\tvalid_0's ndcg@4: 0.97186\tvalid_0's ndcg@5: 0.971879\n", - "[9]\tvalid_0's ndcg@1: 0.926475\tvalid_0's ndcg@2: 0.969899\tvalid_0's ndcg@3: 0.971711\tvalid_0's ndcg@4: 0.97211\tvalid_0's ndcg@5: 0.972129\n", - "[10]\tvalid_0's ndcg@1: 0.927775\tvalid_0's ndcg@2: 0.97041\tvalid_0's ndcg@3: 0.972185\tvalid_0's ndcg@4: 0.972594\tvalid_0's ndcg@5: 0.972614\n", - "[11]\tvalid_0's ndcg@1: 0.92885\tvalid_0's ndcg@2: 0.970838\tvalid_0's ndcg@3: 0.972588\tvalid_0's ndcg@4: 0.973008\tvalid_0's ndcg@5: 0.973028\n", - "[12]\tvalid_0's ndcg@1: 0.930325\tvalid_0's ndcg@2: 0.971367\tvalid_0's ndcg@3: 0.973129\tvalid_0's 
ndcg@4: 0.973549\tvalid_0's ndcg@5: 0.973569\n", - "[13]\tvalid_0's ndcg@1: 0.931125\tvalid_0's ndcg@2: 0.971631\tvalid_0's ndcg@3: 0.973443\tvalid_0's ndcg@4: 0.973842\tvalid_0's ndcg@5: 0.973871\n", - "[14]\tvalid_0's ndcg@1: 0.931525\tvalid_0's ndcg@2: 0.971778\tvalid_0's ndcg@3: 0.973616\tvalid_0's ndcg@4: 0.973993\tvalid_0's ndcg@5: 0.974022\n", - "[15]\tvalid_0's ndcg@1: 0.9311\tvalid_0's ndcg@2: 0.9717\tvalid_0's ndcg@3: 0.973475\tvalid_0's ndcg@4: 0.973852\tvalid_0's ndcg@5: 0.973872\n", - "[16]\tvalid_0's ndcg@1: 0.931775\tvalid_0's ndcg@2: 0.971902\tvalid_0's ndcg@3: 0.973702\tvalid_0's ndcg@4: 0.97409\tvalid_0's ndcg@5: 0.974109\n", - "[17]\tvalid_0's ndcg@1: 0.931425\tvalid_0's ndcg@2: 0.971805\tvalid_0's ndcg@3: 0.97358\tvalid_0's ndcg@4: 0.973967\tvalid_0's ndcg@5: 0.973986\n", - "[18]\tvalid_0's ndcg@1: 0.931575\tvalid_0's ndcg@2: 0.971876\tvalid_0's ndcg@3: 0.973651\tvalid_0's ndcg@4: 0.974027\tvalid_0's ndcg@5: 0.974047\n", - "[19]\tvalid_0's ndcg@1: 0.932\tvalid_0's ndcg@2: 0.97208\tvalid_0's ndcg@3: 0.973805\tvalid_0's ndcg@4: 0.974192\tvalid_0's ndcg@5: 0.974212\n", - "[20]\tvalid_0's ndcg@1: 0.932075\tvalid_0's ndcg@2: 0.972092\tvalid_0's ndcg@3: 0.973829\tvalid_0's ndcg@4: 0.974217\tvalid_0's ndcg@5: 0.974236\n", - "[21]\tvalid_0's ndcg@1: 0.932675\tvalid_0's ndcg@2: 0.972282\tvalid_0's ndcg@3: 0.974057\tvalid_0's ndcg@4: 0.974444\tvalid_0's ndcg@5: 0.974454\n", - "[22]\tvalid_0's ndcg@1: 0.932925\tvalid_0's ndcg@2: 0.972358\tvalid_0's ndcg@3: 0.974146\tvalid_0's ndcg@4: 0.974533\tvalid_0's ndcg@5: 0.974543\n", - "[23]\tvalid_0's ndcg@1: 0.93325\tvalid_0's ndcg@2: 0.972478\tvalid_0's ndcg@3: 0.974253\tvalid_0's ndcg@4: 0.974651\tvalid_0's ndcg@5: 0.974661\n", - "[24]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972539\tvalid_0's ndcg@3: 0.974351\tvalid_0's ndcg@4: 0.974739\tvalid_0's ndcg@5: 0.974749\n", - "[25]\tvalid_0's ndcg@1: 0.93475\tvalid_0's ndcg@2: 0.973\tvalid_0's ndcg@3: 0.974788\tvalid_0's ndcg@4: 0.975197\tvalid_0's ndcg@5: 
0.975206\n", - "[26]\tvalid_0's ndcg@1: 0.935075\tvalid_0's ndcg@2: 0.97312\tvalid_0's ndcg@3: 0.974895\tvalid_0's ndcg@4: 0.975315\tvalid_0's ndcg@5: 0.975325\n", - "[27]\tvalid_0's ndcg@1: 0.9349\tvalid_0's ndcg@2: 0.973103\tvalid_0's ndcg@3: 0.974865\tvalid_0's ndcg@4: 0.975264\tvalid_0's ndcg@5: 0.975273\n", - "[28]\tvalid_0's ndcg@1: 0.935075\tvalid_0's ndcg@2: 0.973152\tvalid_0's ndcg@3: 0.974939\tvalid_0's ndcg@4: 0.975327\tvalid_0's ndcg@5: 0.975336\n", - "[29]\tvalid_0's ndcg@1: 0.935475\tvalid_0's ndcg@2: 0.973315\tvalid_0's ndcg@3: 0.975128\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975492\n", - "[30]\tvalid_0's ndcg@1: 0.93595\tvalid_0's ndcg@2: 0.973522\tvalid_0's ndcg@3: 0.975297\tvalid_0's ndcg@4: 0.975663\tvalid_0's ndcg@5: 0.975673\n", - "[31]\tvalid_0's ndcg@1: 0.93595\tvalid_0's ndcg@2: 0.973506\tvalid_0's ndcg@3: 0.975281\tvalid_0's ndcg@4: 0.975658\tvalid_0's ndcg@5: 0.975668\n", - "[32]\tvalid_0's ndcg@1: 0.93675\tvalid_0's ndcg@2: 0.973833\tvalid_0's ndcg@3: 0.975595\tvalid_0's ndcg@4: 0.975961\tvalid_0's ndcg@5: 0.975971\n", - "[33]\tvalid_0's ndcg@1: 0.936475\tvalid_0's ndcg@2: 0.973763\tvalid_0's ndcg@3: 0.975488\tvalid_0's ndcg@4: 0.975865\tvalid_0's ndcg@5: 0.975874\n", - "[34]\tvalid_0's ndcg@1: 0.9367\tvalid_0's ndcg@2: 0.973893\tvalid_0's ndcg@3: 0.975568\tvalid_0's ndcg@4: 0.975956\tvalid_0's ndcg@5: 0.975966\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LGB排序模型" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[35]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.974059\tvalid_0's ndcg@3: 0.975722\tvalid_0's ndcg@4: 0.97612\tvalid_0's ndcg@5: 0.97613\n", - "[36]\tvalid_0's ndcg@1: 0.9374\tvalid_0's ndcg@2: 0.974183\tvalid_0's ndcg@3: 0.975846\tvalid_0's ndcg@4: 0.976223\tvalid_0's ndcg@5: 0.976232\n", - "[37]\tvalid_0's ndcg@1: 0.9374\tvalid_0's ndcg@2: 0.974183\tvalid_0's ndcg@3: 0.975846\tvalid_0's ndcg@4: 0.976223\tvalid_0's ndcg@5: 0.976232\n", - "[38]\tvalid_0's ndcg@1: 
0.938725\tvalid_0's ndcg@2: 0.974672\tvalid_0's ndcg@3: 0.97636\tvalid_0's ndcg@4: 0.976715\tvalid_0's ndcg@5: 0.976725\n", - "[39]\tvalid_0's ndcg@1: 0.93865\tvalid_0's ndcg@2: 0.974676\tvalid_0's ndcg@3: 0.976364\tvalid_0's ndcg@4: 0.976697\tvalid_0's ndcg@5: 0.976707\n", - "[40]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.974867\tvalid_0's ndcg@3: 0.97653\tvalid_0's ndcg@4: 0.976874\tvalid_0's ndcg@5: 0.976884\n", - "[41]\tvalid_0's ndcg@1: 0.9396\tvalid_0's ndcg@2: 0.975042\tvalid_0's ndcg@3: 0.976705\tvalid_0's ndcg@4: 0.97705\tvalid_0's ndcg@5: 0.977059\n", - "[42]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.975072\tvalid_0's ndcg@3: 0.976784\tvalid_0's ndcg@4: 0.977129\tvalid_0's ndcg@5: 0.977138\n", - "[43]\tvalid_0's ndcg@1: 0.940075\tvalid_0's ndcg@2: 0.97517\tvalid_0's ndcg@3: 0.97687\tvalid_0's ndcg@4: 0.977215\tvalid_0's ndcg@5: 0.977225\n", - "[44]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.97534\tvalid_0's ndcg@3: 0.977015\tvalid_0's ndcg@4: 0.97736\tvalid_0's ndcg@5: 0.97737\n", - "[45]\tvalid_0's ndcg@1: 0.94055\tvalid_0's ndcg@2: 0.975409\tvalid_0's ndcg@3: 0.977059\tvalid_0's ndcg@4: 0.977403\tvalid_0's ndcg@5: 0.977413\n", - "[46]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975415\tvalid_0's ndcg@3: 0.97704\tvalid_0's ndcg@4: 0.977396\tvalid_0's ndcg@5: 0.977405\n", - "[47]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975363\tvalid_0's ndcg@3: 0.977013\tvalid_0's ndcg@4: 0.977357\tvalid_0's ndcg@5: 0.977367\n", - "[48]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975388\tvalid_0's ndcg@3: 0.977025\tvalid_0's ndcg@4: 0.97737\tvalid_0's ndcg@5: 0.977379\n", - "[49]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975447\tvalid_0's ndcg@3: 0.977097\tvalid_0's ndcg@4: 0.977409\tvalid_0's ndcg@5: 0.977419\n", - "[50]\tvalid_0's ndcg@1: 0.941075\tvalid_0's ndcg@2: 0.975666\tvalid_0's ndcg@3: 0.977303\tvalid_0's ndcg@4: 0.977615\tvalid_0's ndcg@5: 0.977625\n", - "[51]\tvalid_0's ndcg@1: 0.94135\tvalid_0's ndcg@2: 
0.975751\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.97771\tvalid_0's ndcg@5: 0.97772\n", - "[52]\tvalid_0's ndcg@1: 0.9413\tvalid_0's ndcg@2: 0.975717\tvalid_0's ndcg@3: 0.977355\tvalid_0's ndcg@4: 0.977688\tvalid_0's ndcg@5: 0.977698\n", - "[53]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.975713\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.977699\tvalid_0's ndcg@5: 0.977718\n", - "[54]\tvalid_0's ndcg@1: 0.94185\tvalid_0's ndcg@2: 0.975857\tvalid_0's ndcg@3: 0.977557\tvalid_0's ndcg@4: 0.977869\tvalid_0's ndcg@5: 0.977889\n", - "[55]\tvalid_0's ndcg@1: 0.941925\tvalid_0's ndcg@2: 0.975837\tvalid_0's ndcg@3: 0.9776\tvalid_0's ndcg@4: 0.977891\tvalid_0's ndcg@5: 0.97791\n", - "[56]\tvalid_0's ndcg@1: 0.942325\tvalid_0's ndcg@2: 0.975969\tvalid_0's ndcg@3: 0.977719\tvalid_0's ndcg@4: 0.978032\tvalid_0's ndcg@5: 0.978051\n", - "[57]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976022\tvalid_0's ndcg@3: 0.977772\tvalid_0's ndcg@4: 0.978073\tvalid_0's ndcg@5: 0.978093\n", - "[58]\tvalid_0's ndcg@1: 0.9425\tvalid_0's ndcg@2: 0.976081\tvalid_0's ndcg@3: 0.977806\tvalid_0's ndcg@4: 0.978108\tvalid_0's ndcg@5: 0.978127\n", - "[59]\tvalid_0's ndcg@1: 0.9424\tvalid_0's ndcg@2: 0.976076\tvalid_0's ndcg@3: 0.977788\tvalid_0's ndcg@4: 0.978079\tvalid_0's ndcg@5: 0.978098\n", - "[60]\tvalid_0's ndcg@1: 0.942375\tvalid_0's ndcg@2: 0.976067\tvalid_0's ndcg@3: 0.977779\tvalid_0's ndcg@4: 0.97807\tvalid_0's ndcg@5: 0.978089\n", - "[61]\tvalid_0's ndcg@1: 0.942225\tvalid_0's ndcg@2: 0.976043\tvalid_0's ndcg@3: 0.97773\tvalid_0's ndcg@4: 0.978021\tvalid_0's ndcg@5: 0.97804\n", - "[62]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976117\tvalid_0's ndcg@3: 0.977792\tvalid_0's ndcg@4: 0.978093\tvalid_0's ndcg@5: 0.978112\n", - "[63]\tvalid_0's ndcg@1: 0.942675\tvalid_0's ndcg@2: 0.976193\tvalid_0's ndcg@3: 0.977881\tvalid_0's ndcg@4: 0.978182\tvalid_0's ndcg@5: 0.978201\n", - "[64]\tvalid_0's ndcg@1: 0.942925\tvalid_0's ndcg@2: 0.976254\tvalid_0's ndcg@3: 
0.977966\tvalid_0's ndcg@4: 0.978268\tvalid_0's ndcg@5: 0.978287\n", - "[65]\tvalid_0's ndcg@1: 0.9431\tvalid_0's ndcg@2: 0.97635\tvalid_0's ndcg@3: 0.978025\tvalid_0's ndcg@4: 0.978337\tvalid_0's ndcg@5: 0.978357\n", - "[66]\tvalid_0's ndcg@1: 0.9434\tvalid_0's ndcg@2: 0.976445\tvalid_0's ndcg@3: 0.978132\tvalid_0's ndcg@4: 0.978445\tvalid_0's ndcg@5: 0.978464\n", - "[67]\tvalid_0's ndcg@1: 0.943275\tvalid_0's ndcg@2: 0.976399\tvalid_0's ndcg@3: 0.978074\tvalid_0's ndcg@4: 0.978397\tvalid_0's ndcg@5: 0.978416\n", - "[68]\tvalid_0's ndcg@1: 0.943325\tvalid_0's ndcg@2: 0.976401\tvalid_0's ndcg@3: 0.978089\tvalid_0's ndcg@4: 0.978412\tvalid_0's ndcg@5: 0.978431\n", - "[69]\tvalid_0's ndcg@1: 0.943675\tvalid_0's ndcg@2: 0.976578\tvalid_0's ndcg@3: 0.97819\tvalid_0's ndcg@4: 0.978546\tvalid_0's ndcg@5: 0.978565\n", - "[70]\tvalid_0's ndcg@1: 0.944025\tvalid_0's ndcg@2: 0.976707\tvalid_0's ndcg@3: 0.97832\tvalid_0's ndcg@4: 0.978675\tvalid_0's ndcg@5: 0.978694\n", - "[71]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.976772\tvalid_0's ndcg@3: 0.978384\tvalid_0's ndcg@4: 0.97874\tvalid_0's ndcg@5: 0.978759\n", - "[72]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.976822\tvalid_0's ndcg@3: 0.978409\tvalid_0's ndcg@4: 0.978765\tvalid_0's ndcg@5: 0.978784\n", - "[73]\tvalid_0's ndcg@1: 0.94445\tvalid_0's ndcg@2: 0.976864\tvalid_0's ndcg@3: 0.978464\tvalid_0's ndcg@4: 0.97883\tvalid_0's ndcg@5: 0.978849\n", - "[74]\tvalid_0's ndcg@1: 0.9446\tvalid_0's ndcg@2: 0.976919\tvalid_0's ndcg@3: 0.978519\tvalid_0's ndcg@4: 0.978885\tvalid_0's ndcg@5: 0.978905\n", - "[75]\tvalid_0's ndcg@1: 0.9446\tvalid_0's ndcg@2: 0.976919\tvalid_0's ndcg@3: 0.978519\tvalid_0's ndcg@4: 0.978885\tvalid_0's ndcg@5: 0.978905\n", - "[76]\tvalid_0's ndcg@1: 0.944625\tvalid_0's ndcg@2: 0.97696\tvalid_0's ndcg@3: 0.978535\tvalid_0's ndcg@4: 0.978901\tvalid_0's ndcg@5: 0.978921\n", - "[77]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.976979\tvalid_0's ndcg@3: 0.978554\tvalid_0's ndcg@4: 
0.97892\tvalid_0's ndcg@5: 0.978939\n", - "[78]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.976979\tvalid_0's ndcg@3: 0.978554\tvalid_0's ndcg@4: 0.97892\tvalid_0's ndcg@5: 0.978939\n", - "[79]\tvalid_0's ndcg@1: 0.944525\tvalid_0's ndcg@2: 0.976907\tvalid_0's ndcg@3: 0.978507\tvalid_0's ndcg@4: 0.978863\tvalid_0's ndcg@5: 0.978882\n", - "[80]\tvalid_0's ndcg@1: 0.94455\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.97851\tvalid_0's ndcg@4: 0.978865\tvalid_0's ndcg@5: 0.978885\n", - "[81]\tvalid_0's ndcg@1: 0.944725\tvalid_0's ndcg@2: 0.97695\tvalid_0's ndcg@3: 0.978575\tvalid_0's ndcg@4: 0.978919\tvalid_0's ndcg@5: 0.978948\n", - "[82]\tvalid_0's ndcg@1: 0.945225\tvalid_0's ndcg@2: 0.977103\tvalid_0's ndcg@3: 0.978765\tvalid_0's ndcg@4: 0.97911\tvalid_0's ndcg@5: 0.979129\n", - "[83]\tvalid_0's ndcg@1: 0.945125\tvalid_0's ndcg@2: 0.977066\tvalid_0's ndcg@3: 0.978716\tvalid_0's ndcg@4: 0.979071\tvalid_0's ndcg@5: 0.97909\n", - "[84]\tvalid_0's ndcg@1: 0.945225\tvalid_0's ndcg@2: 0.97715\tvalid_0's ndcg@3: 0.978775\tvalid_0's ndcg@4: 0.97912\tvalid_0's ndcg@5: 0.979139\n", - "[85]\tvalid_0's ndcg@1: 0.945025\tvalid_0's ndcg@2: 0.977092\tvalid_0's ndcg@3: 0.978692\tvalid_0's ndcg@4: 0.979047\tvalid_0's ndcg@5: 0.979067\n", - "[86]\tvalid_0's ndcg@1: 0.9452\tvalid_0's ndcg@2: 0.977172\tvalid_0's ndcg@3: 0.97876\tvalid_0's ndcg@4: 0.979115\tvalid_0's ndcg@5: 0.979135\n", - "[87]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977178\tvalid_0's ndcg@3: 0.97879\tvalid_0's ndcg@4: 0.979156\tvalid_0's ndcg@5: 0.979166\n", - "[88]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977178\tvalid_0's ndcg@3: 0.978815\tvalid_0's ndcg@4: 0.979149\tvalid_0's ndcg@5: 0.979168\n", - "[89]\tvalid_0's ndcg@1: 0.94555\tvalid_0's ndcg@2: 0.977333\tvalid_0's ndcg@3: 0.978933\tvalid_0's ndcg@4: 0.979267\tvalid_0's ndcg@5: 0.979277\n", - "[90]\tvalid_0's ndcg@1: 0.9459\tvalid_0's ndcg@2: 0.977462\tvalid_0's ndcg@3: 0.979062\tvalid_0's ndcg@4: 0.979396\tvalid_0's ndcg@5: 0.979406\n", - 
"[91]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977481\tvalid_0's ndcg@3: 0.979081\tvalid_0's ndcg@4: 0.979414\tvalid_0's ndcg@5: 0.979424\n", - "[92]\tvalid_0's ndcg@1: 0.945875\tvalid_0's ndcg@2: 0.977437\tvalid_0's ndcg@3: 0.97905\tvalid_0's ndcg@4: 0.979384\tvalid_0's ndcg@5: 0.979393\n", - "[93]\tvalid_0's ndcg@1: 0.945875\tvalid_0's ndcg@2: 0.977421\tvalid_0's ndcg@3: 0.979046\tvalid_0's ndcg@4: 0.97938\tvalid_0's ndcg@5: 0.97939\n", - "[94]\tvalid_0's ndcg@1: 0.9459\tvalid_0's ndcg@2: 0.977431\tvalid_0's ndcg@3: 0.979068\tvalid_0's ndcg@4: 0.979391\tvalid_0's ndcg@5: 0.979401\n", - "[95]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977449\tvalid_0's ndcg@3: 0.979074\tvalid_0's ndcg@4: 0.979408\tvalid_0's ndcg@5: 0.979418\n", - "[96]\tvalid_0's ndcg@1: 0.946075\tvalid_0's ndcg@2: 0.977527\tvalid_0's ndcg@3: 0.979127\tvalid_0's ndcg@4: 0.979461\tvalid_0's ndcg@5: 0.97947\n" - ] + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:07.787698Z", + "start_time": "2020-11-18T04:21:07.536514Z" + } + }, + "outputs": [], + "source": [ + "# 防止中间出错之后重新读取数据\n", + "trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()\n", + "\n", + "if offline:\n", + " val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()\n", + " \n", + "tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[97]\tvalid_0's ndcg@1: 0.946375\tvalid_0's ndcg@2: 0.977622\tvalid_0's ndcg@3: 0.979222\tvalid_0's ndcg@4: 0.979577\tvalid_0's ndcg@5: 0.979577\n", - "[98]\tvalid_0's ndcg@1: 0.946625\tvalid_0's ndcg@2: 0.977714\tvalid_0's ndcg@3: 0.979339\tvalid_0's ndcg@4: 0.979673\tvalid_0's ndcg@5: 0.979673\n", - "[99]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.977739\tvalid_0's ndcg@3: 0.979352\tvalid_0's ndcg@4: 0.979685\tvalid_0's ndcg@5: 0.979685\n", - "[100]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.97778\tvalid_0's 
ndcg@3: 0.97938\tvalid_0's ndcg@4: 0.979703\tvalid_0's ndcg@5: 0.979703\n", - "Did not meet early stopping. Best iteration is:\n", - "[100]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.97778\tvalid_0's ndcg@3: 0.97938\tvalid_0's ndcg@4: 0.979703\tvalid_0's ndcg@5: 0.979703\n" - ] - } - ], - "source": [ - "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", - "# 这一部分与前面的单独训练和验证是分开的\n", - "def get_kfold_users(trn_df, n=5):\n", - " user_ids = trn_df['user_id'].unique()\n", - " user_set = [user_ids[i::n] for i in range(n)]\n", - " return user_set\n", - "\n", - "k_fold = 5\n", - "trn_df = trn_user_item_feats_df_rank_model\n", - "user_set = get_kfold_users(trn_df, n=k_fold)\n", - "\n", - "score_list = []\n", - "score_df = trn_df[['user_id', 'click_article_id','label']]\n", - "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", - "\n", - "# 五折交叉验证,并将中间结果保存用于staking\n", - "for n_fold, valid_user in enumerate(user_set):\n", - " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", - " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", - " \n", - " # 训练集与验证集的用户分组\n", - " train_idx.sort_values(by=['user_id'], inplace=True)\n", - " g_train = train_idx.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", - " \n", - " valid_idx.sort_values(by=['user_id'], inplace=True)\n", - " g_val = valid_idx.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", - " \n", - " # 定义模型\n", - " lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", - " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", - " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16) \n", - " # 训练模型\n", - " lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,\n", - " eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], \n", - " eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )\n", - " \n", - " # 
预测验证集结果\n", - " valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)\n", - " \n", - " # 对输出结果进行归一化\n", - " valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))\n", - " \n", - " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", - " valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - " \n", - " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", - " score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", - " \n", - " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", - " if not offline:\n", - " sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], lgb_ranker.best_iteration_)\n", - " \n", - "score_df_ = pd.concat(score_list, axis=0)\n", - "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n", - "# 保存训练集交叉验证产生的新特征\n", - "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)\n", - " \n", - "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", - "tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold\n", - "tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))\n", - "tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])\n", - "tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - "\n", - "# 保存测试集交叉验证的新特征\n", - "tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:22:52.604397Z", - "start_time": 
"2020-11-18T04:22:43.253034Z" - } - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "# 单模型生成提交结果\n", - "rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]\n", - "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", - "submit(rank_results, topk=5, model_name='lgb_ranker')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## LGB分类模型" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:22:58.259730Z", - "start_time": "2020-11-18T04:22:58.254297Z" - } - }, - "outputs": [], - "source": [ - "# 模型及参数的定义\n", - "lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", - " max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", - " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10) " - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:23:11.258774Z", - "start_time": "2020-11-18T04:23:00.861936Z" - } - }, - "outputs": [], - "source": [ - "# 模型训练\n", - "if offline:\n", - " lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],\n", - " eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], \n", - " eval_metric=['auc', ],early_stopping_rounds=50, )\n", - "else:\n", - " lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:23:19.591396Z", - "start_time": "2020-11-18T04:23:13.813850Z" - } - }, - "outputs": [], - "source": [ - "# 模型预测\n", - "tst_user_item_feats_df['pred_score'] = 
lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]\n", - "\n", - "# 将这里的排序结果保存一份,用户后面的模型融合\n", - "tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:23:32.352931Z", - "start_time": "2020-11-18T04:23:22.346609Z" - } - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]\n", - "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", - "submit(rank_results, topk=5, model_name='lgb_cls')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:24:11.241196Z", - "start_time": "2020-11-18T04:23:41.377394Z" + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:10.839656Z", + "start_time": "2020-11-18T04:21:10.833109Z" + } + }, + "outputs": [], + "source": [ + "# 定义特征列\n", + "lgb_cols = ['sim0', 'time_diff0', 'word_diff0','sim_max', 'sim_min', 'sim_sum', \n", + " 'sim_mean', 'score','click_size', 'time_diff_mean', 'active_level',\n", + " 'click_environment','click_deviceGroup', 'click_os', 'click_country', \n", + " 'click_region','click_referrer_type', 'user_time_hob1', 'user_time_hob2',\n", + " 'words_hbo', 'category_id', 'created_at_ts','words_count']" + ] }, - "scrolled": true - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1]\tvalid_0's auc: 0.764896\tvalid_0's binary_logloss: 0.522153\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's auc: 0.767857\tvalid_0's binary_logloss: 0.52057\n", - "[3]\tvalid_0's auc: 
0.783096\tvalid_0's binary_logloss: 0.519584\n", - "[4]\tvalid_0's auc: 0.784354\tvalid_0's binary_logloss: 0.518485\n", - "[5]\tvalid_0's auc: 0.790554\tvalid_0's binary_logloss: 0.516886\n", - "[6]\tvalid_0's auc: 0.791954\tvalid_0's binary_logloss: 0.515334\n", - "[7]\tvalid_0's auc: 0.794257\tvalid_0's binary_logloss: 0.514032\n", - "[8]\tvalid_0's auc: 0.795222\tvalid_0's binary_logloss: 0.512516\n", - "[9]\tvalid_0's auc: 0.795417\tvalid_0's binary_logloss: 0.511671\n", - "[10]\tvalid_0's auc: 0.795913\tvalid_0's binary_logloss: 0.510226\n", - "[11]\tvalid_0's auc: 0.798222\tvalid_0's binary_logloss: 0.508858\n", - "[12]\tvalid_0's auc: 0.79825\tvalid_0's binary_logloss: 0.507928\n", - "[13]\tvalid_0's auc: 0.798842\tvalid_0's binary_logloss: 0.50708\n", - "[14]\tvalid_0's auc: 0.798935\tvalid_0's binary_logloss: 0.505752\n", - "[15]\tvalid_0's auc: 0.799543\tvalid_0's binary_logloss: 0.504388\n", - "[16]\tvalid_0's auc: 0.800844\tvalid_0's binary_logloss: 0.503126\n", - "[17]\tvalid_0's auc: 0.800855\tvalid_0's binary_logloss: 0.501809\n", - "[18]\tvalid_0's auc: 0.801653\tvalid_0's binary_logloss: 0.500676\n", - "[19]\tvalid_0's auc: 0.801518\tvalid_0's binary_logloss: 0.49987\n", - "[20]\tvalid_0's auc: 0.801662\tvalid_0's binary_logloss: 0.498625\n", - "[21]\tvalid_0's auc: 0.802093\tvalid_0's binary_logloss: 0.498113\n", - "[22]\tvalid_0's auc: 0.803071\tvalid_0's binary_logloss: 0.496933\n", - "[23]\tvalid_0's auc: 0.803222\tvalid_0's binary_logloss: 0.495864\n", - "[24]\tvalid_0's auc: 0.802927\tvalid_0's binary_logloss: 0.494691\n", - "[25]\tvalid_0's auc: 0.802581\tvalid_0's binary_logloss: 0.493543\n", - "[26]\tvalid_0's auc: 0.802965\tvalid_0's binary_logloss: 0.492444\n", - "[27]\tvalid_0's auc: 0.80298\tvalid_0's binary_logloss: 0.491336\n", - "[28]\tvalid_0's auc: 0.803226\tvalid_0's binary_logloss: 0.490275\n", - "[29]\tvalid_0's auc: 0.803436\tvalid_0's binary_logloss: 0.489126\n", - "[30]\tvalid_0's auc: 0.803796\tvalid_0's binary_logloss: 
0.48802\n", - "[31]\tvalid_0's auc: 0.803601\tvalid_0's binary_logloss: 0.486988\n", - "[32]\tvalid_0's auc: 0.804416\tvalid_0's binary_logloss: 0.485972\n", - "[33]\tvalid_0's auc: 0.804529\tvalid_0's binary_logloss: 0.484939\n", - "[34]\tvalid_0's auc: 0.804534\tvalid_0's binary_logloss: 0.483927\n", - "[35]\tvalid_0's auc: 0.804819\tvalid_0's binary_logloss: 0.483271\n", - "[36]\tvalid_0's auc: 0.804774\tvalid_0's binary_logloss: 0.482273\n", - "[37]\tvalid_0's auc: 0.805237\tvalid_0's binary_logloss: 0.481639\n", - "[38]\tvalid_0's auc: 0.805546\tvalid_0's binary_logloss: 0.480959\n", - "[39]\tvalid_0's auc: 0.805598\tvalid_0's binary_logloss: 0.479955\n", - "[40]\tvalid_0's auc: 0.806011\tvalid_0's binary_logloss: 0.47903\n", - "[41]\tvalid_0's auc: 0.806664\tvalid_0's binary_logloss: 0.478439\n", - "[42]\tvalid_0's auc: 0.807021\tvalid_0's binary_logloss: 0.477798\n", - "[43]\tvalid_0's auc: 0.80726\tvalid_0's binary_logloss: 0.476829\n", - "[44]\tvalid_0's auc: 0.807157\tvalid_0's binary_logloss: 0.475976\n", - "[45]\tvalid_0's auc: 0.807788\tvalid_0's binary_logloss: 0.475056\n", - "[46]\tvalid_0's auc: 0.80805\tvalid_0's binary_logloss: 0.474446\n", - "[47]\tvalid_0's auc: 0.808097\tvalid_0's binary_logloss: 0.473576\n", - "[48]\tvalid_0's auc: 0.80815\tvalid_0's binary_logloss: 0.472676\n", - "[49]\tvalid_0's auc: 0.808304\tvalid_0's binary_logloss: 0.471918\n", - "[50]\tvalid_0's auc: 0.808749\tvalid_0's binary_logloss: 0.471481\n", - "[51]\tvalid_0's auc: 0.808972\tvalid_0's binary_logloss: 0.471104\n", - "[52]\tvalid_0's auc: 0.809326\tvalid_0's binary_logloss: 0.470289\n", - "[53]\tvalid_0's auc: 0.809472\tvalid_0's binary_logloss: 0.469508\n", - "[54]\tvalid_0's auc: 0.809505\tvalid_0's binary_logloss: 0.46869\n", - "[55]\tvalid_0's auc: 0.809594\tvalid_0's binary_logloss: 0.467885\n", - "[56]\tvalid_0's auc: 0.809847\tvalid_0's binary_logloss: 0.467356\n", - "[57]\tvalid_0's auc: 0.810262\tvalid_0's binary_logloss: 0.466531\n", - "[58]\tvalid_0's 
auc: 0.810407\tvalid_0's binary_logloss: 0.46573\n", - "[59]\tvalid_0's auc: 0.810618\tvalid_0's binary_logloss: 0.465205\n", - "[60]\tvalid_0's auc: 0.81066\tvalid_0's binary_logloss: 0.464435\n", - "[61]\tvalid_0's auc: 0.810638\tvalid_0's binary_logloss: 0.463721\n", - "[62]\tvalid_0's auc: 0.810658\tvalid_0's binary_logloss: 0.462982\n", - "[63]\tvalid_0's auc: 0.811106\tvalid_0's binary_logloss: 0.462246\n", - "[64]\tvalid_0's auc: 0.811313\tvalid_0's binary_logloss: 0.461748\n", - "[65]\tvalid_0's auc: 0.811351\tvalid_0's binary_logloss: 0.461038\n", - "[66]\tvalid_0's auc: 0.811433\tvalid_0's binary_logloss: 0.460323\n", - "[67]\tvalid_0's auc: 0.81158\tvalid_0's binary_logloss: 0.459662\n", - "[68]\tvalid_0's auc: 0.811561\tvalid_0's binary_logloss: 0.458988\n", - "[69]\tvalid_0's auc: 0.811748\tvalid_0's binary_logloss: 0.458592\n", - "[70]\tvalid_0's auc: 0.811919\tvalid_0's binary_logloss: 0.457934\n", - "[71]\tvalid_0's auc: 0.812073\tvalid_0's binary_logloss: 0.457508\n", - "[72]\tvalid_0's auc: 0.812273\tvalid_0's binary_logloss: 0.457038\n", - "[73]\tvalid_0's auc: 0.812561\tvalid_0's binary_logloss: 0.456439\n", - "[74]\tvalid_0's auc: 0.812633\tvalid_0's binary_logloss: 0.455789\n", - "[75]\tvalid_0's auc: 0.812757\tvalid_0's binary_logloss: 0.455173\n", - "[76]\tvalid_0's auc: 0.812923\tvalid_0's binary_logloss: 0.454533\n", - "[77]\tvalid_0's auc: 0.81295\tvalid_0's binary_logloss: 0.45392\n", - "[78]\tvalid_0's auc: 0.813073\tvalid_0's binary_logloss: 0.453517\n", - "[79]\tvalid_0's auc: 0.813202\tvalid_0's binary_logloss: 0.452932\n", - "[80]\tvalid_0's auc: 0.813611\tvalid_0's binary_logloss: 0.452285\n", - "[81]\tvalid_0's auc: 0.813769\tvalid_0's binary_logloss: 0.45191\n", - "[82]\tvalid_0's auc: 0.814468\tvalid_0's binary_logloss: 0.451455\n", - "[83]\tvalid_0's auc: 0.814656\tvalid_0's binary_logloss: 0.450885\n", - "[84]\tvalid_0's auc: 0.814755\tvalid_0's binary_logloss: 0.450308\n", - "[85]\tvalid_0's auc: 0.814824\tvalid_0's 
binary_logloss: 0.449739\n", - "[86]\tvalid_0's auc: 0.81499\tvalid_0's binary_logloss: 0.449348\n", - "[87]\tvalid_0's auc: 0.815232\tvalid_0's binary_logloss: 0.448759\n", - "[88]\tvalid_0's auc: 0.815452\tvalid_0's binary_logloss: 0.44823\n", - "[89]\tvalid_0's auc: 0.815593\tvalid_0's binary_logloss: 0.447861\n", - "[90]\tvalid_0's auc: 0.815591\tvalid_0's binary_logloss: 0.447323\n", - "[91]\tvalid_0's auc: 0.815672\tvalid_0's binary_logloss: 0.446796\n", - "[92]\tvalid_0's auc: 0.815875\tvalid_0's binary_logloss: 0.446472\n", - "[93]\tvalid_0's auc: 0.815984\tvalid_0's binary_logloss: 0.445961\n", - "[94]\tvalid_0's auc: 0.816026\tvalid_0's binary_logloss: 0.445439\n", - "[95]\tvalid_0's auc: 0.816172\tvalid_0's binary_logloss: 0.444909\n", - "[96]\tvalid_0's auc: 0.816321\tvalid_0's binary_logloss: 0.444413\n", - "[97]\tvalid_0's auc: 0.816751\tvalid_0's binary_logloss: 0.44405\n", - "[98]\tvalid_0's auc: 0.817226\tvalid_0's binary_logloss: 0.443626\n", - "[99]\tvalid_0's auc: 0.817286\tvalid_0's binary_logloss: 0.443136\n", - "[100]\tvalid_0's auc: 0.817391\tvalid_0's binary_logloss: 0.442854\n", - "Did not meet early stopping. 
Best iteration is:\n", - "[100]\tvalid_0's auc: 0.817391\tvalid_0's binary_logloss: 0.442854\n", - "[1]\tvalid_0's auc: 0.771584\tvalid_0's binary_logloss: 0.527139\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's auc: 0.775446\tvalid_0's binary_logloss: 0.525462\n", - "[3]\tvalid_0's auc: 0.790092\tvalid_0's binary_logloss: 0.524461\n", - "[4]\tvalid_0's auc: 0.791432\tvalid_0's binary_logloss: 0.523322\n", - "[5]\tvalid_0's auc: 0.797482\tvalid_0's binary_logloss: 0.521614\n", - "[6]\tvalid_0's auc: 0.79893\tvalid_0's binary_logloss: 0.520007\n", - "[7]\tvalid_0's auc: 0.800753\tvalid_0's binary_logloss: 0.5187\n", - "[8]\tvalid_0's auc: 0.802197\tvalid_0's binary_logloss: 0.517125\n", - "[9]\tvalid_0's auc: 0.802828\tvalid_0's binary_logloss: 0.516269\n", - "[10]\tvalid_0's auc: 0.803496\tvalid_0's binary_logloss: 0.51474\n", - "[11]\tvalid_0's auc: 0.804972\tvalid_0's binary_logloss: 0.513321\n", - "[12]\tvalid_0's auc: 0.804995\tvalid_0's binary_logloss: 0.512334\n", - "[13]\tvalid_0's auc: 0.80525\tvalid_0's binary_logloss: 0.51151\n", - "[14]\tvalid_0's auc: 0.805026\tvalid_0's binary_logloss: 0.510149\n", - "[15]\tvalid_0's auc: 0.805622\tvalid_0's binary_logloss: 0.508708\n", - "[16]\tvalid_0's auc: 0.806974\tvalid_0's binary_logloss: 0.507384\n", - "[17]\tvalid_0's auc: 0.807045\tvalid_0's binary_logloss: 0.506017\n", - "[18]\tvalid_0's auc: 0.807265\tvalid_0's binary_logloss: 0.504853\n", - "[19]\tvalid_0's auc: 0.807126\tvalid_0's binary_logloss: 0.503972\n", - "[20]\tvalid_0's auc: 0.806948\tvalid_0's binary_logloss: 0.502693\n", - "[21]\tvalid_0's auc: 0.807315\tvalid_0's binary_logloss: 0.502166\n", - "[22]\tvalid_0's auc: 0.808067\tvalid_0's binary_logloss: 0.500948\n", - "[23]\tvalid_0's auc: 0.808226\tvalid_0's binary_logloss: 0.49987\n", - "[24]\tvalid_0's auc: 0.808268\tvalid_0's binary_logloss: 0.498623\n", - "[25]\tvalid_0's auc: 0.808569\tvalid_0's binary_logloss: 0.497389\n", - "[26]\tvalid_0's auc: 
0.809069\tvalid_0's binary_logloss: 0.49624\n", - "[27]\tvalid_0's auc: 0.809312\tvalid_0's binary_logloss: 0.495095\n", - "[28]\tvalid_0's auc: 0.809549\tvalid_0's binary_logloss: 0.494012\n", - "[29]\tvalid_0's auc: 0.809944\tvalid_0's binary_logloss: 0.492834\n", - "[30]\tvalid_0's auc: 0.810047\tvalid_0's binary_logloss: 0.491735\n", - "[31]\tvalid_0's auc: 0.810086\tvalid_0's binary_logloss: 0.490633\n" - ] + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:14.126608Z", + "start_time": "2020-11-18T04:21:13.493653Z" + } + }, + "outputs": [], + "source": [ + "# 排序模型分组\n", + "trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)\n", + "g_train = trn_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", + "\n", + "if offline:\n", + " val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)\n", + " g_val = val_user_item_feats_df_rank_model.groupby(['user_id'], as_index=False).count()[\"label\"].values" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[32]\tvalid_0's auc: 0.810566\tvalid_0's binary_logloss: 0.489595\n", - "[33]\tvalid_0's auc: 0.810539\tvalid_0's binary_logloss: 0.488536\n", - "[34]\tvalid_0's auc: 0.810529\tvalid_0's binary_logloss: 0.487489\n", - "[35]\tvalid_0's auc: 0.810932\tvalid_0's binary_logloss: 0.486775\n", - "[36]\tvalid_0's auc: 0.810769\tvalid_0's binary_logloss: 0.48577\n", - "[37]\tvalid_0's auc: 0.811363\tvalid_0's binary_logloss: 0.485123\n", - "[38]\tvalid_0's auc: 0.811801\tvalid_0's binary_logloss: 0.484413\n", - "[39]\tvalid_0's auc: 0.811987\tvalid_0's binary_logloss: 0.483371\n", - "[40]\tvalid_0's auc: 0.812268\tvalid_0's binary_logloss: 0.482407\n", - "[41]\tvalid_0's auc: 0.813297\tvalid_0's binary_logloss: 0.481742\n", - "[42]\tvalid_0's auc: 0.813453\tvalid_0's binary_logloss: 0.481108\n", - "[43]\tvalid_0's auc: 0.813603\tvalid_0's binary_logloss: 
0.480163\n", - "[44]\tvalid_0's auc: 0.813654\tvalid_0's binary_logloss: 0.479239\n", - "[45]\tvalid_0's auc: 0.814267\tvalid_0's binary_logloss: 0.478299\n", - "[46]\tvalid_0's auc: 0.81455\tvalid_0's binary_logloss: 0.477678\n", - "[47]\tvalid_0's auc: 0.81452\tvalid_0's binary_logloss: 0.476766\n", - "[48]\tvalid_0's auc: 0.814925\tvalid_0's binary_logloss: 0.475815\n", - "[49]\tvalid_0's auc: 0.814907\tvalid_0's binary_logloss: 0.47503\n", - "[50]\tvalid_0's auc: 0.815278\tvalid_0's binary_logloss: 0.474588\n", - "[51]\tvalid_0's auc: 0.815535\tvalid_0's binary_logloss: 0.474171\n", - "[52]\tvalid_0's auc: 0.815685\tvalid_0's binary_logloss: 0.473335\n", - "[53]\tvalid_0's auc: 0.815787\tvalid_0's binary_logloss: 0.472509\n", - "[54]\tvalid_0's auc: 0.815827\tvalid_0's binary_logloss: 0.471686\n", - "[55]\tvalid_0's auc: 0.815871\tvalid_0's binary_logloss: 0.470838\n", - "[56]\tvalid_0's auc: 0.816238\tvalid_0's binary_logloss: 0.470285\n", - "[57]\tvalid_0's auc: 0.816269\tvalid_0's binary_logloss: 0.469495\n", - "[58]\tvalid_0's auc: 0.816528\tvalid_0's binary_logloss: 0.468654\n", - "[59]\tvalid_0's auc: 0.816706\tvalid_0's binary_logloss: 0.468122\n", - "[60]\tvalid_0's auc: 0.816821\tvalid_0's binary_logloss: 0.467352\n", - "[61]\tvalid_0's auc: 0.816759\tvalid_0's binary_logloss: 0.466622\n", - "[62]\tvalid_0's auc: 0.81682\tvalid_0's binary_logloss: 0.465867\n", - "[63]\tvalid_0's auc: 0.817251\tvalid_0's binary_logloss: 0.465112\n", - "[64]\tvalid_0's auc: 0.817476\tvalid_0's binary_logloss: 0.464589\n", - "[65]\tvalid_0's auc: 0.817613\tvalid_0's binary_logloss: 0.463831\n", - "[66]\tvalid_0's auc: 0.817648\tvalid_0's binary_logloss: 0.463098\n", - "[67]\tvalid_0's auc: 0.817719\tvalid_0's binary_logloss: 0.462414\n", - "[68]\tvalid_0's auc: 0.817814\tvalid_0's binary_logloss: 0.461727\n", - "[69]\tvalid_0's auc: 0.817973\tvalid_0's binary_logloss: 0.461329\n", - "[70]\tvalid_0's auc: 0.818108\tvalid_0's binary_logloss: 0.460674\n", - "[71]\tvalid_0's 
auc: 0.818347\tvalid_0's binary_logloss: 0.460222\n", - "[72]\tvalid_0's auc: 0.818456\tvalid_0's binary_logloss: 0.45977\n", - "[73]\tvalid_0's auc: 0.818727\tvalid_0's binary_logloss: 0.459157\n", - "[74]\tvalid_0's auc: 0.818988\tvalid_0's binary_logloss: 0.458437\n", - "[75]\tvalid_0's auc: 0.819144\tvalid_0's binary_logloss: 0.457808\n", - "[76]\tvalid_0's auc: 0.819259\tvalid_0's binary_logloss: 0.457159\n", - "[77]\tvalid_0's auc: 0.819343\tvalid_0's binary_logloss: 0.456512\n", - "[78]\tvalid_0's auc: 0.81954\tvalid_0's binary_logloss: 0.456045\n", - "[79]\tvalid_0's auc: 0.819687\tvalid_0's binary_logloss: 0.455416\n", - "[80]\tvalid_0's auc: 0.819958\tvalid_0's binary_logloss: 0.454765\n", - "[81]\tvalid_0's auc: 0.820115\tvalid_0's binary_logloss: 0.45436\n", - "[82]\tvalid_0's auc: 0.820536\tvalid_0's binary_logloss: 0.453965\n", - "[83]\tvalid_0's auc: 0.820649\tvalid_0's binary_logloss: 0.453383\n", - "[84]\tvalid_0's auc: 0.820663\tvalid_0's binary_logloss: 0.452804\n", - "[85]\tvalid_0's auc: 0.820809\tvalid_0's binary_logloss: 0.452167\n", - "[86]\tvalid_0's auc: 0.821024\tvalid_0's binary_logloss: 0.451735\n", - "[87]\tvalid_0's auc: 0.821124\tvalid_0's binary_logloss: 0.451167\n", - "[88]\tvalid_0's auc: 0.821243\tvalid_0's binary_logloss: 0.45061\n", - "[89]\tvalid_0's auc: 0.821404\tvalid_0's binary_logloss: 0.450215\n", - "[90]\tvalid_0's auc: 0.821488\tvalid_0's binary_logloss: 0.449656\n", - "[91]\tvalid_0's auc: 0.821538\tvalid_0's binary_logloss: 0.449107\n", - "[92]\tvalid_0's auc: 0.82172\tvalid_0's binary_logloss: 0.448752\n", - "[93]\tvalid_0's auc: 0.821809\tvalid_0's binary_logloss: 0.448188\n", - "[94]\tvalid_0's auc: 0.82184\tvalid_0's binary_logloss: 0.447659\n", - "[95]\tvalid_0's auc: 0.821971\tvalid_0's binary_logloss: 0.447108\n", - "[96]\tvalid_0's auc: 0.822086\tvalid_0's binary_logloss: 0.446596\n", - "[97]\tvalid_0's auc: 0.82247\tvalid_0's binary_logloss: 0.446244\n", - "[98]\tvalid_0's auc: 0.822951\tvalid_0's 
binary_logloss: 0.445812\n", - "[99]\tvalid_0's auc: 0.822991\tvalid_0's binary_logloss: 0.445329\n", - "[100]\tvalid_0's auc: 0.823174\tvalid_0's binary_logloss: 0.445037\n", - "Did not meet early stopping. Best iteration is:\n", - "[100]\tvalid_0's auc: 0.823174\tvalid_0's binary_logloss: 0.445037\n", - "[1]\tvalid_0's auc: 0.769525\tvalid_0's binary_logloss: 0.526256\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's auc: 0.775857\tvalid_0's binary_logloss: 0.524594\n", - "[3]\tvalid_0's auc: 0.785307\tvalid_0's binary_logloss: 0.523606\n", - "[4]\tvalid_0's auc: 0.786356\tvalid_0's binary_logloss: 0.522495\n", - "[5]\tvalid_0's auc: 0.793385\tvalid_0's binary_logloss: 0.520812\n", - "[6]\tvalid_0's auc: 0.794014\tvalid_0's binary_logloss: 0.519253\n", - "[7]\tvalid_0's auc: 0.795454\tvalid_0's binary_logloss: 0.517961\n", - "[8]\tvalid_0's auc: 0.79807\tvalid_0's binary_logloss: 0.516363\n", - "[9]\tvalid_0's auc: 0.798756\tvalid_0's binary_logloss: 0.51548\n", - "[10]\tvalid_0's auc: 0.798314\tvalid_0's binary_logloss: 0.514021\n", - "[11]\tvalid_0's auc: 0.799343\tvalid_0's binary_logloss: 0.512678\n", - "[12]\tvalid_0's auc: 0.799573\tvalid_0's binary_logloss: 0.511708\n", - "[13]\tvalid_0's auc: 0.799563\tvalid_0's binary_logloss: 0.510892\n", - "[14]\tvalid_0's auc: 0.800333\tvalid_0's binary_logloss: 0.509532\n", - "[15]\tvalid_0's auc: 0.800672\tvalid_0's binary_logloss: 0.508117\n", - "[16]\tvalid_0's auc: 0.801953\tvalid_0's binary_logloss: 0.506866\n", - "[17]\tvalid_0's auc: 0.802078\tvalid_0's binary_logloss: 0.5055\n", - "[18]\tvalid_0's auc: 0.802449\tvalid_0's binary_logloss: 0.504358\n", - "[19]\tvalid_0's auc: 0.802329\tvalid_0's binary_logloss: 0.503503\n", - "[20]\tvalid_0's auc: 0.802437\tvalid_0's binary_logloss: 0.502233\n", - "[21]\tvalid_0's auc: 0.802653\tvalid_0's binary_logloss: 0.50174\n", - "[22]\tvalid_0's auc: 0.803753\tvalid_0's binary_logloss: 0.50056\n", - "[23]\tvalid_0's auc: 
0.803956\tvalid_0's binary_logloss: 0.499496\n", - "[24]\tvalid_0's auc: 0.804231\tvalid_0's binary_logloss: 0.498283\n", - "[25]\tvalid_0's auc: 0.804554\tvalid_0's binary_logloss: 0.497059\n", - "[26]\tvalid_0's auc: 0.805133\tvalid_0's binary_logloss: 0.495963\n", - "[27]\tvalid_0's auc: 0.805333\tvalid_0's binary_logloss: 0.494842\n", - "[28]\tvalid_0's auc: 0.805644\tvalid_0's binary_logloss: 0.493771\n", - "[29]\tvalid_0's auc: 0.806029\tvalid_0's binary_logloss: 0.492598\n", - "[30]\tvalid_0's auc: 0.806321\tvalid_0's binary_logloss: 0.491474\n", - "[31]\tvalid_0's auc: 0.806201\tvalid_0's binary_logloss: 0.490419\n", - "[32]\tvalid_0's auc: 0.806671\tvalid_0's binary_logloss: 0.489393\n", - "[33]\tvalid_0's auc: 0.806899\tvalid_0's binary_logloss: 0.488331\n", - "[34]\tvalid_0's auc: 0.807105\tvalid_0's binary_logloss: 0.487277\n", - "[35]\tvalid_0's auc: 0.807257\tvalid_0's binary_logloss: 0.486592\n", - "[36]\tvalid_0's auc: 0.80729\tvalid_0's binary_logloss: 0.485607\n", - "[37]\tvalid_0's auc: 0.807752\tvalid_0's binary_logloss: 0.484951\n", - "[38]\tvalid_0's auc: 0.808191\tvalid_0's binary_logloss: 0.484269\n", - "[39]\tvalid_0's auc: 0.808417\tvalid_0's binary_logloss: 0.483242\n", - "[40]\tvalid_0's auc: 0.808761\tvalid_0's binary_logloss: 0.482291\n", - "[41]\tvalid_0's auc: 0.80965\tvalid_0's binary_logloss: 0.48164\n", - "[42]\tvalid_0's auc: 0.810065\tvalid_0's binary_logloss: 0.480962\n", - "[43]\tvalid_0's auc: 0.810209\tvalid_0's binary_logloss: 0.479995\n", - "[44]\tvalid_0's auc: 0.810091\tvalid_0's binary_logloss: 0.479077\n", - "[45]\tvalid_0's auc: 0.810573\tvalid_0's binary_logloss: 0.478185\n", - "[46]\tvalid_0's auc: 0.810924\tvalid_0's binary_logloss: 0.477558\n", - "[47]\tvalid_0's auc: 0.810951\tvalid_0's binary_logloss: 0.476662\n", - "[48]\tvalid_0's auc: 0.811101\tvalid_0's binary_logloss: 0.475745\n", - "[49]\tvalid_0's auc: 0.811269\tvalid_0's binary_logloss: 0.474951\n", - "[50]\tvalid_0's auc: 0.81173\tvalid_0's 
binary_logloss: 0.474514\n", - "[51]\tvalid_0's auc: 0.811937\tvalid_0's binary_logloss: 0.474114\n", - "[52]\tvalid_0's auc: 0.812136\tvalid_0's binary_logloss: 0.473297\n", - "[53]\tvalid_0's auc: 0.812249\tvalid_0's binary_logloss: 0.472497\n", - "[54]\tvalid_0's auc: 0.812121\tvalid_0's binary_logloss: 0.471696\n", - "[55]\tvalid_0's auc: 0.812164\tvalid_0's binary_logloss: 0.470905\n", - "[56]\tvalid_0's auc: 0.812462\tvalid_0's binary_logloss: 0.470384\n", - "[57]\tvalid_0's auc: 0.812613\tvalid_0's binary_logloss: 0.4696\n", - "[58]\tvalid_0's auc: 0.812615\tvalid_0's binary_logloss: 0.468778\n", - "[59]\tvalid_0's auc: 0.812842\tvalid_0's binary_logloss: 0.468211\n", - "[60]\tvalid_0's auc: 0.81312\tvalid_0's binary_logloss: 0.467385\n", - "[61]\tvalid_0's auc: 0.813039\tvalid_0's binary_logloss: 0.466632\n", - "[62]\tvalid_0's auc: 0.812942\tvalid_0's binary_logloss: 0.465933\n", - "[63]\tvalid_0's auc: 0.813274\tvalid_0's binary_logloss: 0.465214\n", - "[64]\tvalid_0's auc: 0.813572\tvalid_0's binary_logloss: 0.464692\n", - "[65]\tvalid_0's auc: 0.813594\tvalid_0's binary_logloss: 0.463925\n", - "[66]\tvalid_0's auc: 0.813719\tvalid_0's binary_logloss: 0.463177\n", - "[67]\tvalid_0's auc: 0.814011\tvalid_0's binary_logloss: 0.462513\n", - "[68]\tvalid_0's auc: 0.813989\tvalid_0's binary_logloss: 0.461843\n" - ] + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:16.136151Z", + "start_time": "2020-11-18T04:21:16.124444Z" + } + }, + "outputs": [], + "source": [ + "# 排序模型定义\n", + "lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", + " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", + " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16) " + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[69]\tvalid_0's auc: 0.814218\tvalid_0's binary_logloss: 0.461443\n", - 
"[70]\tvalid_0's auc: 0.814334\tvalid_0's binary_logloss: 0.460775\n", - "[71]\tvalid_0's auc: 0.814493\tvalid_0's binary_logloss: 0.460332\n", - "[72]\tvalid_0's auc: 0.814663\tvalid_0's binary_logloss: 0.459867\n", - "[73]\tvalid_0's auc: 0.814856\tvalid_0's binary_logloss: 0.459266\n", - "[74]\tvalid_0's auc: 0.815017\tvalid_0's binary_logloss: 0.458585\n", - "[75]\tvalid_0's auc: 0.815186\tvalid_0's binary_logloss: 0.457958\n", - "[76]\tvalid_0's auc: 0.815374\tvalid_0's binary_logloss: 0.457316\n", - "[77]\tvalid_0's auc: 0.81554\tvalid_0's binary_logloss: 0.45665\n", - "[78]\tvalid_0's auc: 0.81569\tvalid_0's binary_logloss: 0.456217\n", - "[79]\tvalid_0's auc: 0.815861\tvalid_0's binary_logloss: 0.455615\n", - "[80]\tvalid_0's auc: 0.816443\tvalid_0's binary_logloss: 0.454895\n", - "[81]\tvalid_0's auc: 0.816659\tvalid_0's binary_logloss: 0.454503\n", - "[82]\tvalid_0's auc: 0.817017\tvalid_0's binary_logloss: 0.454149\n", - "[83]\tvalid_0's auc: 0.817162\tvalid_0's binary_logloss: 0.453578\n", - "[84]\tvalid_0's auc: 0.817274\tvalid_0's binary_logloss: 0.452984\n", - "[85]\tvalid_0's auc: 0.817283\tvalid_0's binary_logloss: 0.452416\n", - "[86]\tvalid_0's auc: 0.817339\tvalid_0's binary_logloss: 0.452022\n", - "[87]\tvalid_0's auc: 0.817494\tvalid_0's binary_logloss: 0.45146\n", - "[88]\tvalid_0's auc: 0.817594\tvalid_0's binary_logloss: 0.450926\n", - "[89]\tvalid_0's auc: 0.817771\tvalid_0's binary_logloss: 0.450553\n", - "[90]\tvalid_0's auc: 0.81789\tvalid_0's binary_logloss: 0.449985\n", - "[91]\tvalid_0's auc: 0.817931\tvalid_0's binary_logloss: 0.449439\n", - "[92]\tvalid_0's auc: 0.818138\tvalid_0's binary_logloss: 0.449094\n", - "[93]\tvalid_0's auc: 0.818334\tvalid_0's binary_logloss: 0.448527\n", - "[94]\tvalid_0's auc: 0.818426\tvalid_0's binary_logloss: 0.447989\n", - "[95]\tvalid_0's auc: 0.818676\tvalid_0's binary_logloss: 0.447407\n", - "[96]\tvalid_0's auc: 0.818852\tvalid_0's binary_logloss: 0.446884\n", - "[97]\tvalid_0's auc: 
0.81945\tvalid_0's binary_logloss: 0.446455\n", - "[98]\tvalid_0's auc: 0.819861\tvalid_0's binary_logloss: 0.446045\n", - "[99]\tvalid_0's auc: 0.819943\tvalid_0's binary_logloss: 0.445543\n", - "[100]\tvalid_0's auc: 0.820076\tvalid_0's binary_logloss: 0.445258\n", - "Did not meet early stopping. Best iteration is:\n", - "[100]\tvalid_0's auc: 0.820076\tvalid_0's binary_logloss: 0.445258\n", - "[1]\tvalid_0's auc: 0.770032\tvalid_0's binary_logloss: 0.527241\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's auc: 0.779881\tvalid_0's binary_logloss: 0.525545\n", - "[3]\tvalid_0's auc: 0.791308\tvalid_0's binary_logloss: 0.524508\n", - "[4]\tvalid_0's auc: 0.790788\tvalid_0's binary_logloss: 0.52341\n", - "[5]\tvalid_0's auc: 0.795645\tvalid_0's binary_logloss: 0.521753\n", - "[6]\tvalid_0's auc: 0.797745\tvalid_0's binary_logloss: 0.520131\n", - "[7]\tvalid_0's auc: 0.79931\tvalid_0's binary_logloss: 0.518872\n", - "[8]\tvalid_0's auc: 0.800014\tvalid_0's binary_logloss: 0.517353\n", - "[9]\tvalid_0's auc: 0.800549\tvalid_0's binary_logloss: 0.516487\n", - "[10]\tvalid_0's auc: 0.800261\tvalid_0's binary_logloss: 0.515039\n", - "[11]\tvalid_0's auc: 0.801261\tvalid_0's binary_logloss: 0.513695\n", - "[12]\tvalid_0's auc: 0.801062\tvalid_0's binary_logloss: 0.512735\n", - "[13]\tvalid_0's auc: 0.801155\tvalid_0's binary_logloss: 0.51192\n", - "[14]\tvalid_0's auc: 0.801315\tvalid_0's binary_logloss: 0.510559\n", - "[15]\tvalid_0's auc: 0.80185\tvalid_0's binary_logloss: 0.509147\n", - "[16]\tvalid_0's auc: 0.803029\tvalid_0's binary_logloss: 0.507914\n", - "[17]\tvalid_0's auc: 0.803035\tvalid_0's binary_logloss: 0.506583\n", - "[18]\tvalid_0's auc: 0.803433\tvalid_0's binary_logloss: 0.505441\n", - "[19]\tvalid_0's auc: 0.803717\tvalid_0's binary_logloss: 0.504599\n", - "[20]\tvalid_0's auc: 0.803819\tvalid_0's binary_logloss: 0.503327\n", - "[21]\tvalid_0's auc: 0.803923\tvalid_0's binary_logloss: 0.502782\n", - 
"[22]\tvalid_0's auc: 0.804939\tvalid_0's binary_logloss: 0.501596\n", - "[23]\tvalid_0's auc: 0.804707\tvalid_0's binary_logloss: 0.500572\n", - "[24]\tvalid_0's auc: 0.804632\tvalid_0's binary_logloss: 0.499367\n", - "[25]\tvalid_0's auc: 0.804756\tvalid_0's binary_logloss: 0.498161\n", - "[26]\tvalid_0's auc: 0.805067\tvalid_0's binary_logloss: 0.497061\n", - "[27]\tvalid_0's auc: 0.805119\tvalid_0's binary_logloss: 0.495933\n", - "[28]\tvalid_0's auc: 0.805304\tvalid_0's binary_logloss: 0.494849\n", - "[29]\tvalid_0's auc: 0.805688\tvalid_0's binary_logloss: 0.493677\n", - "[30]\tvalid_0's auc: 0.805822\tvalid_0's binary_logloss: 0.492594\n", - "[31]\tvalid_0's auc: 0.805869\tvalid_0's binary_logloss: 0.49152\n", - "[32]\tvalid_0's auc: 0.807267\tvalid_0's binary_logloss: 0.490435\n", - "[33]\tvalid_0's auc: 0.807301\tvalid_0's binary_logloss: 0.489392\n", - "[34]\tvalid_0's auc: 0.80736\tvalid_0's binary_logloss: 0.488325\n", - "[35]\tvalid_0's auc: 0.807706\tvalid_0's binary_logloss: 0.487654\n", - "[36]\tvalid_0's auc: 0.807758\tvalid_0's binary_logloss: 0.486651\n", - "[37]\tvalid_0's auc: 0.808051\tvalid_0's binary_logloss: 0.486012\n", - "[38]\tvalid_0's auc: 0.808429\tvalid_0's binary_logloss: 0.485355\n", - "[39]\tvalid_0's auc: 0.808663\tvalid_0's binary_logloss: 0.484327\n", - "[40]\tvalid_0's auc: 0.809007\tvalid_0's binary_logloss: 0.483386\n", - "[41]\tvalid_0's auc: 0.809781\tvalid_0's binary_logloss: 0.482745\n", - "[42]\tvalid_0's auc: 0.810071\tvalid_0's binary_logloss: 0.482124\n", - "[43]\tvalid_0's auc: 0.810383\tvalid_0's binary_logloss: 0.481154\n", - "[44]\tvalid_0's auc: 0.810446\tvalid_0's binary_logloss: 0.480243\n", - "[45]\tvalid_0's auc: 0.811148\tvalid_0's binary_logloss: 0.479261\n", - "[46]\tvalid_0's auc: 0.811245\tvalid_0's binary_logloss: 0.478687\n", - "[47]\tvalid_0's auc: 0.811214\tvalid_0's binary_logloss: 0.477812\n", - "[48]\tvalid_0's auc: 0.811408\tvalid_0's binary_logloss: 0.47689\n", - "[49]\tvalid_0's auc: 
0.811486\tvalid_0's binary_logloss: 0.476132\n", - "[50]\tvalid_0's auc: 0.811806\tvalid_0's binary_logloss: 0.475718\n", - "[51]\tvalid_0's auc: 0.812017\tvalid_0's binary_logloss: 0.475342\n", - "[52]\tvalid_0's auc: 0.812255\tvalid_0's binary_logloss: 0.474505\n", - "[53]\tvalid_0's auc: 0.812249\tvalid_0's binary_logloss: 0.473707\n", - "[54]\tvalid_0's auc: 0.812235\tvalid_0's binary_logloss: 0.47289\n", - "[55]\tvalid_0's auc: 0.812233\tvalid_0's binary_logloss: 0.472091\n", - "[56]\tvalid_0's auc: 0.812492\tvalid_0's binary_logloss: 0.471563\n", - "[57]\tvalid_0's auc: 0.812579\tvalid_0's binary_logloss: 0.47077\n", - "[58]\tvalid_0's auc: 0.812598\tvalid_0's binary_logloss: 0.469992\n", - "[59]\tvalid_0's auc: 0.812885\tvalid_0's binary_logloss: 0.469458\n", - "[60]\tvalid_0's auc: 0.812995\tvalid_0's binary_logloss: 0.468676\n", - "[61]\tvalid_0's auc: 0.812961\tvalid_0's binary_logloss: 0.467939\n", - "[62]\tvalid_0's auc: 0.812919\tvalid_0's binary_logloss: 0.467232\n", - "[63]\tvalid_0's auc: 0.813291\tvalid_0's binary_logloss: 0.466491\n", - "[64]\tvalid_0's auc: 0.813702\tvalid_0's binary_logloss: 0.465945\n", - "[65]\tvalid_0's auc: 0.813803\tvalid_0's binary_logloss: 0.465197\n", - "[66]\tvalid_0's auc: 0.813851\tvalid_0's binary_logloss: 0.4645\n", - "[67]\tvalid_0's auc: 0.814011\tvalid_0's binary_logloss: 0.463814\n", - "[68]\tvalid_0's auc: 0.814027\tvalid_0's binary_logloss: 0.463113\n", - "[69]\tvalid_0's auc: 0.814138\tvalid_0's binary_logloss: 0.462727\n", - "[70]\tvalid_0's auc: 0.814365\tvalid_0's binary_logloss: 0.462077\n", - "[71]\tvalid_0's auc: 0.814432\tvalid_0's binary_logloss: 0.461655\n", - "[72]\tvalid_0's auc: 0.8146\tvalid_0's binary_logloss: 0.461194\n", - "[73]\tvalid_0's auc: 0.815324\tvalid_0's binary_logloss: 0.460477\n", - "[74]\tvalid_0's auc: 0.815411\tvalid_0's binary_logloss: 0.459805\n", - "[75]\tvalid_0's auc: 0.815548\tvalid_0's binary_logloss: 0.459189\n", - "[76]\tvalid_0's auc: 0.815625\tvalid_0's 
binary_logloss: 0.458525\n", - "[77]\tvalid_0's auc: 0.81562\tvalid_0's binary_logloss: 0.457905\n", - "[78]\tvalid_0's auc: 0.815786\tvalid_0's binary_logloss: 0.45747\n", - "[79]\tvalid_0's auc: 0.815834\tvalid_0's binary_logloss: 0.456884\n", - "[80]\tvalid_0's auc: 0.816475\tvalid_0's binary_logloss: 0.45617\n", - "[81]\tvalid_0's auc: 0.816677\tvalid_0's binary_logloss: 0.455787\n", - "[82]\tvalid_0's auc: 0.817255\tvalid_0's binary_logloss: 0.455358\n", - "[83]\tvalid_0's auc: 0.817383\tvalid_0's binary_logloss: 0.454775\n", - "[84]\tvalid_0's auc: 0.817509\tvalid_0's binary_logloss: 0.454176\n", - "[85]\tvalid_0's auc: 0.817572\tvalid_0's binary_logloss: 0.453609\n", - "[86]\tvalid_0's auc: 0.817721\tvalid_0's binary_logloss: 0.453213\n", - "[87]\tvalid_0's auc: 0.817992\tvalid_0's binary_logloss: 0.452586\n", - "[88]\tvalid_0's auc: 0.81808\tvalid_0's binary_logloss: 0.45204\n", - "[89]\tvalid_0's auc: 0.818202\tvalid_0's binary_logloss: 0.451643\n", - "[90]\tvalid_0's auc: 0.818336\tvalid_0's binary_logloss: 0.451081\n", - "[91]\tvalid_0's auc: 0.818347\tvalid_0's binary_logloss: 0.450531\n", - "[92]\tvalid_0's auc: 0.818558\tvalid_0's binary_logloss: 0.450179\n", - "[93]\tvalid_0's auc: 0.818743\tvalid_0's binary_logloss: 0.449647\n", - "[94]\tvalid_0's auc: 0.818789\tvalid_0's binary_logloss: 0.449133\n", - "[95]\tvalid_0's auc: 0.818849\tvalid_0's binary_logloss: 0.44862\n", - "[96]\tvalid_0's auc: 0.81913\tvalid_0's binary_logloss: 0.448072\n", - "[97]\tvalid_0's auc: 0.819526\tvalid_0's binary_logloss: 0.447713\n", - "[98]\tvalid_0's auc: 0.819971\tvalid_0's binary_logloss: 0.447296\n", - "[99]\tvalid_0's auc: 0.819972\tvalid_0's binary_logloss: 0.446814\n" - ] + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:22.965433Z", + "start_time": "2020-11-18T04:21:17.799127Z" + } + }, + "outputs": [], + "source": [ + "# 排序模型训练\n", + "if offline:\n", + " 
lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'], group=g_train,\n", + " eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], \n", + " eval_group= [g_val], eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )\n", + "else:\n", + " lgb_ranker.fit(trn_user_item_feats_df[lgb_cols], trn_user_item_feats_df['label'], group=g_train)" + ] }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "[100]\tvalid_0's auc: 0.820086\tvalid_0's binary_logloss: 0.446533\n", - "Did not meet early stopping. Best iteration is:\n", - "[100]\tvalid_0's auc: 0.820086\tvalid_0's binary_logloss: 0.446533\n", - "[1]\tvalid_0's auc: 0.768646\tvalid_0's binary_logloss: 0.527167\n", - "Training until validation scores don't improve for 50 rounds\n", - "[2]\tvalid_0's auc: 0.779902\tvalid_0's binary_logloss: 0.525481\n", - "[3]\tvalid_0's auc: 0.789868\tvalid_0's binary_logloss: 0.524485\n", - "[4]\tvalid_0's auc: 0.791895\tvalid_0's binary_logloss: 0.523382\n", - "[5]\tvalid_0's auc: 0.795453\tvalid_0's binary_logloss: 0.521759\n", - "[6]\tvalid_0's auc: 0.796672\tvalid_0's binary_logloss: 0.520166\n", - "[7]\tvalid_0's auc: 0.798023\tvalid_0's binary_logloss: 0.518857\n", - "[8]\tvalid_0's auc: 0.799331\tvalid_0's binary_logloss: 0.517297\n", - "[9]\tvalid_0's auc: 0.800181\tvalid_0's binary_logloss: 0.516416\n", - "[10]\tvalid_0's auc: 0.800373\tvalid_0's binary_logloss: 0.514967\n", - "[11]\tvalid_0's auc: 0.801087\tvalid_0's binary_logloss: 0.513631\n", - "[12]\tvalid_0's auc: 0.801122\tvalid_0's binary_logloss: 0.512658\n", - "[13]\tvalid_0's auc: 0.801043\tvalid_0's binary_logloss: 0.511833\n", - "[14]\tvalid_0's auc: 0.801238\tvalid_0's binary_logloss: 0.510461\n", - "[15]\tvalid_0's auc: 0.801847\tvalid_0's binary_logloss: 0.509034\n", - "[16]\tvalid_0's auc: 0.803139\tvalid_0's binary_logloss: 0.507759\n", - "[17]\tvalid_0's auc: 0.803577\tvalid_0's 
binary_logloss: 0.506361\n", - "[18]\tvalid_0's auc: 0.803834\tvalid_0's binary_logloss: 0.505229\n", - "[19]\tvalid_0's auc: 0.803943\tvalid_0's binary_logloss: 0.504371\n", - "[20]\tvalid_0's auc: 0.80415\tvalid_0's binary_logloss: 0.503102\n", - "[21]\tvalid_0's auc: 0.804446\tvalid_0's binary_logloss: 0.502564\n", - "[22]\tvalid_0's auc: 0.805163\tvalid_0's binary_logloss: 0.501396\n", - "[23]\tvalid_0's auc: 0.805323\tvalid_0's binary_logloss: 0.500327\n", - "[24]\tvalid_0's auc: 0.805314\tvalid_0's binary_logloss: 0.499123\n", - "[25]\tvalid_0's auc: 0.80535\tvalid_0's binary_logloss: 0.497927\n", - "[26]\tvalid_0's auc: 0.805864\tvalid_0's binary_logloss: 0.496834\n", - "[27]\tvalid_0's auc: 0.805919\tvalid_0's binary_logloss: 0.495667\n", - "[28]\tvalid_0's auc: 0.806272\tvalid_0's binary_logloss: 0.494606\n", - "[29]\tvalid_0's auc: 0.806599\tvalid_0's binary_logloss: 0.49343\n", - "[30]\tvalid_0's auc: 0.806932\tvalid_0's binary_logloss: 0.492303\n", - "[31]\tvalid_0's auc: 0.806656\tvalid_0's binary_logloss: 0.491249\n", - "[32]\tvalid_0's auc: 0.807436\tvalid_0's binary_logloss: 0.490188\n", - "[33]\tvalid_0's auc: 0.807629\tvalid_0's binary_logloss: 0.489117\n", - "[34]\tvalid_0's auc: 0.807501\tvalid_0's binary_logloss: 0.48808\n", - "[35]\tvalid_0's auc: 0.807885\tvalid_0's binary_logloss: 0.487383\n", - "[36]\tvalid_0's auc: 0.807921\tvalid_0's binary_logloss: 0.48636\n", - "[37]\tvalid_0's auc: 0.808267\tvalid_0's binary_logloss: 0.485724\n", - "[38]\tvalid_0's auc: 0.808563\tvalid_0's binary_logloss: 0.485076\n", - "[39]\tvalid_0's auc: 0.808813\tvalid_0's binary_logloss: 0.484039\n", - "[40]\tvalid_0's auc: 0.809023\tvalid_0's binary_logloss: 0.483091\n", - "[41]\tvalid_0's auc: 0.809782\tvalid_0's binary_logloss: 0.482441\n", - "[42]\tvalid_0's auc: 0.810135\tvalid_0's binary_logloss: 0.48179\n", - "[43]\tvalid_0's auc: 0.810219\tvalid_0's binary_logloss: 0.48082\n", - "[44]\tvalid_0's auc: 0.81031\tvalid_0's binary_logloss: 0.479906\n", - 
"[45]\tvalid_0's auc: 0.810514\tvalid_0's binary_logloss: 0.479024\n", - "[46]\tvalid_0's auc: 0.810566\tvalid_0's binary_logloss: 0.478437\n", - "[47]\tvalid_0's auc: 0.810611\tvalid_0's binary_logloss: 0.477529\n", - "[48]\tvalid_0's auc: 0.810781\tvalid_0's binary_logloss: 0.476637\n", - "[49]\tvalid_0's auc: 0.81089\tvalid_0's binary_logloss: 0.475883\n", - "[50]\tvalid_0's auc: 0.811266\tvalid_0's binary_logloss: 0.475459\n", - "[51]\tvalid_0's auc: 0.811402\tvalid_0's binary_logloss: 0.475078\n", - "[52]\tvalid_0's auc: 0.811765\tvalid_0's binary_logloss: 0.474246\n", - "[53]\tvalid_0's auc: 0.811891\tvalid_0's binary_logloss: 0.473452\n", - "[54]\tvalid_0's auc: 0.811868\tvalid_0's binary_logloss: 0.47263\n", - "[55]\tvalid_0's auc: 0.81192\tvalid_0's binary_logloss: 0.471804\n", - "[56]\tvalid_0's auc: 0.812272\tvalid_0's binary_logloss: 0.471275\n", - "[57]\tvalid_0's auc: 0.812639\tvalid_0's binary_logloss: 0.470396\n", - "[58]\tvalid_0's auc: 0.812764\tvalid_0's binary_logloss: 0.469597\n", - "[59]\tvalid_0's auc: 0.813084\tvalid_0's binary_logloss: 0.469049\n", - "[60]\tvalid_0's auc: 0.813342\tvalid_0's binary_logloss: 0.468244\n", - "[61]\tvalid_0's auc: 0.813302\tvalid_0's binary_logloss: 0.467499\n", - "[62]\tvalid_0's auc: 0.813221\tvalid_0's binary_logloss: 0.466758\n", - "[63]\tvalid_0's auc: 0.813697\tvalid_0's binary_logloss: 0.466017\n", - "[64]\tvalid_0's auc: 0.813985\tvalid_0's binary_logloss: 0.465501\n", - "[65]\tvalid_0's auc: 0.81416\tvalid_0's binary_logloss: 0.464725\n", - "[66]\tvalid_0's auc: 0.814227\tvalid_0's binary_logloss: 0.46398\n", - "[67]\tvalid_0's auc: 0.814397\tvalid_0's binary_logloss: 0.463309\n", - "[68]\tvalid_0's auc: 0.814426\tvalid_0's binary_logloss: 0.462627\n", - "[69]\tvalid_0's auc: 0.814593\tvalid_0's binary_logloss: 0.462244\n", - "[70]\tvalid_0's auc: 0.814789\tvalid_0's binary_logloss: 0.461571\n", - "[71]\tvalid_0's auc: 0.814889\tvalid_0's binary_logloss: 0.461144\n", - "[72]\tvalid_0's auc: 
0.815078\tvalid_0's binary_logloss: 0.460684\n", - "[73]\tvalid_0's auc: 0.815439\tvalid_0's binary_logloss: 0.460063\n", - "[74]\tvalid_0's auc: 0.815511\tvalid_0's binary_logloss: 0.459386\n", - "[75]\tvalid_0's auc: 0.815574\tvalid_0's binary_logloss: 0.45877\n", - "[76]\tvalid_0's auc: 0.815634\tvalid_0's binary_logloss: 0.458128\n", - "[77]\tvalid_0's auc: 0.815618\tvalid_0's binary_logloss: 0.457495\n", - "[78]\tvalid_0's auc: 0.81582\tvalid_0's binary_logloss: 0.457057\n", - "[79]\tvalid_0's auc: 0.81594\tvalid_0's binary_logloss: 0.456475\n", - "[80]\tvalid_0's auc: 0.815961\tvalid_0's binary_logloss: 0.455885\n", - "[81]\tvalid_0's auc: 0.816153\tvalid_0's binary_logloss: 0.455511\n", - "[82]\tvalid_0's auc: 0.816433\tvalid_0's binary_logloss: 0.455186\n", - "[83]\tvalid_0's auc: 0.816546\tvalid_0's binary_logloss: 0.454625\n", - "[84]\tvalid_0's auc: 0.816586\tvalid_0's binary_logloss: 0.454039\n", - "[85]\tvalid_0's auc: 0.816584\tvalid_0's binary_logloss: 0.453482\n", - "[86]\tvalid_0's auc: 0.816881\tvalid_0's binary_logloss: 0.453048\n", - "[87]\tvalid_0's auc: 0.817029\tvalid_0's binary_logloss: 0.452485\n", - "[88]\tvalid_0's auc: 0.81707\tvalid_0's binary_logloss: 0.451941\n", - "[89]\tvalid_0's auc: 0.817298\tvalid_0's binary_logloss: 0.451544\n", - "[90]\tvalid_0's auc: 0.817343\tvalid_0's binary_logloss: 0.450975\n", - "[91]\tvalid_0's auc: 0.817357\tvalid_0's binary_logloss: 0.450422\n", - "[92]\tvalid_0's auc: 0.817592\tvalid_0's binary_logloss: 0.450109\n", - "[93]\tvalid_0's auc: 0.817729\tvalid_0's binary_logloss: 0.449542\n", - "[94]\tvalid_0's auc: 0.817834\tvalid_0's binary_logloss: 0.448982\n", - "[95]\tvalid_0's auc: 0.81809\tvalid_0's binary_logloss: 0.448398\n", - "[96]\tvalid_0's auc: 0.818269\tvalid_0's binary_logloss: 0.447908\n", - "[97]\tvalid_0's auc: 0.818682\tvalid_0's binary_logloss: 0.447547\n", - "[98]\tvalid_0's auc: 0.819015\tvalid_0's binary_logloss: 0.447165\n", - "[99]\tvalid_0's auc: 0.819016\tvalid_0's 
binary_logloss: 0.446669\n", - "[100]\tvalid_0's auc: 0.819127\tvalid_0's binary_logloss: 0.446397\n", - "Did not meet early stopping. Best iteration is:\n", - "[100]\tvalid_0's auc: 0.819127\tvalid_0's binary_logloss: 0.446397\n" - ] - } - ], - "source": [ - "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", - "# 这一部分与前面的单独训练和验证是分开的\n", - "def get_kfold_users(trn_df, n=5):\n", - " user_ids = trn_df['user_id'].unique()\n", - " user_set = [user_ids[i::n] for i in range(n)]\n", - " return user_set\n", - "\n", - "k_fold = 5\n", - "trn_df = trn_user_item_feats_df_rank_model\n", - "user_set = get_kfold_users(trn_df, n=k_fold)\n", - "\n", - "score_list = []\n", - "score_df = trn_df[['user_id', 'click_article_id', 'label']]\n", - "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", - "\n", - "# 五折交叉验证,并将中间结果保存用于staking\n", - "for n_fold, valid_user in enumerate(user_set):\n", - " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", - " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", - " \n", - " # 模型及参数的定义\n", - " lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", - " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", - " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10) \n", - " # 训练模型\n", - " lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], \n", - " eval_metric=['auc', ],early_stopping_rounds=50, )\n", - " \n", - " # 预测验证集结果\n", - " valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols], \n", - " num_iteration=lgb_Classfication.best_iteration_)[:,1]\n", - " \n", - " # 对输出结果进行归一化 分类模型输出的值本身就是一个概率值不需要进行归一化\n", - " # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))\n", - " \n", - " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", - " valid_idx['pred_rank'] = 
valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - " \n", - " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", - " score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", - " \n", - " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", - " if not offline:\n", - " sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols], \n", - " num_iteration=lgb_Classfication.best_iteration_)[:,1]\n", - " \n", - "score_df_ = pd.concat(score_list, axis=0)\n", - "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n", - "# 保存训练集交叉验证产生的新特征\n", - "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)\n", - " \n", - "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", - "tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold\n", - "tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))\n", - "tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])\n", - "tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - "\n", - "# 保存测试集交叉验证的新特征\n", - "tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:24:23.074237Z", - "start_time": "2020-11-18T04:24:13.812284Z" - } - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]\n", - "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", - "submit(rank_results, topk=5, 
model_name='lgb_cls')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DIN模型" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 用户的历史点击行为列表\n", - "这个是为后面的DIN模型服务的" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:24:30.508213Z", - "start_time": "2020-11-18T04:24:27.426372Z" - } - }, - "outputs": [], - "source": [ - "if offline:\n", - " all_data = pd.read_csv('./data_raw/train_click_log.csv')\n", - "else:\n", - " trn_data = pd.read_csv('./data_raw/train_click_log.csv')\n", - " tst_data = pd.read_csv('./data_raw/testA_click_log.csv')\n", - " all_data = trn_data.append(tst_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:25:28.082071Z", - "start_time": "2020-11-18T04:24:33.649524Z" - } - }, - "outputs": [], - "source": [ - "hist_click =all_data[['user_id', 'click_article_id']].groupby('user_id').agg({list}).reset_index()\n", - "his_behavior_df = pd.DataFrame()\n", - "his_behavior_df['user_id'] = hist_click['user_id']\n", - "his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:25:52.925866Z", - "start_time": "2020-11-18T04:25:52.863922Z" - } - }, - "outputs": [], - "source": [ - "trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()\n", - "\n", - "if offline:\n", - " val_user_item_feats_df_din_model = val_user_item_feats_df.copy()\n", - "else: \n", - " val_user_item_feats_df_din_model = None\n", - " \n", - "tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-11-18T04:26:00.070681Z", - "start_time": "2020-11-18T04:25:56.417197Z" - } - }, - "outputs": [], - "source": [ - "trn_user_item_feats_df_din_model = trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')\n", - "\n", - "if offline:\n", - " val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')\n", - "else:\n", - " val_user_item_feats_df_din_model = None\n", - "\n", - "tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### DIN模型简介\n", - "我们下面尝试使用DIN模型, DIN的全称是Deep Interest Network, 这是阿里2018年基于前面的深度学习模型无法表达用户多样化的兴趣而提出的一个模型, 它可以通过考虑【给定的候选广告】和【用户的历史行为】的相关性,来计算用户兴趣的表示向量。具体来说就是通过引入局部激活单元,通过软搜索历史行为的相关部分来关注相关的用户兴趣,并采用加权和来获得有关候选广告的用户兴趣的表示。与候选广告相关性较高的行为会获得较高的激活权重,并支配着用户兴趣。该表示向量在不同广告上有所不同,大大提高了模型的表达能力。所以该模型对于此次新闻推荐的任务也比较适合, 我们在这里通过当前的候选文章与用户历史点击文章的相关性来计算用户对于文章的兴趣。 该模型的结构如下:\n", - "\n", - "![image-20201116201646983](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201116201646983.png)\n", - "\n", - "\n", - "我们这里直接调包来使用这个模型, 关于这个模型的详细细节部分我们会在下一期的推荐系统组队学习中给出。下面说一下该模型如何具体使用:deepctr的函数原型如下:\n", - "> def DIN(dnn_feature_columns, history_feature_list, dnn_use_bn=False,\n", - "> dnn_hidden_units=(200, 80), dnn_activation='relu', att_hidden_size=(80, 40), att_activation=\"dice\",\n", - "> att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024,\n", - "> task='binary'):\n", - "> \n", - "> * dnn_feature_columns: 特征列, 包含数据所有特征的列表\n", - "> * history_feature_list: 用户历史行为列, 反应用户历史行为的特征的列表\n", - "> * dnn_use_bn: 是否使用BatchNormalization\n", - "> * dnn_hidden_units: 全连接层网络的层数和每一层神经元的个数, 一个列表或者元组\n", - "> * dnn_activation_relu: 全连接网络的激活单元类型\n", - "> * att_hidden_size: 注意力层的全连接网络的层数和每一层神经元的个数\n", - "> * att_activation: 注意力层的激活单元类型\n", - "> * 
att_weight_normalization: 是否归一化注意力得分\n", - "> * l2_reg_dnn: 全连接网络的正则化系数\n", - "> * l2_reg_embedding: embedding向量的正则化稀疏\n", - "> * dnn_dropout: 全连接网络的神经元的失活概率\n", - "> * task: 任务, 可以是分类, 也可是是回归\n", - "\n", - "在具体使用的时候, 我们必须要传入特征列和历史行为列, 但是再传入之前, 我们需要进行一下特征列的预处理。具体如下:\n", - "\n", - "1. 首先,我们要处理数据集, 得到数据, 由于我们是基于用户过去的行为去预测用户是否点击当前文章, 所以我们需要把数据的特征列划分成数值型特征, 离散型特征和历史行为特征列三部分, 对于每一部分, DIN模型的处理会有不同\n", - " 1. 对于离散型特征, 在我们的数据集中就是那些类别型的特征, 比如user_id这种, 这种类别型特征, 我们首先要经过embedding处理得到每个特征的低维稠密型表示, 既然要经过embedding, 那么我们就需要为每一列的类别特征的取值建立一个字典,并指明embedding维度, 所以在使用deepctr的DIN模型准备数据的时候, 我们需要通过SparseFeat函数指明这些类别型特征, 这个函数的传入参数就是列名, 列的唯一取值(建立字典用)和embedding维度。\n", - " 2. 对于用户历史行为特征列, 比如文章id, 文章的类别等这种, 同样的我们需要先经过embedding处理, 只不过和上面不一样的地方是,对于这种特征, 我们在得到每个特征的embedding表示之后, 还需要通过一个Attention_layer计算用户的历史行为和当前候选文章的相关性以此得到当前用户的embedding向量, 这个向量就可以基于当前的候选文章与用户过去点击过得历史文章的相似性的程度来反应用户的兴趣, 并且随着用户的不同的历史点击来变化,去动态的模拟用户兴趣的变化过程。这类特征对于每个用户都是一个历史行为序列, 对于每个用户, 历史行为序列长度会不一样, 可能有的用户点击的历史文章多,有的点击的历史文章少, 所以我们还需要把这个长度统一起来, 在为DIN模型准备数据的时候, 我们首先要通过SparseFeat函数指明这些类别型特征, 然后还需要通过VarLenSparseFeat函数再进行序列填充, 使得每个用户的历史序列一样长, 所以这个函数参数中会有个maxlen,来指明序列的最大长度是多少。\n", - " 3. 对于连续型特征列, 我们只需要用DenseFeat函数来指明列名和维度即可。\n", - "2. 
处理完特征列之后, 我们把相应的数据与列进行对应,就得到了最后的数据。\n", - "\n", - "下面根据具体的代码感受一下, 逻辑是这样, 首先我们需要写一个数据准备函数, 在这里面就是根据上面的具体步骤准备数据, 得到数据和特征列, 然后就是建立DIN模型并训练, 最后基于模型进行测试。" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:08.405211Z", - "start_time": "2020-11-18T04:26:04.887013Z" - } - }, - "outputs": [], - "source": [ - "# 导入deepctr\n", - "from deepctr.models import DIN\n", - "from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names\n", - "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", - "\n", - "from tensorflow.keras import backend as K\n", - "from tensorflow.keras.layers import *\n", - "from tensorflow.keras.models import *\n", - "from tensorflow.keras.callbacks import * \n", - "import tensorflow as tf\n", - "\n", - "import os\n", - "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", - "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:13.485712Z", - "start_time": "2020-11-18T04:26:13.476042Z" - } - }, - "outputs": [], - "source": [ - "# 数据准备函数\n", - "def get_din_feats_columns(df, dense_fea, sparse_fea, behavior_fea, his_behavior_fea, emb_dim=32, max_len=100):\n", - " \"\"\"\n", - " 数据准备函数:\n", - " df: 数据集\n", - " dense_fea: 数值型特征列\n", - " sparse_fea: 离散型特征列\n", - " behavior_fea: 用户的候选行为特征列\n", - " his_behavior_fea: 用户的历史行为特征列\n", - " embedding_dim: embedding的维度, 这里为了简单, 统一把离散型特征列采用一样的隐向量维度\n", - " max_len: 用户序列的最大长度\n", - " \"\"\"\n", - " \n", - " sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1, embedding_dim=emb_dim) for feat in sparse_fea]\n", - " \n", - " dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]\n", - " \n", - " var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['click_article_id'].nunique() + 1,\n", - " embedding_dim=emb_dim, 
embedding_name='click_article_id'), maxlen=max_len) for feat in hist_behavior_fea]\n", - " \n", - " dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns\n", - " \n", - " # 建立x, x是一个字典的形式\n", - " x = {}\n", - " for name in get_feature_names(dnn_feature_columns):\n", - " if name in his_behavior_fea:\n", - " # 这是历史行为序列\n", - " his_list = [l for l in df[name]]\n", - " x[name] = pad_sequences(his_list, maxlen=max_len, padding='post') # 二维数组\n", - " else:\n", - " x[name] = df[name].values\n", - " \n", - " return x, dnn_feature_columns" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:18.783217Z", - "start_time": "2020-11-18T04:26:18.776795Z" - } - }, - "outputs": [], - "source": [ - "# 把特征分开\n", - "sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup', \n", - " 'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']\n", - "\n", - "behavior_fea = ['click_article_id']\n", - "\n", - "hist_behavior_fea = ['hist_click_article_id']\n", - "\n", - "dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',\n", - " 'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',\n", - " 'words_hbo','words_count']" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:25.469810Z", - "start_time": "2020-11-18T04:26:24.779347Z" - } - }, - "outputs": [], - "source": [ - "# dense特征进行归一化, 神经网络训练都需要将数值进行归一化处理\n", - "mm = MinMaxScaler()\n", - "\n", - "# 下面是做一些特殊处理,当在其他的地方出现无效值的时候,不处理无法进行归一化,刚开始可以先把他注释掉,在运行了下面的代码\n", - "# 之后如果发现报错,应该先去想办法处理如何不出现inf之类的值\n", - "# trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)\n", - "# tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)\n", - "\n", - "for feat in dense_fea:\n", - 
" trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]])\n", - " \n", - " if val_user_item_feats_df_din_model is not None:\n", - " val_user_item_feats_df_din_model[feat] = mm.fit_transform(val_user_item_feats_df_din_model[[feat]])\n", - " \n", - " tst_user_item_feats_df_din_model[feat] = mm.fit_transform(tst_user_item_feats_df_din_model[[feat]])" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:36.727753Z", - "start_time": "2020-11-18T04:26:28.854705Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:28.616665Z", + "start_time": "2020-11-18T04:21:24.672280Z" + } + }, + "outputs": [], + "source": [ + "# 模型预测\n", + "tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)\n", + "\n", + "# 将这里的排序结果保存一份,用户后面的模型融合\n", + "tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_ranker_score.csv', index=False)" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:143: calling RandomNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n" - ] - } - ], - "source": [ - "# 准备训练数据\n", - "x_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", - "y_trn = trn_user_item_feats_df_din_model['label'].values\n", - "\n", - "if offline:\n", - " # 准备验证数据\n", - " x_val, dnn_feature_columns = 
get_din_feats_columns(val_user_item_feats_df_din_model, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", - " y_val = val_user_item_feats_df_din_model['label'].values\n", - " \n", - "dense_fea = [x for x in dense_fea if x != 'label']\n", - "x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:26:45.146318Z", - "start_time": "2020-11-18T04:26:40.423914Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:21:40.253692Z", + "start_time": "2020-11-18T04:21:30.546587Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]\n", + "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", + "submit(rank_results, topk=5, model_name='lgb_ranker')" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1288: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Call initializer instance with the dtype argument instead of passing it to the constructor\n", - "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:255: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", - "Model: \"model\"\n", - 
"__________________________________________________________________________________________________\n", - "Layer (type) Output Shape Param # Connected to \n", - "==================================================================================================\n", - "user_id (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_article_id (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "category_id (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_environment (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_deviceGroup (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_os (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_country (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_region (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_referrer_type (InputLayer [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "is_cat_hab (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_user_id (Embedding) (None, 1, 32) 1600032 user_id[0][0] \n", - "__________________________________________________________________________________________________\n", - 
"sparse_seq_emb_hist_click_artic multiple 525664 click_article_id[0][0] \n", - " hist_click_article_id[0][0] \n", - " click_article_id[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_category_id (Embeddi (None, 1, 32) 7776 category_id[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_environment (E (None, 1, 32) 128 click_environment[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_deviceGroup (E (None, 1, 32) 160 click_deviceGroup[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_os (Embedding) (None, 1, 32) 288 click_os[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_country (Embed (None, 1, 32) 384 click_country[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_region (Embedd (None, 1, 32) 928 click_region[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_click_referrer_type (None, 1, 32) 256 click_referrer_type[0][0] \n", - "__________________________________________________________________________________________________\n", - "sparse_emb_is_cat_hab (Embeddin (None, 1, 32) 64 is_cat_hab[0][0] \n", - "__________________________________________________________________________________________________\n", - "no_mask (NoMask) (None, 1, 32) 0 sparse_emb_user_id[0][0] \n", - " sparse_seq_emb_hist_click_article\n", - " sparse_emb_category_id[0][0] \n", - " sparse_emb_click_environment[0][0\n", - " sparse_emb_click_deviceGroup[0][0\n", - " sparse_emb_click_os[0][0] \n", - " 
sparse_emb_click_country[0][0] \n", - " sparse_emb_click_region[0][0] \n", - " sparse_emb_click_referrer_type[0]\n", - " sparse_emb_is_cat_hab[0][0] \n", - "__________________________________________________________________________________________________\n", - "hist_click_article_id (InputLay [(None, 50)] 0 \n", - "__________________________________________________________________________________________________\n", - "concatenate (Concatenate) (None, 1, 320) 0 no_mask[0][0] \n", - " no_mask[1][0] \n", - " no_mask[2][0] \n", - " no_mask[3][0] \n", - " no_mask[4][0] \n", - " no_mask[5][0] \n", - " no_mask[6][0] \n", - " no_mask[7][0] \n", - " no_mask[8][0] \n", - " no_mask[9][0] \n", - "__________________________________________________________________________________________________\n", - "no_mask_1 (NoMask) (None, 1, 320) 0 concatenate[0][0] \n", - "__________________________________________________________________________________________________\n", - "attention_sequence_pooling_laye (None, 1, 32) 13961 sparse_seq_emb_hist_click_article\n", - " sparse_seq_emb_hist_click_article\n", - "__________________________________________________________________________________________________\n", - "concatenate_1 (Concatenate) (None, 1, 352) 0 no_mask_1[0][0] \n", - " attention_sequence_pooling_layer[\n", - "__________________________________________________________________________________________________\n", - "sim0 (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "time_diff0 (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "word_diff0 (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "sim_max (InputLayer) [(None, 1)] 0 \n", - 
"__________________________________________________________________________________________________\n", - "sim_min (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "sim_sum (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "sim_mean (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "score (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "rank (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "click_size (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "time_diff_mean (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "active_level (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "user_time_hob1 (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "user_time_hob2 (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "words_hbo (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "words_count (InputLayer) [(None, 1)] 0 \n", - "__________________________________________________________________________________________________\n", - "flatten (Flatten) (None, 352) 0 concatenate_1[0][0] \n", - 
"__________________________________________________________________________________________________\n", - "no_mask_3 (NoMask) (None, 1) 0 sim0[0][0] \n", - " time_diff0[0][0] \n", - " word_diff0[0][0] \n", - " sim_max[0][0] \n", - " sim_min[0][0] \n", - " sim_sum[0][0] \n", - " sim_mean[0][0] \n", - " score[0][0] \n", - " rank[0][0] \n", - " click_size[0][0] \n", - " time_diff_mean[0][0] \n", - " active_level[0][0] \n", - " user_time_hob1[0][0] \n", - " user_time_hob2[0][0] \n", - " words_hbo[0][0] \n", - " words_count[0][0] \n", - "__________________________________________________________________________________________________\n", - "no_mask_2 (NoMask) (None, 352) 0 flatten[0][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_2 (Concatenate) (None, 16) 0 no_mask_3[0][0] \n", - " no_mask_3[1][0] \n", - " no_mask_3[2][0] \n", - " no_mask_3[3][0] \n", - " no_mask_3[4][0] \n", - " no_mask_3[5][0] \n", - " no_mask_3[6][0] \n", - " no_mask_3[7][0] \n", - " no_mask_3[8][0] \n", - " no_mask_3[9][0] \n", - " no_mask_3[10][0] \n", - " no_mask_3[11][0] \n", - " no_mask_3[12][0] \n", - " no_mask_3[13][0] \n", - " no_mask_3[14][0] \n", - " no_mask_3[15][0] \n", - "__________________________________________________________________________________________________\n", - "flatten_1 (Flatten) (None, 352) 0 no_mask_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "flatten_2 (Flatten) (None, 16) 0 concatenate_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "no_mask_4 (NoMask) multiple 0 flatten_1[0][0] \n", - " flatten_2[0][0] \n", - "__________________________________________________________________________________________________\n", - "concatenate_3 (Concatenate) (None, 368) 0 no_mask_4[0][0] \n", - " no_mask_4[1][0] \n", - 
"__________________________________________________________________________________________________\n", - "dnn_1 (DNN) (None, 80) 89880 concatenate_3[0][0] \n", - "__________________________________________________________________________________________________\n", - "dense (Dense) (None, 1) 80 dnn_1[0][0] \n", - "__________________________________________________________________________________________________\n", - "prediction_layer (PredictionLay (None, 1) 1 dense[0][0] \n", - "==================================================================================================\n", - "Total params: 2,239,602\n", - "Trainable params: 2,239,362\n", - "Non-trainable params: 240\n", - "__________________________________________________________________________________________________\n" - ] - } - ], - "source": [ - "# 建立模型\n", - "model = DIN(dnn_feature_columns, behavior_fea)\n", - "\n", - "# 查看模型结构\n", - "model.summary()\n", - "\n", - "# 模型编译\n", - "model.compile('adam', 'binary_crossentropy',metrics=['binary_crossentropy', tf.keras.metrics.AUC()])" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:28:43.885773Z", - "start_time": "2020-11-18T04:26:48.746787Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:22:26.195838Z", + "start_time": "2020-11-18T04:21:46.115002Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1]\tvalid_0's ndcg@1: 0.909975\tvalid_0's ndcg@2: 0.963068\tvalid_0's ndcg@3: 0.96533\tvalid_0's ndcg@4: 0.965729\tvalid_0's ndcg@5: 0.965864\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's ndcg@1: 0.9143\tvalid_0's ndcg@2: 0.964711\tvalid_0's ndcg@3: 0.966961\tvalid_0's ndcg@4: 0.967338\tvalid_0's ndcg@5: 0.967483\n", + "[3]\tvalid_0's ndcg@1: 0.9181\tvalid_0's ndcg@2: 0.966114\tvalid_0's 
ndcg@3: 0.968289\tvalid_0's ndcg@4: 0.968773\tvalid_0's ndcg@5: 0.96887\n", + "[4]\tvalid_0's ndcg@1: 0.925575\tvalid_0's ndcg@2: 0.969093\tvalid_0's ndcg@3: 0.971193\tvalid_0's ndcg@4: 0.971603\tvalid_0's ndcg@5: 0.97169\n", + "[5]\tvalid_0's ndcg@1: 0.9267\tvalid_0's ndcg@2: 0.969635\tvalid_0's ndcg@3: 0.97166\tvalid_0's ndcg@4: 0.972037\tvalid_0's ndcg@5: 0.972133\n", + "[6]\tvalid_0's ndcg@1: 0.927\tvalid_0's ndcg@2: 0.969682\tvalid_0's ndcg@3: 0.971757\tvalid_0's ndcg@4: 0.972134\tvalid_0's ndcg@5: 0.972231\n", + "[7]\tvalid_0's ndcg@1: 0.928825\tvalid_0's ndcg@2: 0.970451\tvalid_0's ndcg@3: 0.972476\tvalid_0's ndcg@4: 0.97282\tvalid_0's ndcg@5: 0.972927\n", + "[8]\tvalid_0's ndcg@1: 0.930025\tvalid_0's ndcg@2: 0.970988\tvalid_0's ndcg@3: 0.972951\tvalid_0's ndcg@4: 0.973295\tvalid_0's ndcg@5: 0.973402\n", + "[9]\tvalid_0's ndcg@1: 0.931125\tvalid_0's ndcg@2: 0.971347\tvalid_0's ndcg@3: 0.973384\tvalid_0's ndcg@4: 0.973707\tvalid_0's ndcg@5: 0.973794\n", + "[10]\tvalid_0's ndcg@1: 0.9311\tvalid_0's ndcg@2: 0.971385\tvalid_0's ndcg@3: 0.973372\tvalid_0's ndcg@4: 0.973717\tvalid_0's ndcg@5: 0.973794\n", + "[11]\tvalid_0's ndcg@1: 0.930975\tvalid_0's ndcg@2: 0.971433\tvalid_0's ndcg@3: 0.973333\tvalid_0's ndcg@4: 0.973699\tvalid_0's ndcg@5: 0.973767\n", + "[12]\tvalid_0's ndcg@1: 0.93145\tvalid_0's ndcg@2: 0.971656\tvalid_0's ndcg@3: 0.973493\tvalid_0's ndcg@4: 0.973881\tvalid_0's ndcg@5: 0.973949\n", + "[13]\tvalid_0's ndcg@1: 0.932525\tvalid_0's ndcg@2: 0.971927\tvalid_0's ndcg@3: 0.973839\tvalid_0's ndcg@4: 0.974227\tvalid_0's ndcg@5: 0.974304\n", + "[14]\tvalid_0's ndcg@1: 0.932575\tvalid_0's ndcg@2: 0.971898\tvalid_0's ndcg@3: 0.973823\tvalid_0's ndcg@4: 0.974243\tvalid_0's ndcg@5: 0.97432\n", + "[15]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972239\tvalid_0's ndcg@3: 0.974189\tvalid_0's ndcg@4: 0.974587\tvalid_0's ndcg@5: 0.974665\n", + "[16]\tvalid_0's ndcg@1: 0.933475\tvalid_0's ndcg@2: 0.972309\tvalid_0's ndcg@3: 0.974209\tvalid_0's ndcg@4: 
0.974596\tvalid_0's ndcg@5: 0.974674\n", + "[17]\tvalid_0's ndcg@1: 0.933725\tvalid_0's ndcg@2: 0.972369\tvalid_0's ndcg@3: 0.974307\tvalid_0's ndcg@4: 0.974684\tvalid_0's ndcg@5: 0.974761\n", + "[18]\tvalid_0's ndcg@1: 0.9339\tvalid_0's ndcg@2: 0.972497\tvalid_0's ndcg@3: 0.974372\tvalid_0's ndcg@4: 0.974749\tvalid_0's ndcg@5: 0.974836\n", + "[19]\tvalid_0's ndcg@1: 0.9345\tvalid_0's ndcg@2: 0.972845\tvalid_0's ndcg@3: 0.974645\tvalid_0's ndcg@4: 0.974979\tvalid_0's ndcg@5: 0.975085\n", + "[20]\tvalid_0's ndcg@1: 0.9349\tvalid_0's ndcg@2: 0.973103\tvalid_0's ndcg@3: 0.97484\tvalid_0's ndcg@4: 0.975174\tvalid_0's ndcg@5: 0.975271\n", + "[21]\tvalid_0's ndcg@1: 0.935\tvalid_0's ndcg@2: 0.973092\tvalid_0's ndcg@3: 0.97488\tvalid_0's ndcg@4: 0.975192\tvalid_0's ndcg@5: 0.975289\n", + "[22]\tvalid_0's ndcg@1: 0.93525\tvalid_0's ndcg@2: 0.9732\tvalid_0's ndcg@3: 0.974988\tvalid_0's ndcg@4: 0.975289\tvalid_0's ndcg@5: 0.975386\n", + "[23]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 0.972949\tvalid_0's ndcg@3: 0.974824\tvalid_0's ndcg@4: 0.975136\tvalid_0's ndcg@5: 0.975223\n", + "[24]\tvalid_0's ndcg@1: 0.93545\tvalid_0's ndcg@2: 0.973274\tvalid_0's ndcg@3: 0.975087\tvalid_0's ndcg@4: 0.975388\tvalid_0's ndcg@5: 0.975475\n", + "[25]\tvalid_0's ndcg@1: 0.9356\tvalid_0's ndcg@2: 0.973345\tvalid_0's ndcg@3: 0.97512\tvalid_0's ndcg@4: 0.975443\tvalid_0's ndcg@5: 0.97553\n", + "[26]\tvalid_0's ndcg@1: 0.93525\tvalid_0's ndcg@2: 0.9732\tvalid_0's ndcg@3: 0.975\tvalid_0's ndcg@4: 0.975313\tvalid_0's ndcg@5: 0.9754\n", + "[27]\tvalid_0's ndcg@1: 0.935175\tvalid_0's ndcg@2: 0.97322\tvalid_0's ndcg@3: 0.974983\tvalid_0's ndcg@4: 0.975295\tvalid_0's ndcg@5: 0.975382\n", + "[28]\tvalid_0's ndcg@1: 0.935425\tvalid_0's ndcg@2: 0.973328\tvalid_0's ndcg@3: 0.975041\tvalid_0's ndcg@4: 0.975374\tvalid_0's ndcg@5: 0.975471\n", + "[29]\tvalid_0's ndcg@1: 0.935275\tvalid_0's ndcg@2: 0.973225\tvalid_0's ndcg@3: 0.974963\tvalid_0's ndcg@4: 0.975297\tvalid_0's ndcg@5: 0.975403\n", + 
"[30]\tvalid_0's ndcg@1: 0.9353\tvalid_0's ndcg@2: 0.973235\tvalid_0's ndcg@3: 0.97501\tvalid_0's ndcg@4: 0.975311\tvalid_0's ndcg@5: 0.975418\n", + "[31]\tvalid_0's ndcg@1: 0.9356\tvalid_0's ndcg@2: 0.973361\tvalid_0's ndcg@3: 0.975099\tvalid_0's ndcg@4: 0.975422\tvalid_0's ndcg@5: 0.975528\n", + "[32]\tvalid_0's ndcg@1: 0.9364\tvalid_0's ndcg@2: 0.973641\tvalid_0's ndcg@3: 0.975391\tvalid_0's ndcg@4: 0.975714\tvalid_0's ndcg@5: 0.97582\n", + "[33]\tvalid_0's ndcg@1: 0.9367\tvalid_0's ndcg@2: 0.973751\tvalid_0's ndcg@3: 0.975501\tvalid_0's ndcg@4: 0.975824\tvalid_0's ndcg@5: 0.975931\n", + "[34]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.973902\tvalid_0's ndcg@3: 0.975677\tvalid_0's ndcg@4: 0.975989\tvalid_0's ndcg@5: 0.976095\n", + "[35]\tvalid_0's ndcg@1: 0.9377\tvalid_0's ndcg@2: 0.974105\tvalid_0's ndcg@3: 0.975892\tvalid_0's ndcg@4: 0.976194\tvalid_0's ndcg@5: 0.9763\n", + "[36]\tvalid_0's ndcg@1: 0.938\tvalid_0's ndcg@2: 0.974184\tvalid_0's ndcg@3: 0.975984\tvalid_0's ndcg@4: 0.976296\tvalid_0's ndcg@5: 0.976402\n", + "[37]\tvalid_0's ndcg@1: 0.93845\tvalid_0's ndcg@2: 0.974366\tvalid_0's ndcg@3: 0.976166\tvalid_0's ndcg@4: 0.976467\tvalid_0's ndcg@5: 0.976574\n", + "[38]\tvalid_0's ndcg@1: 0.938925\tvalid_0's ndcg@2: 0.974557\tvalid_0's ndcg@3: 0.976332\tvalid_0's ndcg@4: 0.976655\tvalid_0's ndcg@5: 0.976751\n", + "[39]\tvalid_0's ndcg@1: 0.93865\tvalid_0's ndcg@2: 0.974471\tvalid_0's ndcg@3: 0.976234\tvalid_0's ndcg@4: 0.976557\tvalid_0's ndcg@5: 0.976653\n", + "[40]\tvalid_0's ndcg@1: 0.938325\tvalid_0's ndcg@2: 0.974335\tvalid_0's ndcg@3: 0.97611\tvalid_0's ndcg@4: 0.976433\tvalid_0's ndcg@5: 0.97653\n", + "[41]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.974669\tvalid_0's ndcg@3: 0.976431\tvalid_0's ndcg@4: 0.976743\tvalid_0's ndcg@5: 0.97683\n", + "[42]\tvalid_0's ndcg@1: 0.939375\tvalid_0's ndcg@2: 0.974833\tvalid_0's ndcg@3: 0.976546\tvalid_0's ndcg@4: 0.976858\tvalid_0's ndcg@5: 0.976945\n", + "[43]\tvalid_0's ndcg@1: 0.939625\tvalid_0's 
ndcg@2: 0.974878\tvalid_0's ndcg@3: 0.976628\tvalid_0's ndcg@4: 0.97694\tvalid_0's ndcg@5: 0.977027\n", + "[44]\tvalid_0's ndcg@1: 0.9395\tvalid_0's ndcg@2: 0.974832\tvalid_0's ndcg@3: 0.97657\tvalid_0's ndcg@4: 0.976893\tvalid_0's ndcg@5: 0.97698\n", + "[45]\tvalid_0's ndcg@1: 0.939775\tvalid_0's ndcg@2: 0.974949\tvalid_0's ndcg@3: 0.976674\tvalid_0's ndcg@4: 0.976997\tvalid_0's ndcg@5: 0.977084\n", + "[46]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.974945\tvalid_0's ndcg@3: 0.976708\tvalid_0's ndcg@4: 0.97702\tvalid_0's ndcg@5: 0.977107\n", + "[47]\tvalid_0's ndcg@1: 0.94005\tvalid_0's ndcg@2: 0.975004\tvalid_0's ndcg@3: 0.976766\tvalid_0's ndcg@4: 0.977078\tvalid_0's ndcg@5: 0.977175\n", + "[48]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975189\tvalid_0's ndcg@3: 0.976939\tvalid_0's ndcg@4: 0.97723\tvalid_0's ndcg@5: 0.977327\n", + "[49]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975189\tvalid_0's ndcg@3: 0.976939\tvalid_0's ndcg@4: 0.97723\tvalid_0's ndcg@5: 0.977327\n", + "[50]\tvalid_0's ndcg@1: 0.9405\tvalid_0's ndcg@2: 0.975264\tvalid_0's ndcg@3: 0.976989\tvalid_0's ndcg@4: 0.977291\tvalid_0's ndcg@5: 0.977368\n", + "[51]\tvalid_0's ndcg@1: 0.941125\tvalid_0's ndcg@2: 0.975526\tvalid_0's ndcg@3: 0.977226\tvalid_0's ndcg@4: 0.977528\tvalid_0's ndcg@5: 0.977605\n", + "[52]\tvalid_0's ndcg@1: 0.941\tvalid_0's ndcg@2: 0.97548\tvalid_0's ndcg@3: 0.977193\tvalid_0's ndcg@4: 0.977484\tvalid_0's ndcg@5: 0.977561\n", + "[53]\tvalid_0's ndcg@1: 0.9411\tvalid_0's ndcg@2: 0.975596\tvalid_0's ndcg@3: 0.977259\tvalid_0's ndcg@4: 0.977539\tvalid_0's ndcg@5: 0.977616\n", + "[54]\tvalid_0's ndcg@1: 0.9412\tvalid_0's ndcg@2: 0.975712\tvalid_0's ndcg@3: 0.977299\tvalid_0's ndcg@4: 0.97759\tvalid_0's ndcg@5: 0.977667\n", + "[55]\tvalid_0's ndcg@1: 0.94155\tvalid_0's ndcg@2: 0.975841\tvalid_0's ndcg@3: 0.977429\tvalid_0's ndcg@4: 0.977719\tvalid_0's ndcg@5: 0.977797\n", + "[56]\tvalid_0's ndcg@1: 0.941825\tvalid_0's ndcg@2: 0.975943\tvalid_0's ndcg@3: 
0.97753\tvalid_0's ndcg@4: 0.977821\tvalid_0's ndcg@5: 0.977898\n", + "[57]\tvalid_0's ndcg@1: 0.9416\tvalid_0's ndcg@2: 0.975891\tvalid_0's ndcg@3: 0.977429\tvalid_0's ndcg@4: 0.977741\tvalid_0's ndcg@5: 0.977818\n", + "[58]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.975969\tvalid_0's ndcg@3: 0.977494\tvalid_0's ndcg@4: 0.977795\tvalid_0's ndcg@5: 0.977873\n", + "[59]\tvalid_0's ndcg@1: 0.942025\tvalid_0's ndcg@2: 0.975985\tvalid_0's ndcg@3: 0.977547\tvalid_0's ndcg@4: 0.977881\tvalid_0's ndcg@5: 0.977958\n", + "[60]\tvalid_0's ndcg@1: 0.94205\tvalid_0's ndcg@2: 0.975994\tvalid_0's ndcg@3: 0.977569\tvalid_0's ndcg@4: 0.977892\tvalid_0's ndcg@5: 0.977969\n", + "[61]\tvalid_0's ndcg@1: 0.94205\tvalid_0's ndcg@2: 0.975947\tvalid_0's ndcg@3: 0.977559\tvalid_0's ndcg@4: 0.977882\tvalid_0's ndcg@5: 0.97796\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[62]\tvalid_0's ndcg@1: 0.942225\tvalid_0's ndcg@2: 0.976027\tvalid_0's ndcg@3: 0.97764\tvalid_0's ndcg@4: 0.977941\tvalid_0's ndcg@5: 0.978028\n", + "[63]\tvalid_0's ndcg@1: 0.942125\tvalid_0's ndcg@2: 0.976022\tvalid_0's ndcg@3: 0.977622\tvalid_0's ndcg@4: 0.977912\tvalid_0's ndcg@5: 0.977999\n", + "[64]\tvalid_0's ndcg@1: 0.942675\tvalid_0's ndcg@2: 0.976193\tvalid_0's ndcg@3: 0.977793\tvalid_0's ndcg@4: 0.978105\tvalid_0's ndcg@5: 0.978192\n", + "[65]\tvalid_0's ndcg@1: 0.942725\tvalid_0's ndcg@2: 0.976227\tvalid_0's ndcg@3: 0.977802\tvalid_0's ndcg@4: 0.978125\tvalid_0's ndcg@5: 0.978212\n", + "[66]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976132\tvalid_0's ndcg@3: 0.977695\tvalid_0's ndcg@4: 0.978018\tvalid_0's ndcg@5: 0.978105\n", + "[67]\tvalid_0's ndcg@1: 0.9424\tvalid_0's ndcg@2: 0.976092\tvalid_0's ndcg@3: 0.977679\tvalid_0's ndcg@4: 0.978002\tvalid_0's ndcg@5: 0.978089\n", + "[68]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976148\tvalid_0's ndcg@3: 0.977698\tvalid_0's ndcg@4: 0.978021\tvalid_0's ndcg@5: 0.978108\n", + "[69]\tvalid_0's ndcg@1: 0.9424\tvalid_0's 
ndcg@2: 0.976123\tvalid_0's ndcg@3: 0.977686\tvalid_0's ndcg@4: 0.978009\tvalid_0's ndcg@5: 0.978096\n", + "[70]\tvalid_0's ndcg@1: 0.942625\tvalid_0's ndcg@2: 0.976222\tvalid_0's ndcg@3: 0.977785\tvalid_0's ndcg@4: 0.978097\tvalid_0's ndcg@5: 0.978184\n", + "[71]\tvalid_0's ndcg@1: 0.942575\tvalid_0's ndcg@2: 0.976188\tvalid_0's ndcg@3: 0.977763\tvalid_0's ndcg@4: 0.978075\tvalid_0's ndcg@5: 0.978162\n", + "[72]\tvalid_0's ndcg@1: 0.9427\tvalid_0's ndcg@2: 0.976234\tvalid_0's ndcg@3: 0.977809\tvalid_0's ndcg@4: 0.978121\tvalid_0's ndcg@5: 0.978208\n", + "[73]\tvalid_0's ndcg@1: 0.9428\tvalid_0's ndcg@2: 0.976255\tvalid_0's ndcg@3: 0.977843\tvalid_0's ndcg@4: 0.978155\tvalid_0's ndcg@5: 0.978242\n", + "[74]\tvalid_0's ndcg@1: 0.94295\tvalid_0's ndcg@2: 0.97631\tvalid_0's ndcg@3: 0.977898\tvalid_0's ndcg@4: 0.97821\tvalid_0's ndcg@5: 0.978297\n", + "[75]\tvalid_0's ndcg@1: 0.943\tvalid_0's ndcg@2: 0.976329\tvalid_0's ndcg@3: 0.977941\tvalid_0's ndcg@4: 0.978232\tvalid_0's ndcg@5: 0.978319\n", + "[76]\tvalid_0's ndcg@1: 0.9433\tvalid_0's ndcg@2: 0.976471\tvalid_0's ndcg@3: 0.978059\tvalid_0's ndcg@4: 0.97836\tvalid_0's ndcg@5: 0.978437\n", + "[77]\tvalid_0's ndcg@1: 0.94315\tvalid_0's ndcg@2: 0.976416\tvalid_0's ndcg@3: 0.977991\tvalid_0's ndcg@4: 0.978314\tvalid_0's ndcg@5: 0.978381\n", + "[78]\tvalid_0's ndcg@1: 0.943675\tvalid_0's ndcg@2: 0.976657\tvalid_0's ndcg@3: 0.978194\tvalid_0's ndcg@4: 0.978517\tvalid_0's ndcg@5: 0.978585\n", + "[79]\tvalid_0's ndcg@1: 0.94365\tvalid_0's ndcg@2: 0.976663\tvalid_0's ndcg@3: 0.978188\tvalid_0's ndcg@4: 0.978501\tvalid_0's ndcg@5: 0.978578\n", + "[80]\tvalid_0's ndcg@1: 0.943725\tvalid_0's ndcg@2: 0.976628\tvalid_0's ndcg@3: 0.978203\tvalid_0's ndcg@4: 0.978515\tvalid_0's ndcg@5: 0.978593\n", + "[81]\tvalid_0's ndcg@1: 0.943975\tvalid_0's ndcg@2: 0.97672\tvalid_0's ndcg@3: 0.978295\tvalid_0's ndcg@4: 0.978607\tvalid_0's ndcg@5: 0.978685\n", + "[82]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.976822\tvalid_0's ndcg@3: 
0.978397\tvalid_0's ndcg@4: 0.97872\tvalid_0's ndcg@5: 0.978787\n", + "[83]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.976788\tvalid_0's ndcg@3: 0.978375\tvalid_0's ndcg@4: 0.978698\tvalid_0's ndcg@5: 0.978766\n", + "[84]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.97679\tvalid_0's ndcg@3: 0.97839\tvalid_0's ndcg@4: 0.978702\tvalid_0's ndcg@5: 0.97878\n", + "[85]\tvalid_0's ndcg@1: 0.9443\tvalid_0's ndcg@2: 0.976809\tvalid_0's ndcg@3: 0.978421\tvalid_0's ndcg@4: 0.978723\tvalid_0's ndcg@5: 0.9788\n", + "[86]\tvalid_0's ndcg@1: 0.944525\tvalid_0's ndcg@2: 0.976939\tvalid_0's ndcg@3: 0.978502\tvalid_0's ndcg@4: 0.978814\tvalid_0's ndcg@5: 0.978891\n", + "[87]\tvalid_0's ndcg@1: 0.944625\tvalid_0's ndcg@2: 0.976976\tvalid_0's ndcg@3: 0.978551\tvalid_0's ndcg@4: 0.978852\tvalid_0's ndcg@5: 0.97893\n", + "[88]\tvalid_0's ndcg@1: 0.944925\tvalid_0's ndcg@2: 0.977102\tvalid_0's ndcg@3: 0.978677\tvalid_0's ndcg@4: 0.978968\tvalid_0's ndcg@5: 0.979045\n", + "[89]\tvalid_0's ndcg@1: 0.945125\tvalid_0's ndcg@2: 0.977208\tvalid_0's ndcg@3: 0.978758\tvalid_0's ndcg@4: 0.979048\tvalid_0's ndcg@5: 0.979126\n", + "[90]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977135\tvalid_0's ndcg@3: 0.978735\tvalid_0's ndcg@4: 0.979026\tvalid_0's ndcg@5: 0.979104\n", + "[91]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977208\tvalid_0's ndcg@3: 0.978858\tvalid_0's ndcg@4: 0.979138\tvalid_0's ndcg@5: 0.979215\n", + "[92]\tvalid_0's ndcg@1: 0.9455\tvalid_0's ndcg@2: 0.977267\tvalid_0's ndcg@3: 0.978905\tvalid_0's ndcg@4: 0.979174\tvalid_0's ndcg@5: 0.979251\n", + "[93]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977193\tvalid_0's ndcg@3: 0.978818\tvalid_0's ndcg@4: 0.979098\tvalid_0's ndcg@5: 0.979176\n", + "[94]\tvalid_0's ndcg@1: 0.94545\tvalid_0's ndcg@2: 0.97728\tvalid_0's ndcg@3: 0.97888\tvalid_0's ndcg@4: 0.97916\tvalid_0's ndcg@5: 0.979238\n", + "[95]\tvalid_0's ndcg@1: 0.9458\tvalid_0's ndcg@2: 0.977394\tvalid_0's ndcg@3: 0.979006\tvalid_0's ndcg@4: 0.979286\tvalid_0's 
ndcg@5: 0.979364\n", + "[96]\tvalid_0's ndcg@1: 0.946075\tvalid_0's ndcg@2: 0.977527\tvalid_0's ndcg@3: 0.979114\tvalid_0's ndcg@4: 0.979394\tvalid_0's ndcg@5: 0.979472\n", + "[97]\tvalid_0's ndcg@1: 0.946475\tvalid_0's ndcg@2: 0.977659\tvalid_0's ndcg@3: 0.979259\tvalid_0's ndcg@4: 0.979539\tvalid_0's ndcg@5: 0.979616\n", + "[98]\tvalid_0's ndcg@1: 0.94675\tvalid_0's ndcg@2: 0.97776\tvalid_0's ndcg@3: 0.97936\tvalid_0's ndcg@4: 0.979651\tvalid_0's ndcg@5: 0.979719\n", + "[99]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.977831\tvalid_0's ndcg@3: 0.979419\tvalid_0's ndcg@4: 0.97971\tvalid_0's ndcg@5: 0.979777\n", + "[100]\tvalid_0's ndcg@1: 0.9468\tvalid_0's ndcg@2: 0.977794\tvalid_0's ndcg@3: 0.979369\tvalid_0's ndcg@4: 0.979671\tvalid_0's ndcg@5: 0.979739\n", + "Did not meet early stopping. Best iteration is:\n", + "[99]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.977831\tvalid_0's ndcg@3: 0.979419\tvalid_0's ndcg@4: 0.97971\tvalid_0's ndcg@5: 0.979777\n", + "[1]\tvalid_0's ndcg@1: 0.909075\tvalid_0's ndcg@2: 0.963019\tvalid_0's ndcg@3: 0.965069\tvalid_0's ndcg@4: 0.965543\tvalid_0's ndcg@5: 0.965601\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's ndcg@1: 0.9123\tvalid_0's ndcg@2: 0.964273\tvalid_0's ndcg@3: 0.966248\tvalid_0's ndcg@4: 0.966722\tvalid_0's ndcg@5: 0.966789\n", + "[3]\tvalid_0's ndcg@1: 0.915075\tvalid_0's ndcg@2: 0.965691\tvalid_0's ndcg@3: 0.967466\tvalid_0's ndcg@4: 0.967854\tvalid_0's ndcg@5: 0.967922\n", + "[4]\tvalid_0's ndcg@1: 0.91845\tvalid_0's ndcg@2: 0.967047\tvalid_0's ndcg@3: 0.968735\tvalid_0's ndcg@4: 0.969133\tvalid_0's ndcg@5: 0.969201\n", + "[5]\tvalid_0's ndcg@1: 0.92355\tvalid_0's ndcg@2: 0.968961\tvalid_0's ndcg@3: 0.970674\tvalid_0's ndcg@4: 0.97104\tvalid_0's ndcg@5: 0.971098\n", + "[6]\tvalid_0's ndcg@1: 0.9253\tvalid_0's ndcg@2: 0.969607\tvalid_0's ndcg@3: 0.971345\tvalid_0's ndcg@4: 0.971689\tvalid_0's ndcg@5: 0.971747\n", + "[7]\tvalid_0's ndcg@1: 0.926225\tvalid_0's ndcg@2: 
0.969933\tvalid_0's ndcg@3: 0.971708\tvalid_0's ndcg@4: 0.972031\tvalid_0's ndcg@5: 0.972079\n", + "[8]\tvalid_0's ndcg@1: 0.926475\tvalid_0's ndcg@2: 0.970104\tvalid_0's ndcg@3: 0.971804\tvalid_0's ndcg@4: 0.972116\tvalid_0's ndcg@5: 0.972184\n", + "[9]\tvalid_0's ndcg@1: 0.9277\tvalid_0's ndcg@2: 0.970682\tvalid_0's ndcg@3: 0.972307\tvalid_0's ndcg@4: 0.972598\tvalid_0's ndcg@5: 0.972675\n", + "[10]\tvalid_0's ndcg@1: 0.92775\tvalid_0's ndcg@2: 0.970653\tvalid_0's ndcg@3: 0.972316\tvalid_0's ndcg@4: 0.972617\tvalid_0's ndcg@5: 0.972685\n", + "[11]\tvalid_0's ndcg@1: 0.9283\tvalid_0's ndcg@2: 0.97084\tvalid_0's ndcg@3: 0.97254\tvalid_0's ndcg@4: 0.97281\tvalid_0's ndcg@5: 0.972887\n", + "[12]\tvalid_0's ndcg@1: 0.9287\tvalid_0's ndcg@2: 0.971051\tvalid_0's ndcg@3: 0.972701\tvalid_0's ndcg@4: 0.97297\tvalid_0's ndcg@5: 0.973048\n", + "[13]\tvalid_0's ndcg@1: 0.9297\tvalid_0's ndcg@2: 0.971389\tvalid_0's ndcg@3: 0.973001\tvalid_0's ndcg@4: 0.973313\tvalid_0's ndcg@5: 0.9734\n", + "[14]\tvalid_0's ndcg@1: 0.92955\tvalid_0's ndcg@2: 0.971444\tvalid_0's ndcg@3: 0.972994\tvalid_0's ndcg@4: 0.973284\tvalid_0's ndcg@5: 0.973371\n", + "[15]\tvalid_0's ndcg@1: 0.930225\tvalid_0's ndcg@2: 0.97174\tvalid_0's ndcg@3: 0.973253\tvalid_0's ndcg@4: 0.973543\tvalid_0's ndcg@5: 0.97363\n", + "[16]\tvalid_0's ndcg@1: 0.930425\tvalid_0's ndcg@2: 0.971798\tvalid_0's ndcg@3: 0.973298\tvalid_0's ndcg@4: 0.97361\tvalid_0's ndcg@5: 0.973698\n", + "[17]\tvalid_0's ndcg@1: 0.93125\tvalid_0's ndcg@2: 0.971992\tvalid_0's ndcg@3: 0.97358\tvalid_0's ndcg@4: 0.973903\tvalid_0's ndcg@5: 0.97398\n", + "[18]\tvalid_0's ndcg@1: 0.931925\tvalid_0's ndcg@2: 0.972257\tvalid_0's ndcg@3: 0.973845\tvalid_0's ndcg@4: 0.974146\tvalid_0's ndcg@5: 0.974224\n", + "[19]\tvalid_0's ndcg@1: 0.932375\tvalid_0's ndcg@2: 0.972376\tvalid_0's ndcg@3: 0.974038\tvalid_0's ndcg@4: 0.974318\tvalid_0's ndcg@5: 0.974376\n", + "[20]\tvalid_0's ndcg@1: 0.932\tvalid_0's ndcg@2: 0.972269\tvalid_0's ndcg@3: 0.973907\tvalid_0's 
ndcg@4: 0.974187\tvalid_0's ndcg@5: 0.974245\n", + "[21]\tvalid_0's ndcg@1: 0.932725\tvalid_0's ndcg@2: 0.972568\tvalid_0's ndcg@3: 0.974181\tvalid_0's ndcg@4: 0.974471\tvalid_0's ndcg@5: 0.974529\n", + "[22]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.972735\tvalid_0's ndcg@3: 0.974298\tvalid_0's ndcg@4: 0.974599\tvalid_0's ndcg@5: 0.974657\n", + "[23]\tvalid_0's ndcg@1: 0.932925\tvalid_0's ndcg@2: 0.972642\tvalid_0's ndcg@3: 0.974255\tvalid_0's ndcg@4: 0.974545\tvalid_0's ndcg@5: 0.974594\n", + "[24]\tvalid_0's ndcg@1: 0.933175\tvalid_0's ndcg@2: 0.972734\tvalid_0's ndcg@3: 0.974347\tvalid_0's ndcg@4: 0.974638\tvalid_0's ndcg@5: 0.974686\n", + "[25]\tvalid_0's ndcg@1: 0.9331\tvalid_0's ndcg@2: 0.972754\tvalid_0's ndcg@3: 0.974366\tvalid_0's ndcg@4: 0.974636\tvalid_0's ndcg@5: 0.974674\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[26]\tvalid_0's ndcg@1: 0.933275\tvalid_0's ndcg@2: 0.972787\tvalid_0's ndcg@3: 0.974424\tvalid_0's ndcg@4: 0.974694\tvalid_0's ndcg@5: 0.974732\n", + "[27]\tvalid_0's ndcg@1: 0.93325\tvalid_0's ndcg@2: 0.972809\tvalid_0's ndcg@3: 0.974434\tvalid_0's ndcg@4: 0.974703\tvalid_0's ndcg@5: 0.974732\n", + "[28]\tvalid_0's ndcg@1: 0.933625\tvalid_0's ndcg@2: 0.972932\tvalid_0's ndcg@3: 0.974557\tvalid_0's ndcg@4: 0.974826\tvalid_0's ndcg@5: 0.974855\n", + "[29]\tvalid_0's ndcg@1: 0.933725\tvalid_0's ndcg@2: 0.972937\tvalid_0's ndcg@3: 0.974587\tvalid_0's ndcg@4: 0.974856\tvalid_0's ndcg@5: 0.974885\n", + "[30]\tvalid_0's ndcg@1: 0.93355\tvalid_0's ndcg@2: 0.972873\tvalid_0's ndcg@3: 0.974523\tvalid_0's ndcg@4: 0.974792\tvalid_0's ndcg@5: 0.974821\n", + "[31]\tvalid_0's ndcg@1: 0.9342\tvalid_0's ndcg@2: 0.973065\tvalid_0's ndcg@3: 0.974753\tvalid_0's ndcg@4: 0.975022\tvalid_0's ndcg@5: 0.975051\n", + "[32]\tvalid_0's ndcg@1: 0.93435\tvalid_0's ndcg@2: 0.973152\tvalid_0's ndcg@3: 0.974815\tvalid_0's ndcg@4: 0.975084\tvalid_0's ndcg@5: 0.975113\n", + "[33]\tvalid_0's ndcg@1: 0.934475\tvalid_0's ndcg@2: 
0.97323\tvalid_0's ndcg@3: 0.974855\tvalid_0's ndcg@4: 0.975135\tvalid_0's ndcg@5: 0.975164\n", + "[34]\tvalid_0's ndcg@1: 0.9342\tvalid_0's ndcg@2: 0.973113\tvalid_0's ndcg@3: 0.974738\tvalid_0's ndcg@4: 0.975028\tvalid_0's ndcg@5: 0.975057\n", + "[35]\tvalid_0's ndcg@1: 0.93455\tvalid_0's ndcg@2: 0.973258\tvalid_0's ndcg@3: 0.97487\tvalid_0's ndcg@4: 0.975172\tvalid_0's ndcg@5: 0.975201\n", + "[36]\tvalid_0's ndcg@1: 0.9344\tvalid_0's ndcg@2: 0.973265\tvalid_0's ndcg@3: 0.974828\tvalid_0's ndcg@4: 0.975129\tvalid_0's ndcg@5: 0.975158\n", + "[37]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 0.973438\tvalid_0's ndcg@3: 0.975013\tvalid_0's ndcg@4: 0.975304\tvalid_0's ndcg@5: 0.975323\n", + "[38]\tvalid_0's ndcg@1: 0.934975\tvalid_0's ndcg@2: 0.973541\tvalid_0's ndcg@3: 0.975066\tvalid_0's ndcg@4: 0.975367\tvalid_0's ndcg@5: 0.975386\n", + "[39]\tvalid_0's ndcg@1: 0.935275\tvalid_0's ndcg@2: 0.973667\tvalid_0's ndcg@3: 0.975192\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975502\n", + "[40]\tvalid_0's ndcg@1: 0.9352\tvalid_0's ndcg@2: 0.973624\tvalid_0's ndcg@3: 0.975174\tvalid_0's ndcg@4: 0.975454\tvalid_0's ndcg@5: 0.975473\n", + "[41]\tvalid_0's ndcg@1: 0.935325\tvalid_0's ndcg@2: 0.973686\tvalid_0's ndcg@3: 0.975223\tvalid_0's ndcg@4: 0.975503\tvalid_0's ndcg@5: 0.975522\n", + "[42]\tvalid_0's ndcg@1: 0.93545\tvalid_0's ndcg@2: 0.973716\tvalid_0's ndcg@3: 0.975266\tvalid_0's ndcg@4: 0.975546\tvalid_0's ndcg@5: 0.975565\n", + "[43]\tvalid_0's ndcg@1: 0.93615\tvalid_0's ndcg@2: 0.974022\tvalid_0's ndcg@3: 0.975534\tvalid_0's ndcg@4: 0.975814\tvalid_0's ndcg@5: 0.975843\n", + "[44]\tvalid_0's ndcg@1: 0.936225\tvalid_0's ndcg@2: 0.974112\tvalid_0's ndcg@3: 0.975562\tvalid_0's ndcg@4: 0.975853\tvalid_0's ndcg@5: 0.975882\n", + "[45]\tvalid_0's ndcg@1: 0.9365\tvalid_0's ndcg@2: 0.974167\tvalid_0's ndcg@3: 0.975654\tvalid_0's ndcg@4: 0.975945\tvalid_0's ndcg@5: 0.975974\n", + "[46]\tvalid_0's ndcg@1: 0.93665\tvalid_0's ndcg@2: 0.974206\tvalid_0's ndcg@3: 
0.975694\tvalid_0's ndcg@4: 0.975995\tvalid_0's ndcg@5: 0.976024\n", + "[47]\tvalid_0's ndcg@1: 0.93685\tvalid_0's ndcg@2: 0.974311\tvalid_0's ndcg@3: 0.975786\tvalid_0's ndcg@4: 0.976077\tvalid_0's ndcg@5: 0.976106\n", + "[48]\tvalid_0's ndcg@1: 0.937025\tvalid_0's ndcg@2: 0.974408\tvalid_0's ndcg@3: 0.975845\tvalid_0's ndcg@4: 0.976147\tvalid_0's ndcg@5: 0.976185\n", + "[49]\tvalid_0's ndcg@1: 0.936975\tvalid_0's ndcg@2: 0.974342\tvalid_0's ndcg@3: 0.975829\tvalid_0's ndcg@4: 0.97612\tvalid_0's ndcg@5: 0.976159\n", + "[50]\tvalid_0's ndcg@1: 0.9371\tvalid_0's ndcg@2: 0.974388\tvalid_0's ndcg@3: 0.97585\tvalid_0's ndcg@4: 0.976152\tvalid_0's ndcg@5: 0.976191\n", + "[51]\tvalid_0's ndcg@1: 0.937025\tvalid_0's ndcg@2: 0.974329\tvalid_0's ndcg@3: 0.975841\tvalid_0's ndcg@4: 0.976121\tvalid_0's ndcg@5: 0.97616\n", + "[52]\tvalid_0's ndcg@1: 0.9377\tvalid_0's ndcg@2: 0.974578\tvalid_0's ndcg@3: 0.976078\tvalid_0's ndcg@4: 0.976369\tvalid_0's ndcg@5: 0.976407\n", + "[53]\tvalid_0's ndcg@1: 0.9378\tvalid_0's ndcg@2: 0.974615\tvalid_0's ndcg@3: 0.976115\tvalid_0's ndcg@4: 0.976405\tvalid_0's ndcg@5: 0.976444\n", + "[54]\tvalid_0's ndcg@1: 0.938\tvalid_0's ndcg@2: 0.974689\tvalid_0's ndcg@3: 0.976214\tvalid_0's ndcg@4: 0.976483\tvalid_0's ndcg@5: 0.976521\n", + "[55]\tvalid_0's ndcg@1: 0.938225\tvalid_0's ndcg@2: 0.974803\tvalid_0's ndcg@3: 0.976303\tvalid_0's ndcg@4: 0.976572\tvalid_0's ndcg@5: 0.976611\n", + "[56]\tvalid_0's ndcg@1: 0.938175\tvalid_0's ndcg@2: 0.9748\tvalid_0's ndcg@3: 0.976275\tvalid_0's ndcg@4: 0.976555\tvalid_0's ndcg@5: 0.976594\n", + "[57]\tvalid_0's ndcg@1: 0.938525\tvalid_0's ndcg@2: 0.974914\tvalid_0's ndcg@3: 0.976414\tvalid_0's ndcg@4: 0.976683\tvalid_0's ndcg@5: 0.976722\n", + "[58]\tvalid_0's ndcg@1: 0.93875\tvalid_0's ndcg@2: 0.975028\tvalid_0's ndcg@3: 0.976503\tvalid_0's ndcg@4: 0.976773\tvalid_0's ndcg@5: 0.976811\n", + "[59]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975198\tvalid_0's ndcg@3: 0.976648\tvalid_0's ndcg@4: 
0.976918\tvalid_0's ndcg@5: 0.976956\n", + "[60]\tvalid_0's ndcg@1: 0.939025\tvalid_0's ndcg@2: 0.975177\tvalid_0's ndcg@3: 0.976615\tvalid_0's ndcg@4: 0.976884\tvalid_0's ndcg@5: 0.976923\n", + "[61]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.975205\tvalid_0's ndcg@3: 0.976642\tvalid_0's ndcg@4: 0.976912\tvalid_0's ndcg@5: 0.97695\n", + "[62]\tvalid_0's ndcg@1: 0.93965\tvalid_0's ndcg@2: 0.975424\tvalid_0's ndcg@3: 0.976836\tvalid_0's ndcg@4: 0.977116\tvalid_0's ndcg@5: 0.977155\n", + "[63]\tvalid_0's ndcg@1: 0.940075\tvalid_0's ndcg@2: 0.975596\tvalid_0's ndcg@3: 0.976996\tvalid_0's ndcg@4: 0.977276\tvalid_0's ndcg@5: 0.977315\n", + "[64]\tvalid_0's ndcg@1: 0.940375\tvalid_0's ndcg@2: 0.975723\tvalid_0's ndcg@3: 0.977123\tvalid_0's ndcg@4: 0.977392\tvalid_0's ndcg@5: 0.977431\n", + "[65]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975766\tvalid_0's ndcg@3: 0.977154\tvalid_0's ndcg@4: 0.977423\tvalid_0's ndcg@5: 0.977462\n", + "[66]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.975744\tvalid_0's ndcg@3: 0.977156\tvalid_0's ndcg@4: 0.977426\tvalid_0's ndcg@5: 0.977464\n", + "[67]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.97576\tvalid_0's ndcg@3: 0.977172\tvalid_0's ndcg@4: 0.977431\tvalid_0's ndcg@5: 0.977469\n", + "[68]\tvalid_0's ndcg@1: 0.940675\tvalid_0's ndcg@2: 0.975849\tvalid_0's ndcg@3: 0.977249\tvalid_0's ndcg@4: 0.977508\tvalid_0's ndcg@5: 0.977546\n", + "[69]\tvalid_0's ndcg@1: 0.9413\tvalid_0's ndcg@2: 0.976017\tvalid_0's ndcg@3: 0.977454\tvalid_0's ndcg@4: 0.977724\tvalid_0's ndcg@5: 0.977762\n", + "[70]\tvalid_0's ndcg@1: 0.94105\tvalid_0's ndcg@2: 0.975925\tvalid_0's ndcg@3: 0.977362\tvalid_0's ndcg@4: 0.977631\tvalid_0's ndcg@5: 0.97767\n", + "[71]\tvalid_0's ndcg@1: 0.94105\tvalid_0's ndcg@2: 0.975925\tvalid_0's ndcg@3: 0.97735\tvalid_0's ndcg@4: 0.97763\tvalid_0's ndcg@5: 0.977668\n", + "[72]\tvalid_0's ndcg@1: 0.941325\tvalid_0's ndcg@2: 0.976058\tvalid_0's ndcg@3: 0.97747\tvalid_0's ndcg@4: 0.977739\tvalid_0's ndcg@5: 
0.977778\n", + "[73]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.976076\tvalid_0's ndcg@3: 0.977476\tvalid_0's ndcg@4: 0.977756\tvalid_0's ndcg@5: 0.977795\n", + "[74]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.97619\tvalid_0's ndcg@3: 0.97759\tvalid_0's ndcg@4: 0.97788\tvalid_0's ndcg@5: 0.977919\n", + "[75]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.97619\tvalid_0's ndcg@3: 0.977602\tvalid_0's ndcg@4: 0.977882\tvalid_0's ndcg@5: 0.977921\n", + "[76]\tvalid_0's ndcg@1: 0.94195\tvalid_0's ndcg@2: 0.976273\tvalid_0's ndcg@3: 0.977685\tvalid_0's ndcg@4: 0.977965\tvalid_0's ndcg@5: 0.978004\n", + "[77]\tvalid_0's ndcg@1: 0.9419\tvalid_0's ndcg@2: 0.97627\tvalid_0's ndcg@3: 0.97767\tvalid_0's ndcg@4: 0.97795\tvalid_0's ndcg@5: 0.977989\n", + "[78]\tvalid_0's ndcg@1: 0.94235\tvalid_0's ndcg@2: 0.976452\tvalid_0's ndcg@3: 0.977839\tvalid_0's ndcg@4: 0.978119\tvalid_0's ndcg@5: 0.978158\n", + "[79]\tvalid_0's ndcg@1: 0.94265\tvalid_0's ndcg@2: 0.976562\tvalid_0's ndcg@3: 0.977937\tvalid_0's ndcg@4: 0.978228\tvalid_0's ndcg@5: 0.978267\n", + "[80]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976667\tvalid_0's ndcg@3: 0.978067\tvalid_0's ndcg@4: 0.978347\tvalid_0's ndcg@5: 0.978385\n", + "[81]\tvalid_0's ndcg@1: 0.94305\tvalid_0's ndcg@2: 0.97671\tvalid_0's ndcg@3: 0.978098\tvalid_0's ndcg@4: 0.978378\tvalid_0's ndcg@5: 0.978416\n", + "[82]\tvalid_0's ndcg@1: 0.943175\tvalid_0's ndcg@2: 0.97674\tvalid_0's ndcg@3: 0.978115\tvalid_0's ndcg@4: 0.978417\tvalid_0's ndcg@5: 0.978456\n", + "[83]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976752\tvalid_0's ndcg@3: 0.97814\tvalid_0's ndcg@4: 0.978441\tvalid_0's ndcg@5: 0.97848\n", + "[84]\tvalid_0's ndcg@1: 0.943375\tvalid_0's ndcg@2: 0.976767\tvalid_0's ndcg@3: 0.978179\tvalid_0's ndcg@4: 0.978481\tvalid_0's ndcg@5: 0.97852\n", + "[85]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976721\tvalid_0's ndcg@3: 0.978146\tvalid_0's ndcg@4: 0.978437\tvalid_0's ndcg@5: 0.978475\n", + "[86]\tvalid_0's ndcg@1: 
0.9434\tvalid_0's ndcg@2: 0.976792\tvalid_0's ndcg@3: 0.978204\tvalid_0's ndcg@4: 0.978506\tvalid_0's ndcg@5: 0.978535\n", + "[87]\tvalid_0's ndcg@1: 0.943475\tvalid_0's ndcg@2: 0.976851\tvalid_0's ndcg@3: 0.978239\tvalid_0's ndcg@4: 0.97854\tvalid_0's ndcg@5: 0.978569\n", + "[88]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976882\tvalid_0's ndcg@3: 0.978282\tvalid_0's ndcg@4: 0.978572\tvalid_0's ndcg@5: 0.978611\n", + "[89]\tvalid_0's ndcg@1: 0.943775\tvalid_0's ndcg@2: 0.976915\tvalid_0's ndcg@3: 0.97834\tvalid_0's ndcg@4: 0.97863\tvalid_0's ndcg@5: 0.978669\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[90]\tvalid_0's ndcg@1: 0.943925\tvalid_0's ndcg@2: 0.976986\tvalid_0's ndcg@3: 0.978398\tvalid_0's ndcg@4: 0.978689\tvalid_0's ndcg@5: 0.978728\n", + "[91]\tvalid_0's ndcg@1: 0.943875\tvalid_0's ndcg@2: 0.976999\tvalid_0's ndcg@3: 0.978399\tvalid_0's ndcg@4: 0.978679\tvalid_0's ndcg@5: 0.978717\n", + "[92]\tvalid_0's ndcg@1: 0.94395\tvalid_0's ndcg@2: 0.977058\tvalid_0's ndcg@3: 0.978421\tvalid_0's ndcg@4: 0.978711\tvalid_0's ndcg@5: 0.97876\n", + "[93]\tvalid_0's ndcg@1: 0.944075\tvalid_0's ndcg@2: 0.977104\tvalid_0's ndcg@3: 0.978479\tvalid_0's ndcg@4: 0.978759\tvalid_0's ndcg@5: 0.978807\n", + "[94]\tvalid_0's ndcg@1: 0.944175\tvalid_0's ndcg@2: 0.977125\tvalid_0's ndcg@3: 0.978513\tvalid_0's ndcg@4: 0.978793\tvalid_0's ndcg@5: 0.978841\n", + "[95]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.977153\tvalid_0's ndcg@3: 0.97854\tvalid_0's ndcg@4: 0.97882\tvalid_0's ndcg@5: 0.978869\n", + "[96]\tvalid_0's ndcg@1: 0.944225\tvalid_0's ndcg@2: 0.977144\tvalid_0's ndcg@3: 0.978531\tvalid_0's ndcg@4: 0.978811\tvalid_0's ndcg@5: 0.97886\n", + "[97]\tvalid_0's ndcg@1: 0.94435\tvalid_0's ndcg@2: 0.977221\tvalid_0's ndcg@3: 0.978584\tvalid_0's ndcg@4: 0.978864\tvalid_0's ndcg@5: 0.978912\n", + "[98]\tvalid_0's ndcg@1: 0.944575\tvalid_0's ndcg@2: 0.977289\tvalid_0's ndcg@3: 0.978651\tvalid_0's ndcg@4: 0.978942\tvalid_0's ndcg@5: 
0.97899\n", + "[99]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.977341\tvalid_0's ndcg@3: 0.978691\tvalid_0's ndcg@4: 0.978993\tvalid_0's ndcg@5: 0.979032\n", + "[100]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977482\tvalid_0's ndcg@3: 0.978857\tvalid_0's ndcg@4: 0.979148\tvalid_0's ndcg@5: 0.979187\n", + "Did not meet early stopping. Best iteration is:\n", + "[100]\tvalid_0's ndcg@1: 0.9451\tvalid_0's ndcg@2: 0.977482\tvalid_0's ndcg@3: 0.978857\tvalid_0's ndcg@4: 0.979148\tvalid_0's ndcg@5: 0.979187\n", + "[1]\tvalid_0's ndcg@1: 0.911575\tvalid_0's ndcg@2: 0.964384\tvalid_0's ndcg@3: 0.966321\tvalid_0's ndcg@4: 0.966623\tvalid_0's ndcg@5: 0.966671\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's ndcg@1: 0.9136\tvalid_0's ndcg@2: 0.965257\tvalid_0's ndcg@3: 0.967107\tvalid_0's ndcg@4: 0.967398\tvalid_0's ndcg@5: 0.967456\n", + "[3]\tvalid_0's ndcg@1: 0.917425\tvalid_0's ndcg@2: 0.966732\tvalid_0's ndcg@3: 0.968545\tvalid_0's ndcg@4: 0.968814\tvalid_0's ndcg@5: 0.968882\n", + "[4]\tvalid_0's ndcg@1: 0.9222\tvalid_0's ndcg@2: 0.968558\tvalid_0's ndcg@3: 0.970383\tvalid_0's ndcg@4: 0.970619\tvalid_0's ndcg@5: 0.970668\n", + "[5]\tvalid_0's ndcg@1: 0.925875\tvalid_0's ndcg@2: 0.969914\tvalid_0's ndcg@3: 0.971714\tvalid_0's ndcg@4: 0.971972\tvalid_0's ndcg@5: 0.972021\n", + "[6]\tvalid_0's ndcg@1: 0.926875\tvalid_0's ndcg@2: 0.970425\tvalid_0's ndcg@3: 0.972112\tvalid_0's ndcg@4: 0.972371\tvalid_0's ndcg@5: 0.972419\n", + "[7]\tvalid_0's ndcg@1: 0.927475\tvalid_0's ndcg@2: 0.970631\tvalid_0's ndcg@3: 0.972306\tvalid_0's ndcg@4: 0.972586\tvalid_0's ndcg@5: 0.972634\n", + "[8]\tvalid_0's ndcg@1: 0.93015\tvalid_0's ndcg@2: 0.971649\tvalid_0's ndcg@3: 0.973287\tvalid_0's ndcg@4: 0.973567\tvalid_0's ndcg@5: 0.973625\n", + "[9]\tvalid_0's ndcg@1: 0.9312\tvalid_0's ndcg@2: 0.972084\tvalid_0's ndcg@3: 0.973684\tvalid_0's ndcg@4: 0.973964\tvalid_0's ndcg@5: 0.974022\n", + "[10]\tvalid_0's ndcg@1: 0.93225\tvalid_0's ndcg@2: 
0.972456\tvalid_0's ndcg@3: 0.974081\tvalid_0's ndcg@4: 0.974361\tvalid_0's ndcg@5: 0.974409\n", + "[11]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.972704\tvalid_0's ndcg@3: 0.974379\tvalid_0's ndcg@4: 0.974648\tvalid_0's ndcg@5: 0.974696\n", + "[12]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972949\tvalid_0's ndcg@3: 0.974574\tvalid_0's ndcg@4: 0.974832\tvalid_0's ndcg@5: 0.974881\n", + "[13]\tvalid_0's ndcg@1: 0.93415\tvalid_0's ndcg@2: 0.97322\tvalid_0's ndcg@3: 0.97482\tvalid_0's ndcg@4: 0.975079\tvalid_0's ndcg@5: 0.975127\n", + "[14]\tvalid_0's ndcg@1: 0.9352\tvalid_0's ndcg@2: 0.973671\tvalid_0's ndcg@3: 0.975246\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975531\n", + "[15]\tvalid_0's ndcg@1: 0.9358\tvalid_0's ndcg@2: 0.973877\tvalid_0's ndcg@3: 0.975452\tvalid_0's ndcg@4: 0.975699\tvalid_0's ndcg@5: 0.975748\n", + "[16]\tvalid_0's ndcg@1: 0.935825\tvalid_0's ndcg@2: 0.973917\tvalid_0's ndcg@3: 0.975442\tvalid_0's ndcg@4: 0.975712\tvalid_0's ndcg@5: 0.97576\n", + "[17]\tvalid_0's ndcg@1: 0.936475\tvalid_0's ndcg@2: 0.97411\tvalid_0's ndcg@3: 0.975697\tvalid_0's ndcg@4: 0.975956\tvalid_0's ndcg@5: 0.975995\n", + "[18]\tvalid_0's ndcg@1: 0.936925\tvalid_0's ndcg@2: 0.974292\tvalid_0's ndcg@3: 0.975867\tvalid_0's ndcg@4: 0.976114\tvalid_0's ndcg@5: 0.976163\n", + "[19]\tvalid_0's ndcg@1: 0.937525\tvalid_0's ndcg@2: 0.974545\tvalid_0's ndcg@3: 0.976095\tvalid_0's ndcg@4: 0.976342\tvalid_0's ndcg@5: 0.976391\n", + "[20]\tvalid_0's ndcg@1: 0.937775\tvalid_0's ndcg@2: 0.974653\tvalid_0's ndcg@3: 0.976203\tvalid_0's ndcg@4: 0.976429\tvalid_0's ndcg@5: 0.976487\n", + "[21]\tvalid_0's ndcg@1: 0.938825\tvalid_0's ndcg@2: 0.975072\tvalid_0's ndcg@3: 0.976597\tvalid_0's ndcg@4: 0.976823\tvalid_0's ndcg@5: 0.976881\n", + "[22]\tvalid_0's ndcg@1: 0.93885\tvalid_0's ndcg@2: 0.975097\tvalid_0's ndcg@3: 0.976609\tvalid_0's ndcg@4: 0.976846\tvalid_0's ndcg@5: 0.976895\n", + "[23]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975246\tvalid_0's ndcg@3: 
0.976733\tvalid_0's ndcg@4: 0.976959\tvalid_0's ndcg@5: 0.977008\n", + "[24]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.975246\tvalid_0's ndcg@3: 0.976721\tvalid_0's ndcg@4: 0.976947\tvalid_0's ndcg@5: 0.977005\n", + "[25]\tvalid_0's ndcg@1: 0.9396\tvalid_0's ndcg@2: 0.975421\tvalid_0's ndcg@3: 0.976909\tvalid_0's ndcg@4: 0.977124\tvalid_0's ndcg@5: 0.977182\n", + "[26]\tvalid_0's ndcg@1: 0.9393\tvalid_0's ndcg@2: 0.975342\tvalid_0's ndcg@3: 0.976804\tvalid_0's ndcg@4: 0.97702\tvalid_0's ndcg@5: 0.977078\n", + "[27]\tvalid_0's ndcg@1: 0.93925\tvalid_0's ndcg@2: 0.975323\tvalid_0's ndcg@3: 0.976798\tvalid_0's ndcg@4: 0.977014\tvalid_0's ndcg@5: 0.977062\n", + "[28]\tvalid_0's ndcg@1: 0.93925\tvalid_0's ndcg@2: 0.975308\tvalid_0's ndcg@3: 0.976783\tvalid_0's ndcg@4: 0.977009\tvalid_0's ndcg@5: 0.977057\n", + "[29]\tvalid_0's ndcg@1: 0.94\tvalid_0's ndcg@2: 0.975569\tvalid_0's ndcg@3: 0.977056\tvalid_0's ndcg@4: 0.977282\tvalid_0's ndcg@5: 0.977331\n", + "[30]\tvalid_0's ndcg@1: 0.940325\tvalid_0's ndcg@2: 0.975673\tvalid_0's ndcg@3: 0.977173\tvalid_0's ndcg@4: 0.977399\tvalid_0's ndcg@5: 0.977447\n", + "[31]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975731\tvalid_0's ndcg@3: 0.977243\tvalid_0's ndcg@4: 0.977469\tvalid_0's ndcg@5: 0.977518\n", + "[32]\tvalid_0's ndcg@1: 0.940625\tvalid_0's ndcg@2: 0.975831\tvalid_0's ndcg@3: 0.977306\tvalid_0's ndcg@4: 0.977521\tvalid_0's ndcg@5: 0.97757\n", + "[33]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975766\tvalid_0's ndcg@3: 0.977241\tvalid_0's ndcg@4: 0.977457\tvalid_0's ndcg@5: 0.977505\n", + "[34]\tvalid_0's ndcg@1: 0.940625\tvalid_0's ndcg@2: 0.975831\tvalid_0's ndcg@3: 0.977306\tvalid_0's ndcg@4: 0.977521\tvalid_0's ndcg@5: 0.97757\n", + "[35]\tvalid_0's ndcg@1: 0.940725\tvalid_0's ndcg@2: 0.975868\tvalid_0's ndcg@3: 0.977343\tvalid_0's ndcg@4: 0.977558\tvalid_0's ndcg@5: 0.977606\n", + "[36]\tvalid_0's ndcg@1: 0.94115\tvalid_0's ndcg@2: 0.976056\tvalid_0's ndcg@3: 0.977506\tvalid_0's ndcg@4: 
0.977722\tvalid_0's ndcg@5: 0.97777\n", + "[37]\tvalid_0's ndcg@1: 0.9414\tvalid_0's ndcg@2: 0.976133\tvalid_0's ndcg@3: 0.977595\tvalid_0's ndcg@4: 0.977811\tvalid_0's ndcg@5: 0.977859\n", + "[38]\tvalid_0's ndcg@1: 0.94175\tvalid_0's ndcg@2: 0.976278\tvalid_0's ndcg@3: 0.977715\tvalid_0's ndcg@4: 0.977941\tvalid_0's ndcg@5: 0.97799\n", + "[39]\tvalid_0's ndcg@1: 0.942075\tvalid_0's ndcg@2: 0.976366\tvalid_0's ndcg@3: 0.977841\tvalid_0's ndcg@4: 0.978056\tvalid_0's ndcg@5: 0.978105\n", + "[40]\tvalid_0's ndcg@1: 0.94215\tvalid_0's ndcg@2: 0.976409\tvalid_0's ndcg@3: 0.977872\tvalid_0's ndcg@4: 0.978087\tvalid_0's ndcg@5: 0.978136\n", + "[41]\tvalid_0's ndcg@1: 0.94245\tvalid_0's ndcg@2: 0.97652\tvalid_0's ndcg@3: 0.977983\tvalid_0's ndcg@4: 0.978198\tvalid_0's ndcg@5: 0.978246\n", + "[42]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976682\tvalid_0's ndcg@3: 0.97817\tvalid_0's ndcg@4: 0.978385\tvalid_0's ndcg@5: 0.978434\n", + "[43]\tvalid_0's ndcg@1: 0.942975\tvalid_0's ndcg@2: 0.976682\tvalid_0's ndcg@3: 0.97817\tvalid_0's ndcg@4: 0.978385\tvalid_0's ndcg@5: 0.978434\n", + "[44]\tvalid_0's ndcg@1: 0.94285\tvalid_0's ndcg@2: 0.976636\tvalid_0's ndcg@3: 0.978111\tvalid_0's ndcg@4: 0.978337\tvalid_0's ndcg@5: 0.978386\n", + "[45]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.9768\tvalid_0's ndcg@3: 0.978262\tvalid_0's ndcg@4: 0.978488\tvalid_0's ndcg@5: 0.978537\n", + "[46]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976913\tvalid_0's ndcg@3: 0.978388\tvalid_0's ndcg@4: 0.978614\tvalid_0's ndcg@5: 0.978663\n", + "[47]\tvalid_0's ndcg@1: 0.943525\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.97836\tvalid_0's ndcg@4: 0.978576\tvalid_0's ndcg@5: 0.978634\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[48]\tvalid_0's ndcg@1: 0.943525\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.978373\tvalid_0's ndcg@4: 0.978577\tvalid_0's ndcg@5: 0.978636\n", + "[49]\tvalid_0's ndcg@1: 0.9436\tvalid_0's ndcg@2: 0.976913\tvalid_0's ndcg@3: 
0.978388\tvalid_0's ndcg@4: 0.978614\tvalid_0's ndcg@5: 0.978663\n", + "[50]\tvalid_0's ndcg@1: 0.943975\tvalid_0's ndcg@2: 0.97702\tvalid_0's ndcg@3: 0.97852\tvalid_0's ndcg@4: 0.978746\tvalid_0's ndcg@5: 0.978794\n", + "[51]\tvalid_0's ndcg@1: 0.9441\tvalid_0's ndcg@2: 0.97705\tvalid_0's ndcg@3: 0.97855\tvalid_0's ndcg@4: 0.978787\tvalid_0's ndcg@5: 0.978836\n", + "[52]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.977121\tvalid_0's ndcg@3: 0.978609\tvalid_0's ndcg@4: 0.978846\tvalid_0's ndcg@5: 0.978894\n", + "[53]\tvalid_0's ndcg@1: 0.944225\tvalid_0's ndcg@2: 0.977081\tvalid_0's ndcg@3: 0.978618\tvalid_0's ndcg@4: 0.978834\tvalid_0's ndcg@5: 0.978882\n", + "[54]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.977071\tvalid_0's ndcg@3: 0.978609\tvalid_0's ndcg@4: 0.978824\tvalid_0's ndcg@5: 0.978873\n", + "[55]\tvalid_0's ndcg@1: 0.94435\tvalid_0's ndcg@2: 0.977143\tvalid_0's ndcg@3: 0.978668\tvalid_0's ndcg@4: 0.978883\tvalid_0's ndcg@5: 0.978931\n", + "[56]\tvalid_0's ndcg@1: 0.9444\tvalid_0's ndcg@2: 0.977177\tvalid_0's ndcg@3: 0.978702\tvalid_0's ndcg@4: 0.978906\tvalid_0's ndcg@5: 0.978955\n", + "[57]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.977263\tvalid_0's ndcg@3: 0.978788\tvalid_0's ndcg@4: 0.979003\tvalid_0's ndcg@5: 0.979051\n", + "[58]\tvalid_0's ndcg@1: 0.9448\tvalid_0's ndcg@2: 0.977293\tvalid_0's ndcg@3: 0.978843\tvalid_0's ndcg@4: 0.979047\tvalid_0's ndcg@5: 0.979096\n", + "[59]\tvalid_0's ndcg@1: 0.9452\tvalid_0's ndcg@2: 0.977472\tvalid_0's ndcg@3: 0.978997\tvalid_0's ndcg@4: 0.979202\tvalid_0's ndcg@5: 0.97925\n", + "[60]\tvalid_0's ndcg@1: 0.9455\tvalid_0's ndcg@2: 0.97763\tvalid_0's ndcg@3: 0.979118\tvalid_0's ndcg@4: 0.979322\tvalid_0's ndcg@5: 0.979371\n", + "[61]\tvalid_0's ndcg@1: 0.945725\tvalid_0's ndcg@2: 0.977682\tvalid_0's ndcg@3: 0.979194\tvalid_0's ndcg@4: 0.979399\tvalid_0's ndcg@5: 0.979447\n", + "[62]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977812\tvalid_0's ndcg@3: 0.979312\tvalid_0's ndcg@4: 
0.979495\tvalid_0's ndcg@5: 0.979543\n", + "[63]\tvalid_0's ndcg@1: 0.946\tvalid_0's ndcg@2: 0.977878\tvalid_0's ndcg@3: 0.97934\tvalid_0's ndcg@4: 0.979523\tvalid_0's ndcg@5: 0.979572\n", + "[64]\tvalid_0's ndcg@1: 0.946525\tvalid_0's ndcg@2: 0.978056\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 0.979714\tvalid_0's ndcg@5: 0.979762\n", + "[65]\tvalid_0's ndcg@1: 0.9467\tvalid_0's ndcg@2: 0.978105\tvalid_0's ndcg@3: 0.979592\tvalid_0's ndcg@4: 0.979775\tvalid_0's ndcg@5: 0.979823\n", + "[66]\tvalid_0's ndcg@1: 0.9465\tvalid_0's ndcg@2: 0.978046\tvalid_0's ndcg@3: 0.979534\tvalid_0's ndcg@4: 0.979706\tvalid_0's ndcg@5: 0.979755\n", + "[67]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.978127\tvalid_0's ndcg@3: 0.979614\tvalid_0's ndcg@4: 0.979776\tvalid_0's ndcg@5: 0.979824\n", + "[68]\tvalid_0's ndcg@1: 0.9467\tvalid_0's ndcg@2: 0.97812\tvalid_0's ndcg@3: 0.979608\tvalid_0's ndcg@4: 0.97978\tvalid_0's ndcg@5: 0.979828\n", + "[69]\tvalid_0's ndcg@1: 0.946875\tvalid_0's ndcg@2: 0.978216\tvalid_0's ndcg@3: 0.979679\tvalid_0's ndcg@4: 0.979851\tvalid_0's ndcg@5: 0.9799\n", + "[70]\tvalid_0's ndcg@1: 0.9469\tvalid_0's ndcg@2: 0.978194\tvalid_0's ndcg@3: 0.979682\tvalid_0's ndcg@4: 0.979854\tvalid_0's ndcg@5: 0.979902\n", + "[71]\tvalid_0's ndcg@1: 0.947025\tvalid_0's ndcg@2: 0.978209\tvalid_0's ndcg@3: 0.979721\tvalid_0's ndcg@4: 0.979893\tvalid_0's ndcg@5: 0.979942\n", + "[72]\tvalid_0's ndcg@1: 0.9472\tvalid_0's ndcg@2: 0.978273\tvalid_0's ndcg@3: 0.979773\tvalid_0's ndcg@4: 0.979956\tvalid_0's ndcg@5: 0.980005\n", + "[73]\tvalid_0's ndcg@1: 0.947475\tvalid_0's ndcg@2: 0.978391\tvalid_0's ndcg@3: 0.979878\tvalid_0's ndcg@4: 0.980061\tvalid_0's ndcg@5: 0.980109\n", + "[74]\tvalid_0's ndcg@1: 0.94715\tvalid_0's ndcg@2: 0.978271\tvalid_0's ndcg@3: 0.979758\tvalid_0's ndcg@4: 0.979941\tvalid_0's ndcg@5: 0.97999\n", + "[75]\tvalid_0's ndcg@1: 0.947275\tvalid_0's ndcg@2: 0.978333\tvalid_0's ndcg@3: 0.979808\tvalid_0's ndcg@4: 0.979991\tvalid_0's ndcg@5: 0.980039\n", + 
"[76]\tvalid_0's ndcg@1: 0.9474\tvalid_0's ndcg@2: 0.97841\tvalid_0's ndcg@3: 0.979873\tvalid_0's ndcg@4: 0.980045\tvalid_0's ndcg@5: 0.980093\n", + "[77]\tvalid_0's ndcg@1: 0.94745\tvalid_0's ndcg@2: 0.97846\tvalid_0's ndcg@3: 0.979898\tvalid_0's ndcg@4: 0.98007\tvalid_0's ndcg@5: 0.980118\n", + "[78]\tvalid_0's ndcg@1: 0.94775\tvalid_0's ndcg@2: 0.978555\tvalid_0's ndcg@3: 0.980005\tvalid_0's ndcg@4: 0.980177\tvalid_0's ndcg@5: 0.980226\n", + "[79]\tvalid_0's ndcg@1: 0.947875\tvalid_0's ndcg@2: 0.978617\tvalid_0's ndcg@3: 0.980055\tvalid_0's ndcg@4: 0.980238\tvalid_0's ndcg@5: 0.980276\n", + "[80]\tvalid_0's ndcg@1: 0.947875\tvalid_0's ndcg@2: 0.978617\tvalid_0's ndcg@3: 0.980055\tvalid_0's ndcg@4: 0.980238\tvalid_0's ndcg@5: 0.980276\n", + "[81]\tvalid_0's ndcg@1: 0.948175\tvalid_0's ndcg@2: 0.978744\tvalid_0's ndcg@3: 0.980169\tvalid_0's ndcg@4: 0.980352\tvalid_0's ndcg@5: 0.98039\n", + "[82]\tvalid_0's ndcg@1: 0.948375\tvalid_0's ndcg@2: 0.97888\tvalid_0's ndcg@3: 0.980255\tvalid_0's ndcg@4: 0.980438\tvalid_0's ndcg@5: 0.980477\n", + "[83]\tvalid_0's ndcg@1: 0.94825\tvalid_0's ndcg@2: 0.978834\tvalid_0's ndcg@3: 0.980209\tvalid_0's ndcg@4: 0.980392\tvalid_0's ndcg@5: 0.980431\n", + "[84]\tvalid_0's ndcg@1: 0.948275\tvalid_0's ndcg@2: 0.978844\tvalid_0's ndcg@3: 0.980219\tvalid_0's ndcg@4: 0.980402\tvalid_0's ndcg@5: 0.98044\n", + "[85]\tvalid_0's ndcg@1: 0.948475\tvalid_0's ndcg@2: 0.978917\tvalid_0's ndcg@3: 0.980292\tvalid_0's ndcg@4: 0.980475\tvalid_0's ndcg@5: 0.980514\n", + "[86]\tvalid_0's ndcg@1: 0.948975\tvalid_0's ndcg@2: 0.979102\tvalid_0's ndcg@3: 0.980477\tvalid_0's ndcg@4: 0.98066\tvalid_0's ndcg@5: 0.980699\n", + "[87]\tvalid_0's ndcg@1: 0.948975\tvalid_0's ndcg@2: 0.979086\tvalid_0's ndcg@3: 0.980474\tvalid_0's ndcg@4: 0.980657\tvalid_0's ndcg@5: 0.980695\n", + "[88]\tvalid_0's ndcg@1: 0.949025\tvalid_0's ndcg@2: 0.979136\tvalid_0's ndcg@3: 0.980499\tvalid_0's ndcg@4: 0.980682\tvalid_0's ndcg@5: 0.98072\n", + "[89]\tvalid_0's ndcg@1: 
0.9493\tvalid_0's ndcg@2: 0.979285\tvalid_0's ndcg@3: 0.98061\tvalid_0's ndcg@4: 0.980793\tvalid_0's ndcg@5: 0.980832\n", + "[90]\tvalid_0's ndcg@1: 0.9493\tvalid_0's ndcg@2: 0.979269\tvalid_0's ndcg@3: 0.980607\tvalid_0's ndcg@4: 0.98079\tvalid_0's ndcg@5: 0.980828\n", + "[91]\tvalid_0's ndcg@1: 0.9493\tvalid_0's ndcg@2: 0.979269\tvalid_0's ndcg@3: 0.980607\tvalid_0's ndcg@4: 0.98079\tvalid_0's ndcg@5: 0.980828\n", + "[92]\tvalid_0's ndcg@1: 0.9494\tvalid_0's ndcg@2: 0.97929\tvalid_0's ndcg@3: 0.98064\tvalid_0's ndcg@4: 0.980823\tvalid_0's ndcg@5: 0.980862\n", + "[93]\tvalid_0's ndcg@1: 0.949375\tvalid_0's ndcg@2: 0.979297\tvalid_0's ndcg@3: 0.980634\tvalid_0's ndcg@4: 0.980817\tvalid_0's ndcg@5: 0.980856\n", + "[94]\tvalid_0's ndcg@1: 0.949525\tvalid_0's ndcg@2: 0.979336\tvalid_0's ndcg@3: 0.980686\tvalid_0's ndcg@4: 0.980869\tvalid_0's ndcg@5: 0.980908\n", + "[95]\tvalid_0's ndcg@1: 0.949825\tvalid_0's ndcg@2: 0.979416\tvalid_0's ndcg@3: 0.980791\tvalid_0's ndcg@4: 0.980974\tvalid_0's ndcg@5: 0.981012\n", + "[96]\tvalid_0's ndcg@1: 0.94975\tvalid_0's ndcg@2: 0.979404\tvalid_0's ndcg@3: 0.980779\tvalid_0's ndcg@4: 0.980951\tvalid_0's ndcg@5: 0.98099\n", + "[97]\tvalid_0's ndcg@1: 0.950025\tvalid_0's ndcg@2: 0.979537\tvalid_0's ndcg@3: 0.980874\tvalid_0's ndcg@4: 0.981057\tvalid_0's ndcg@5: 0.981096\n", + "[98]\tvalid_0's ndcg@1: 0.9501\tvalid_0's ndcg@2: 0.979564\tvalid_0's ndcg@3: 0.980889\tvalid_0's ndcg@4: 0.981083\tvalid_0's ndcg@5: 0.981122\n", + "[99]\tvalid_0's ndcg@1: 0.950275\tvalid_0's ndcg@2: 0.979629\tvalid_0's ndcg@3: 0.980967\tvalid_0's ndcg@4: 0.98115\tvalid_0's ndcg@5: 0.981188\n", + "[100]\tvalid_0's ndcg@1: 0.950325\tvalid_0's ndcg@2: 0.979647\tvalid_0's ndcg@3: 0.980985\tvalid_0's ndcg@4: 0.981168\tvalid_0's ndcg@5: 0.981207\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's ndcg@1: 0.950325\tvalid_0's ndcg@2: 0.979647\tvalid_0's ndcg@3: 0.980985\tvalid_0's ndcg@4: 0.981168\tvalid_0's ndcg@5: 0.981207\n", + "[1]\tvalid_0's ndcg@1: 0.910175\tvalid_0's ndcg@2: 0.96382\tvalid_0's ndcg@3: 0.965707\tvalid_0's ndcg@4: 0.966009\tvalid_0's ndcg@5: 0.966086\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's ndcg@1: 0.91415\tvalid_0's ndcg@2: 0.965492\tvalid_0's ndcg@3: 0.967254\tvalid_0's ndcg@4: 0.967556\tvalid_0's ndcg@5: 0.967604\n", + "[3]\tvalid_0's ndcg@1: 0.916025\tvalid_0's ndcg@2: 0.966389\tvalid_0's ndcg@3: 0.967976\tvalid_0's ndcg@4: 0.968278\tvalid_0's ndcg@5: 0.968355\n", + "[4]\tvalid_0's ndcg@1: 0.919\tvalid_0's ndcg@2: 0.967392\tvalid_0's ndcg@3: 0.96903\tvalid_0's ndcg@4: 0.969364\tvalid_0's ndcg@5: 0.969431\n", + "[5]\tvalid_0's ndcg@1: 0.921125\tvalid_0's ndcg@2: 0.968192\tvalid_0's ndcg@3: 0.969855\tvalid_0's ndcg@4: 0.970156\tvalid_0's ndcg@5: 0.970224\n", + "[6]\tvalid_0's ndcg@1: 0.921675\tvalid_0's ndcg@2: 0.968411\tvalid_0's ndcg@3: 0.970111\tvalid_0's ndcg@4: 0.97037\tvalid_0's ndcg@5: 0.970437\n", + "[7]\tvalid_0's ndcg@1: 0.9237\tvalid_0's ndcg@2: 0.969332\tvalid_0's ndcg@3: 0.970882\tvalid_0's ndcg@4: 0.97113\tvalid_0's ndcg@5: 0.971217\n", + "[8]\tvalid_0's ndcg@1: 0.925775\tvalid_0's ndcg@2: 0.970129\tvalid_0's ndcg@3: 0.971642\tvalid_0's ndcg@4: 0.971922\tvalid_0's ndcg@5: 0.97199\n", + "[9]\tvalid_0's ndcg@1: 0.926775\tvalid_0's ndcg@2: 0.970435\tvalid_0's ndcg@3: 0.971985\tvalid_0's ndcg@4: 0.972276\tvalid_0's ndcg@5: 0.972334\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[10]\tvalid_0's ndcg@1: 0.9277\tvalid_0's ndcg@2: 0.970761\tvalid_0's ndcg@3: 0.972311\tvalid_0's ndcg@4: 0.972612\tvalid_0's ndcg@5: 0.97267\n", + "[11]\tvalid_0's ndcg@1: 0.928975\tvalid_0's ndcg@2: 0.97131\tvalid_0's ndcg@3: 0.972798\tvalid_0's ndcg@4: 0.973089\tvalid_0's ndcg@5: 0.973166\n", + "[12]\tvalid_0's ndcg@1: 
0.929375\tvalid_0's ndcg@2: 0.971505\tvalid_0's ndcg@3: 0.972968\tvalid_0's ndcg@4: 0.973259\tvalid_0's ndcg@5: 0.973326\n", + "[13]\tvalid_0's ndcg@1: 0.929375\tvalid_0's ndcg@2: 0.971426\tvalid_0's ndcg@3: 0.972939\tvalid_0's ndcg@4: 0.97324\tvalid_0's ndcg@5: 0.973318\n", + "[14]\tvalid_0's ndcg@1: 0.929775\tvalid_0's ndcg@2: 0.971621\tvalid_0's ndcg@3: 0.973121\tvalid_0's ndcg@4: 0.973412\tvalid_0's ndcg@5: 0.97348\n", + "[15]\tvalid_0's ndcg@1: 0.9304\tvalid_0's ndcg@2: 0.971868\tvalid_0's ndcg@3: 0.97338\tvalid_0's ndcg@4: 0.97365\tvalid_0's ndcg@5: 0.973717\n", + "[16]\tvalid_0's ndcg@1: 0.930975\tvalid_0's ndcg@2: 0.972096\tvalid_0's ndcg@3: 0.973558\tvalid_0's ndcg@4: 0.973849\tvalid_0's ndcg@5: 0.973926\n", + "[17]\tvalid_0's ndcg@1: 0.93105\tvalid_0's ndcg@2: 0.972108\tvalid_0's ndcg@3: 0.973583\tvalid_0's ndcg@4: 0.973884\tvalid_0's ndcg@5: 0.973952\n", + "[18]\tvalid_0's ndcg@1: 0.931725\tvalid_0's ndcg@2: 0.972373\tvalid_0's ndcg@3: 0.97386\tvalid_0's ndcg@4: 0.974129\tvalid_0's ndcg@5: 0.974207\n", + "[19]\tvalid_0's ndcg@1: 0.932175\tvalid_0's ndcg@2: 0.972681\tvalid_0's ndcg@3: 0.974068\tvalid_0's ndcg@4: 0.974348\tvalid_0's ndcg@5: 0.974406\n", + "[20]\tvalid_0's ndcg@1: 0.93305\tvalid_0's ndcg@2: 0.973019\tvalid_0's ndcg@3: 0.974382\tvalid_0's ndcg@4: 0.974673\tvalid_0's ndcg@5: 0.974731\n", + "[21]\tvalid_0's ndcg@1: 0.933075\tvalid_0's ndcg@2: 0.97306\tvalid_0's ndcg@3: 0.974423\tvalid_0's ndcg@4: 0.974703\tvalid_0's ndcg@5: 0.97477\n", + "[22]\tvalid_0's ndcg@1: 0.93375\tvalid_0's ndcg@2: 0.973262\tvalid_0's ndcg@3: 0.974649\tvalid_0's ndcg@4: 0.974929\tvalid_0's ndcg@5: 0.975007\n", + "[23]\tvalid_0's ndcg@1: 0.933675\tvalid_0's ndcg@2: 0.973219\tvalid_0's ndcg@3: 0.974606\tvalid_0's ndcg@4: 0.974886\tvalid_0's ndcg@5: 0.974973\n", + "[24]\tvalid_0's ndcg@1: 0.934\tvalid_0's ndcg@2: 0.97337\tvalid_0's ndcg@3: 0.974745\tvalid_0's ndcg@4: 0.975014\tvalid_0's ndcg@5: 0.975101\n", + "[25]\tvalid_0's ndcg@1: 0.934825\tvalid_0's ndcg@2: 
0.973674\tvalid_0's ndcg@3: 0.975062\tvalid_0's ndcg@4: 0.975342\tvalid_0's ndcg@5: 0.97541\n", + "[26]\tvalid_0's ndcg@1: 0.93495\tvalid_0's ndcg@2: 0.973721\tvalid_0's ndcg@3: 0.975096\tvalid_0's ndcg@4: 0.975365\tvalid_0's ndcg@5: 0.975452\n", + "[27]\tvalid_0's ndcg@1: 0.9358\tvalid_0's ndcg@2: 0.974082\tvalid_0's ndcg@3: 0.975444\tvalid_0's ndcg@4: 0.975713\tvalid_0's ndcg@5: 0.975781\n", + "[28]\tvalid_0's ndcg@1: 0.935325\tvalid_0's ndcg@2: 0.973875\tvalid_0's ndcg@3: 0.975275\tvalid_0's ndcg@4: 0.975512\tvalid_0's ndcg@5: 0.975599\n", + "[29]\tvalid_0's ndcg@1: 0.935925\tvalid_0's ndcg@2: 0.974159\tvalid_0's ndcg@3: 0.975522\tvalid_0's ndcg@4: 0.975759\tvalid_0's ndcg@5: 0.975836\n", + "[30]\tvalid_0's ndcg@1: 0.9362\tvalid_0's ndcg@2: 0.974214\tvalid_0's ndcg@3: 0.975589\tvalid_0's ndcg@4: 0.975847\tvalid_0's ndcg@5: 0.975924\n", + "[31]\tvalid_0's ndcg@1: 0.93625\tvalid_0's ndcg@2: 0.974216\tvalid_0's ndcg@3: 0.975629\tvalid_0's ndcg@4: 0.975876\tvalid_0's ndcg@5: 0.975944\n", + "[32]\tvalid_0's ndcg@1: 0.93665\tvalid_0's ndcg@2: 0.974427\tvalid_0's ndcg@3: 0.975814\tvalid_0's ndcg@4: 0.97603\tvalid_0's ndcg@5: 0.976107\n", + "[33]\tvalid_0's ndcg@1: 0.936775\tvalid_0's ndcg@2: 0.974505\tvalid_0's ndcg@3: 0.975855\tvalid_0's ndcg@4: 0.976081\tvalid_0's ndcg@5: 0.976158\n", + "[34]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.974643\tvalid_0's ndcg@3: 0.975993\tvalid_0's ndcg@4: 0.976219\tvalid_0's ndcg@5: 0.976296\n", + "[35]\tvalid_0's ndcg@1: 0.937675\tvalid_0's ndcg@2: 0.974805\tvalid_0's ndcg@3: 0.97618\tvalid_0's ndcg@4: 0.976406\tvalid_0's ndcg@5: 0.976484\n", + "[36]\tvalid_0's ndcg@1: 0.9382\tvalid_0's ndcg@2: 0.974983\tvalid_0's ndcg@3: 0.976371\tvalid_0's ndcg@4: 0.976597\tvalid_0's ndcg@5: 0.976674\n", + "[37]\tvalid_0's ndcg@1: 0.938175\tvalid_0's ndcg@2: 0.974974\tvalid_0's ndcg@3: 0.976349\tvalid_0's ndcg@4: 0.976586\tvalid_0's ndcg@5: 0.976663\n", + "[38]\tvalid_0's ndcg@1: 0.938675\tvalid_0's ndcg@2: 0.975143\tvalid_0's ndcg@3: 
0.976518\tvalid_0's ndcg@4: 0.976776\tvalid_0's ndcg@5: 0.976844\n", + "[39]\tvalid_0's ndcg@1: 0.938575\tvalid_0's ndcg@2: 0.975106\tvalid_0's ndcg@3: 0.976481\tvalid_0's ndcg@4: 0.976739\tvalid_0's ndcg@5: 0.976807\n", + "[40]\tvalid_0's ndcg@1: 0.938675\tvalid_0's ndcg@2: 0.97519\tvalid_0's ndcg@3: 0.976528\tvalid_0's ndcg@4: 0.976775\tvalid_0's ndcg@5: 0.976853\n", + "[41]\tvalid_0's ndcg@1: 0.9391\tvalid_0's ndcg@2: 0.975347\tvalid_0's ndcg@3: 0.976697\tvalid_0's ndcg@4: 0.976934\tvalid_0's ndcg@5: 0.977001\n", + "[42]\tvalid_0's ndcg@1: 0.939825\tvalid_0's ndcg@2: 0.975599\tvalid_0's ndcg@3: 0.976961\tvalid_0's ndcg@4: 0.977198\tvalid_0's ndcg@5: 0.977266\n", + "[43]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.975639\tvalid_0's ndcg@3: 0.976977\tvalid_0's ndcg@4: 0.977214\tvalid_0's ndcg@5: 0.977282\n", + "[44]\tvalid_0's ndcg@1: 0.9398\tvalid_0's ndcg@2: 0.975605\tvalid_0's ndcg@3: 0.976955\tvalid_0's ndcg@4: 0.977192\tvalid_0's ndcg@5: 0.97726\n", + "[45]\tvalid_0's ndcg@1: 0.9401\tvalid_0's ndcg@2: 0.9757\tvalid_0's ndcg@3: 0.977075\tvalid_0's ndcg@4: 0.977291\tvalid_0's ndcg@5: 0.977368\n", + "[46]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975845\tvalid_0's ndcg@3: 0.977183\tvalid_0's ndcg@4: 0.97742\tvalid_0's ndcg@5: 0.977497\n", + "[47]\tvalid_0's ndcg@1: 0.940475\tvalid_0's ndcg@2: 0.975854\tvalid_0's ndcg@3: 0.977204\tvalid_0's ndcg@4: 0.97743\tvalid_0's ndcg@5: 0.977508\n", + "[48]\tvalid_0's ndcg@1: 0.940575\tvalid_0's ndcg@2: 0.975923\tvalid_0's ndcg@3: 0.977273\tvalid_0's ndcg@4: 0.977488\tvalid_0's ndcg@5: 0.977556\n", + "[49]\tvalid_0's ndcg@1: 0.9407\tvalid_0's ndcg@2: 0.975922\tvalid_0's ndcg@3: 0.977297\tvalid_0's ndcg@4: 0.977501\tvalid_0's ndcg@5: 0.977588\n", + "[50]\tvalid_0's ndcg@1: 0.940725\tvalid_0's ndcg@2: 0.975947\tvalid_0's ndcg@3: 0.977322\tvalid_0's ndcg@4: 0.977505\tvalid_0's ndcg@5: 0.977592\n", + "[51]\tvalid_0's ndcg@1: 0.9406\tvalid_0's ndcg@2: 0.975837\tvalid_0's ndcg@3: 0.97725\tvalid_0's ndcg@4: 
0.977422\tvalid_0's ndcg@5: 0.977509\n", + "[52]\tvalid_0's ndcg@1: 0.941075\tvalid_0's ndcg@2: 0.975997\tvalid_0's ndcg@3: 0.977422\tvalid_0's ndcg@4: 0.977594\tvalid_0's ndcg@5: 0.977691\n", + "[53]\tvalid_0's ndcg@1: 0.940925\tvalid_0's ndcg@2: 0.975989\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.977538\tvalid_0's ndcg@5: 0.977644\n", + "[54]\tvalid_0's ndcg@1: 0.94125\tvalid_0's ndcg@2: 0.976062\tvalid_0's ndcg@3: 0.977487\tvalid_0's ndcg@4: 0.977659\tvalid_0's ndcg@5: 0.977756\n", + "[55]\tvalid_0's ndcg@1: 0.94145\tvalid_0's ndcg@2: 0.976183\tvalid_0's ndcg@3: 0.97757\tvalid_0's ndcg@4: 0.977742\tvalid_0's ndcg@5: 0.977839\n", + "[56]\tvalid_0's ndcg@1: 0.941475\tvalid_0's ndcg@2: 0.976176\tvalid_0's ndcg@3: 0.977576\tvalid_0's ndcg@4: 0.977748\tvalid_0's ndcg@5: 0.977845\n", + "[57]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.976139\tvalid_0's ndcg@3: 0.977539\tvalid_0's ndcg@4: 0.977712\tvalid_0's ndcg@5: 0.977808\n", + "[58]\tvalid_0's ndcg@1: 0.941675\tvalid_0's ndcg@2: 0.97625\tvalid_0's ndcg@3: 0.97765\tvalid_0's ndcg@4: 0.977822\tvalid_0's ndcg@5: 0.977919\n", + "[59]\tvalid_0's ndcg@1: 0.941725\tvalid_0's ndcg@2: 0.976253\tvalid_0's ndcg@3: 0.977653\tvalid_0's ndcg@4: 0.977836\tvalid_0's ndcg@5: 0.977932\n", + "[60]\tvalid_0's ndcg@1: 0.941675\tvalid_0's ndcg@2: 0.976234\tvalid_0's ndcg@3: 0.977634\tvalid_0's ndcg@4: 0.977817\tvalid_0's ndcg@5: 0.977914\n", + "[61]\tvalid_0's ndcg@1: 0.9419\tvalid_0's ndcg@2: 0.976333\tvalid_0's ndcg@3: 0.977745\tvalid_0's ndcg@4: 0.977918\tvalid_0's ndcg@5: 0.978005\n", + "[62]\tvalid_0's ndcg@1: 0.941975\tvalid_0's ndcg@2: 0.976345\tvalid_0's ndcg@3: 0.977757\tvalid_0's ndcg@4: 0.97794\tvalid_0's ndcg@5: 0.978027\n", + "[63]\tvalid_0's ndcg@1: 0.9423\tvalid_0's ndcg@2: 0.976496\tvalid_0's ndcg@3: 0.977871\tvalid_0's ndcg@4: 0.978065\tvalid_0's ndcg@5: 0.978152\n", + "[64]\tvalid_0's ndcg@1: 0.942625\tvalid_0's ndcg@2: 0.976632\tvalid_0's ndcg@3: 0.977995\tvalid_0's ndcg@4: 0.978188\tvalid_0's ndcg@5: 
0.978275\n", + "[65]\tvalid_0's ndcg@1: 0.942575\tvalid_0's ndcg@2: 0.976629\tvalid_0's ndcg@3: 0.977979\tvalid_0's ndcg@4: 0.978173\tvalid_0's ndcg@5: 0.97826\n", + "[66]\tvalid_0's ndcg@1: 0.942725\tvalid_0's ndcg@2: 0.976685\tvalid_0's ndcg@3: 0.978035\tvalid_0's ndcg@4: 0.978229\tvalid_0's ndcg@5: 0.978316\n", + "[67]\tvalid_0's ndcg@1: 0.94275\tvalid_0's ndcg@2: 0.976678\tvalid_0's ndcg@3: 0.978041\tvalid_0's ndcg@4: 0.978224\tvalid_0's ndcg@5: 0.97832\n", + "[68]\tvalid_0's ndcg@1: 0.94275\tvalid_0's ndcg@2: 0.976694\tvalid_0's ndcg@3: 0.978044\tvalid_0's ndcg@4: 0.978227\tvalid_0's ndcg@5: 0.978324\n", + "[69]\tvalid_0's ndcg@1: 0.943\tvalid_0's ndcg@2: 0.976834\tvalid_0's ndcg@3: 0.978146\tvalid_0's ndcg@4: 0.978329\tvalid_0's ndcg@5: 0.978426\n", + "[70]\tvalid_0's ndcg@1: 0.943025\tvalid_0's ndcg@2: 0.976827\tvalid_0's ndcg@3: 0.978152\tvalid_0's ndcg@4: 0.978324\tvalid_0's ndcg@5: 0.978431\n", + "[71]\tvalid_0's ndcg@1: 0.9432\tvalid_0's ndcg@2: 0.976923\tvalid_0's ndcg@3: 0.978236\tvalid_0's ndcg@4: 0.978397\tvalid_0's ndcg@5: 0.978504\n", + "[72]\tvalid_0's ndcg@1: 0.943225\tvalid_0's ndcg@2: 0.976917\tvalid_0's ndcg@3: 0.978254\tvalid_0's ndcg@4: 0.978405\tvalid_0's ndcg@5: 0.978511\n", + "[73]\tvalid_0's ndcg@1: 0.94315\tvalid_0's ndcg@2: 0.976936\tvalid_0's ndcg@3: 0.978236\tvalid_0's ndcg@4: 0.978409\tvalid_0's ndcg@5: 0.978496\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[74]\tvalid_0's ndcg@1: 0.94325\tvalid_0's ndcg@2: 0.976957\tvalid_0's ndcg@3: 0.97827\tvalid_0's ndcg@4: 0.978431\tvalid_0's ndcg@5: 0.978528\n", + "[75]\tvalid_0's ndcg@1: 0.943075\tvalid_0's ndcg@2: 0.976861\tvalid_0's ndcg@3: 0.978199\tvalid_0's ndcg@4: 0.97836\tvalid_0's ndcg@5: 0.978457\n", + "[76]\tvalid_0's ndcg@1: 0.94335\tvalid_0's ndcg@2: 0.976963\tvalid_0's ndcg@3: 0.978288\tvalid_0's ndcg@4: 0.978471\tvalid_0's ndcg@5: 0.978568\n", + "[77]\tvalid_0's ndcg@1: 0.94345\tvalid_0's ndcg@2: 0.977031\tvalid_0's ndcg@3: 0.978331\tvalid_0's 
ndcg@4: 0.978514\tvalid_0's ndcg@5: 0.978611\n", + "[78]\tvalid_0's ndcg@1: 0.943475\tvalid_0's ndcg@2: 0.977088\tvalid_0's ndcg@3: 0.97835\tvalid_0's ndcg@4: 0.978533\tvalid_0's ndcg@5: 0.97863\n", + "[79]\tvalid_0's ndcg@1: 0.943625\tvalid_0's ndcg@2: 0.977096\tvalid_0's ndcg@3: 0.978396\tvalid_0's ndcg@4: 0.978579\tvalid_0's ndcg@5: 0.978676\n", + "[80]\tvalid_0's ndcg@1: 0.943825\tvalid_0's ndcg@2: 0.977154\tvalid_0's ndcg@3: 0.978479\tvalid_0's ndcg@4: 0.978651\tvalid_0's ndcg@5: 0.978748\n", + "[81]\tvalid_0's ndcg@1: 0.943775\tvalid_0's ndcg@2: 0.977135\tvalid_0's ndcg@3: 0.97846\tvalid_0's ndcg@4: 0.978633\tvalid_0's ndcg@5: 0.978729\n", + "[82]\tvalid_0's ndcg@1: 0.9443\tvalid_0's ndcg@2: 0.977361\tvalid_0's ndcg@3: 0.978673\tvalid_0's ndcg@4: 0.978845\tvalid_0's ndcg@5: 0.978933\n", + "[83]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.977324\tvalid_0's ndcg@3: 0.978624\tvalid_0's ndcg@4: 0.978796\tvalid_0's ndcg@5: 0.978893\n", + "[84]\tvalid_0's ndcg@1: 0.94405\tvalid_0's ndcg@2: 0.977253\tvalid_0's ndcg@3: 0.978565\tvalid_0's ndcg@4: 0.978737\tvalid_0's ndcg@5: 0.978834\n", + "[85]\tvalid_0's ndcg@1: 0.944175\tvalid_0's ndcg@2: 0.977283\tvalid_0's ndcg@3: 0.978633\tvalid_0's ndcg@4: 0.978795\tvalid_0's ndcg@5: 0.978882\n", + "[86]\tvalid_0's ndcg@1: 0.9445\tvalid_0's ndcg@2: 0.97745\tvalid_0's ndcg@3: 0.978763\tvalid_0's ndcg@4: 0.978924\tvalid_0's ndcg@5: 0.979011\n", + "[87]\tvalid_0's ndcg@1: 0.9445\tvalid_0's ndcg@2: 0.977419\tvalid_0's ndcg@3: 0.978756\tvalid_0's ndcg@4: 0.978918\tvalid_0's ndcg@5: 0.979005\n", + "[88]\tvalid_0's ndcg@1: 0.944825\tvalid_0's ndcg@2: 0.977554\tvalid_0's ndcg@3: 0.978867\tvalid_0's ndcg@4: 0.979039\tvalid_0's ndcg@5: 0.979126\n", + "[89]\tvalid_0's ndcg@1: 0.9454\tvalid_0's ndcg@2: 0.977767\tvalid_0's ndcg@3: 0.979079\tvalid_0's ndcg@4: 0.979262\tvalid_0's ndcg@5: 0.97934\n", + "[90]\tvalid_0's ndcg@1: 0.945375\tvalid_0's ndcg@2: 0.977773\tvalid_0's ndcg@3: 0.979073\tvalid_0's ndcg@4: 0.979256\tvalid_0's ndcg@5: 
0.979334\n", + "[91]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977792\tvalid_0's ndcg@3: 0.979092\tvalid_0's ndcg@4: 0.979275\tvalid_0's ndcg@5: 0.979352\n", + "[92]\tvalid_0's ndcg@1: 0.945425\tvalid_0's ndcg@2: 0.977776\tvalid_0's ndcg@3: 0.979088\tvalid_0's ndcg@4: 0.979261\tvalid_0's ndcg@5: 0.979348\n", + "[93]\tvalid_0's ndcg@1: 0.945375\tvalid_0's ndcg@2: 0.977757\tvalid_0's ndcg@3: 0.979082\tvalid_0's ndcg@4: 0.979244\tvalid_0's ndcg@5: 0.979331\n", + "[94]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977761\tvalid_0's ndcg@3: 0.979061\tvalid_0's ndcg@4: 0.979223\tvalid_0's ndcg@5: 0.97931\n", + "[95]\tvalid_0's ndcg@1: 0.9454\tvalid_0's ndcg@2: 0.977798\tvalid_0's ndcg@3: 0.979086\tvalid_0's ndcg@4: 0.979258\tvalid_0's ndcg@5: 0.979345\n", + "[96]\tvalid_0's ndcg@1: 0.945825\tvalid_0's ndcg@2: 0.977955\tvalid_0's ndcg@3: 0.97923\tvalid_0's ndcg@4: 0.979413\tvalid_0's ndcg@5: 0.9795\n", + "[97]\tvalid_0's ndcg@1: 0.945925\tvalid_0's ndcg@2: 0.97796\tvalid_0's ndcg@3: 0.97926\tvalid_0's ndcg@4: 0.979443\tvalid_0's ndcg@5: 0.979531\n", + "[98]\tvalid_0's ndcg@1: 0.9464\tvalid_0's ndcg@2: 0.97812\tvalid_0's ndcg@3: 0.97942\tvalid_0's ndcg@4: 0.979625\tvalid_0's ndcg@5: 0.979702\n", + "[99]\tvalid_0's ndcg@1: 0.94655\tvalid_0's ndcg@2: 0.978191\tvalid_0's ndcg@3: 0.979479\tvalid_0's ndcg@4: 0.979683\tvalid_0's ndcg@5: 0.97977\n", + "[100]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.978244\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 0.979725\tvalid_0's ndcg@5: 0.979812\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.978244\tvalid_0's ndcg@3: 0.979531\tvalid_0's ndcg@4: 0.979725\tvalid_0's ndcg@5: 0.979812\n", + "[1]\tvalid_0's ndcg@1: 0.910175\tvalid_0's ndcg@2: 0.963031\tvalid_0's ndcg@3: 0.965281\tvalid_0's ndcg@4: 0.965819\tvalid_0's ndcg@5: 0.965887\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's ndcg@1: 0.9141\tvalid_0's ndcg@2: 0.964748\tvalid_0's ndcg@3: 0.96681\tvalid_0's ndcg@4: 0.967316\tvalid_0's ndcg@5: 0.967394\n", + "[3]\tvalid_0's ndcg@1: 0.915925\tvalid_0's ndcg@2: 0.9655\tvalid_0's ndcg@3: 0.967575\tvalid_0's ndcg@4: 0.968028\tvalid_0's ndcg@5: 0.968105\n", + "[4]\tvalid_0's ndcg@1: 0.91915\tvalid_0's ndcg@2: 0.966943\tvalid_0's ndcg@3: 0.968968\tvalid_0's ndcg@4: 0.969334\tvalid_0's ndcg@5: 0.969373\n", + "[5]\tvalid_0's ndcg@1: 0.920625\tvalid_0's ndcg@2: 0.967598\tvalid_0's ndcg@3: 0.969498\tvalid_0's ndcg@4: 0.969896\tvalid_0's ndcg@5: 0.969944\n", + "[6]\tvalid_0's ndcg@1: 0.922625\tvalid_0's ndcg@2: 0.968336\tvalid_0's ndcg@3: 0.970261\tvalid_0's ndcg@4: 0.970659\tvalid_0's ndcg@5: 0.970688\n", + "[7]\tvalid_0's ndcg@1: 0.923625\tvalid_0's ndcg@2: 0.968768\tvalid_0's ndcg@3: 0.970656\tvalid_0's ndcg@4: 0.971043\tvalid_0's ndcg@5: 0.971072\n", + "[8]\tvalid_0's ndcg@1: 0.925825\tvalid_0's ndcg@2: 0.969612\tvalid_0's ndcg@3: 0.971462\tvalid_0's ndcg@4: 0.97186\tvalid_0's ndcg@5: 0.971879\n", + "[9]\tvalid_0's ndcg@1: 0.926475\tvalid_0's ndcg@2: 0.969899\tvalid_0's ndcg@3: 0.971711\tvalid_0's ndcg@4: 0.97211\tvalid_0's ndcg@5: 0.972129\n", + "[10]\tvalid_0's ndcg@1: 0.927775\tvalid_0's ndcg@2: 0.97041\tvalid_0's ndcg@3: 0.972185\tvalid_0's ndcg@4: 0.972594\tvalid_0's ndcg@5: 0.972614\n", + "[11]\tvalid_0's ndcg@1: 0.92885\tvalid_0's ndcg@2: 0.970838\tvalid_0's ndcg@3: 0.972588\tvalid_0's ndcg@4: 0.973008\tvalid_0's ndcg@5: 0.973028\n", + "[12]\tvalid_0's ndcg@1: 0.930325\tvalid_0's ndcg@2: 0.971367\tvalid_0's ndcg@3: 0.973129\tvalid_0's 
ndcg@4: 0.973549\tvalid_0's ndcg@5: 0.973569\n", + "[13]\tvalid_0's ndcg@1: 0.931125\tvalid_0's ndcg@2: 0.971631\tvalid_0's ndcg@3: 0.973443\tvalid_0's ndcg@4: 0.973842\tvalid_0's ndcg@5: 0.973871\n", + "[14]\tvalid_0's ndcg@1: 0.931525\tvalid_0's ndcg@2: 0.971778\tvalid_0's ndcg@3: 0.973616\tvalid_0's ndcg@4: 0.973993\tvalid_0's ndcg@5: 0.974022\n", + "[15]\tvalid_0's ndcg@1: 0.9311\tvalid_0's ndcg@2: 0.9717\tvalid_0's ndcg@3: 0.973475\tvalid_0's ndcg@4: 0.973852\tvalid_0's ndcg@5: 0.973872\n", + "[16]\tvalid_0's ndcg@1: 0.931775\tvalid_0's ndcg@2: 0.971902\tvalid_0's ndcg@3: 0.973702\tvalid_0's ndcg@4: 0.97409\tvalid_0's ndcg@5: 0.974109\n", + "[17]\tvalid_0's ndcg@1: 0.931425\tvalid_0's ndcg@2: 0.971805\tvalid_0's ndcg@3: 0.97358\tvalid_0's ndcg@4: 0.973967\tvalid_0's ndcg@5: 0.973986\n", + "[18]\tvalid_0's ndcg@1: 0.931575\tvalid_0's ndcg@2: 0.971876\tvalid_0's ndcg@3: 0.973651\tvalid_0's ndcg@4: 0.974027\tvalid_0's ndcg@5: 0.974047\n", + "[19]\tvalid_0's ndcg@1: 0.932\tvalid_0's ndcg@2: 0.97208\tvalid_0's ndcg@3: 0.973805\tvalid_0's ndcg@4: 0.974192\tvalid_0's ndcg@5: 0.974212\n", + "[20]\tvalid_0's ndcg@1: 0.932075\tvalid_0's ndcg@2: 0.972092\tvalid_0's ndcg@3: 0.973829\tvalid_0's ndcg@4: 0.974217\tvalid_0's ndcg@5: 0.974236\n", + "[21]\tvalid_0's ndcg@1: 0.932675\tvalid_0's ndcg@2: 0.972282\tvalid_0's ndcg@3: 0.974057\tvalid_0's ndcg@4: 0.974444\tvalid_0's ndcg@5: 0.974454\n", + "[22]\tvalid_0's ndcg@1: 0.932925\tvalid_0's ndcg@2: 0.972358\tvalid_0's ndcg@3: 0.974146\tvalid_0's ndcg@4: 0.974533\tvalid_0's ndcg@5: 0.974543\n", + "[23]\tvalid_0's ndcg@1: 0.93325\tvalid_0's ndcg@2: 0.972478\tvalid_0's ndcg@3: 0.974253\tvalid_0's ndcg@4: 0.974651\tvalid_0's ndcg@5: 0.974661\n", + "[24]\tvalid_0's ndcg@1: 0.9335\tvalid_0's ndcg@2: 0.972539\tvalid_0's ndcg@3: 0.974351\tvalid_0's ndcg@4: 0.974739\tvalid_0's ndcg@5: 0.974749\n", + "[25]\tvalid_0's ndcg@1: 0.93475\tvalid_0's ndcg@2: 0.973\tvalid_0's ndcg@3: 0.974788\tvalid_0's ndcg@4: 0.975197\tvalid_0's ndcg@5: 
0.975206\n", + "[26]\tvalid_0's ndcg@1: 0.935075\tvalid_0's ndcg@2: 0.97312\tvalid_0's ndcg@3: 0.974895\tvalid_0's ndcg@4: 0.975315\tvalid_0's ndcg@5: 0.975325\n", + "[27]\tvalid_0's ndcg@1: 0.9349\tvalid_0's ndcg@2: 0.973103\tvalid_0's ndcg@3: 0.974865\tvalid_0's ndcg@4: 0.975264\tvalid_0's ndcg@5: 0.975273\n", + "[28]\tvalid_0's ndcg@1: 0.935075\tvalid_0's ndcg@2: 0.973152\tvalid_0's ndcg@3: 0.974939\tvalid_0's ndcg@4: 0.975327\tvalid_0's ndcg@5: 0.975336\n", + "[29]\tvalid_0's ndcg@1: 0.935475\tvalid_0's ndcg@2: 0.973315\tvalid_0's ndcg@3: 0.975128\tvalid_0's ndcg@4: 0.975483\tvalid_0's ndcg@5: 0.975492\n", + "[30]\tvalid_0's ndcg@1: 0.93595\tvalid_0's ndcg@2: 0.973522\tvalid_0's ndcg@3: 0.975297\tvalid_0's ndcg@4: 0.975663\tvalid_0's ndcg@5: 0.975673\n", + "[31]\tvalid_0's ndcg@1: 0.93595\tvalid_0's ndcg@2: 0.973506\tvalid_0's ndcg@3: 0.975281\tvalid_0's ndcg@4: 0.975658\tvalid_0's ndcg@5: 0.975668\n", + "[32]\tvalid_0's ndcg@1: 0.93675\tvalid_0's ndcg@2: 0.973833\tvalid_0's ndcg@3: 0.975595\tvalid_0's ndcg@4: 0.975961\tvalid_0's ndcg@5: 0.975971\n", + "[33]\tvalid_0's ndcg@1: 0.936475\tvalid_0's ndcg@2: 0.973763\tvalid_0's ndcg@3: 0.975488\tvalid_0's ndcg@4: 0.975865\tvalid_0's ndcg@5: 0.975874\n", + "[34]\tvalid_0's ndcg@1: 0.9367\tvalid_0's ndcg@2: 0.973893\tvalid_0's ndcg@3: 0.975568\tvalid_0's ndcg@4: 0.975956\tvalid_0's ndcg@5: 0.975966\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[35]\tvalid_0's ndcg@1: 0.93715\tvalid_0's ndcg@2: 0.974059\tvalid_0's ndcg@3: 0.975722\tvalid_0's ndcg@4: 0.97612\tvalid_0's ndcg@5: 0.97613\n", + "[36]\tvalid_0's ndcg@1: 0.9374\tvalid_0's ndcg@2: 0.974183\tvalid_0's ndcg@3: 0.975846\tvalid_0's ndcg@4: 0.976223\tvalid_0's ndcg@5: 0.976232\n", + "[37]\tvalid_0's ndcg@1: 0.9374\tvalid_0's ndcg@2: 0.974183\tvalid_0's ndcg@3: 0.975846\tvalid_0's ndcg@4: 0.976223\tvalid_0's ndcg@5: 0.976232\n", + "[38]\tvalid_0's ndcg@1: 0.938725\tvalid_0's ndcg@2: 0.974672\tvalid_0's ndcg@3: 0.97636\tvalid_0's 
ndcg@4: 0.976715\tvalid_0's ndcg@5: 0.976725\n", + "[39]\tvalid_0's ndcg@1: 0.93865\tvalid_0's ndcg@2: 0.974676\tvalid_0's ndcg@3: 0.976364\tvalid_0's ndcg@4: 0.976697\tvalid_0's ndcg@5: 0.976707\n", + "[40]\tvalid_0's ndcg@1: 0.939125\tvalid_0's ndcg@2: 0.974867\tvalid_0's ndcg@3: 0.97653\tvalid_0's ndcg@4: 0.976874\tvalid_0's ndcg@5: 0.976884\n", + "[41]\tvalid_0's ndcg@1: 0.9396\tvalid_0's ndcg@2: 0.975042\tvalid_0's ndcg@3: 0.976705\tvalid_0's ndcg@4: 0.97705\tvalid_0's ndcg@5: 0.977059\n", + "[42]\tvalid_0's ndcg@1: 0.93985\tvalid_0's ndcg@2: 0.975072\tvalid_0's ndcg@3: 0.976784\tvalid_0's ndcg@4: 0.977129\tvalid_0's ndcg@5: 0.977138\n", + "[43]\tvalid_0's ndcg@1: 0.940075\tvalid_0's ndcg@2: 0.97517\tvalid_0's ndcg@3: 0.97687\tvalid_0's ndcg@4: 0.977215\tvalid_0's ndcg@5: 0.977225\n", + "[44]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.97534\tvalid_0's ndcg@3: 0.977015\tvalid_0's ndcg@4: 0.97736\tvalid_0's ndcg@5: 0.97737\n", + "[45]\tvalid_0's ndcg@1: 0.94055\tvalid_0's ndcg@2: 0.975409\tvalid_0's ndcg@3: 0.977059\tvalid_0's ndcg@4: 0.977403\tvalid_0's ndcg@5: 0.977413\n", + "[46]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975415\tvalid_0's ndcg@3: 0.97704\tvalid_0's ndcg@4: 0.977396\tvalid_0's ndcg@5: 0.977405\n", + "[47]\tvalid_0's ndcg@1: 0.940425\tvalid_0's ndcg@2: 0.975363\tvalid_0's ndcg@3: 0.977013\tvalid_0's ndcg@4: 0.977357\tvalid_0's ndcg@5: 0.977367\n", + "[48]\tvalid_0's ndcg@1: 0.94045\tvalid_0's ndcg@2: 0.975388\tvalid_0's ndcg@3: 0.977025\tvalid_0's ndcg@4: 0.97737\tvalid_0's ndcg@5: 0.977379\n", + "[49]\tvalid_0's ndcg@1: 0.940525\tvalid_0's ndcg@2: 0.975447\tvalid_0's ndcg@3: 0.977097\tvalid_0's ndcg@4: 0.977409\tvalid_0's ndcg@5: 0.977419\n", + "[50]\tvalid_0's ndcg@1: 0.941075\tvalid_0's ndcg@2: 0.975666\tvalid_0's ndcg@3: 0.977303\tvalid_0's ndcg@4: 0.977615\tvalid_0's ndcg@5: 0.977625\n", + "[51]\tvalid_0's ndcg@1: 0.94135\tvalid_0's ndcg@2: 0.975751\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.97771\tvalid_0's ndcg@5: 
0.97772\n", + "[52]\tvalid_0's ndcg@1: 0.9413\tvalid_0's ndcg@2: 0.975717\tvalid_0's ndcg@3: 0.977355\tvalid_0's ndcg@4: 0.977688\tvalid_0's ndcg@5: 0.977698\n", + "[53]\tvalid_0's ndcg@1: 0.941375\tvalid_0's ndcg@2: 0.975713\tvalid_0's ndcg@3: 0.977376\tvalid_0's ndcg@4: 0.977699\tvalid_0's ndcg@5: 0.977718\n", + "[54]\tvalid_0's ndcg@1: 0.94185\tvalid_0's ndcg@2: 0.975857\tvalid_0's ndcg@3: 0.977557\tvalid_0's ndcg@4: 0.977869\tvalid_0's ndcg@5: 0.977889\n", + "[55]\tvalid_0's ndcg@1: 0.941925\tvalid_0's ndcg@2: 0.975837\tvalid_0's ndcg@3: 0.9776\tvalid_0's ndcg@4: 0.977891\tvalid_0's ndcg@5: 0.97791\n", + "[56]\tvalid_0's ndcg@1: 0.942325\tvalid_0's ndcg@2: 0.975969\tvalid_0's ndcg@3: 0.977719\tvalid_0's ndcg@4: 0.978032\tvalid_0's ndcg@5: 0.978051\n", + "[57]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976022\tvalid_0's ndcg@3: 0.977772\tvalid_0's ndcg@4: 0.978073\tvalid_0's ndcg@5: 0.978093\n", + "[58]\tvalid_0's ndcg@1: 0.9425\tvalid_0's ndcg@2: 0.976081\tvalid_0's ndcg@3: 0.977806\tvalid_0's ndcg@4: 0.978108\tvalid_0's ndcg@5: 0.978127\n", + "[59]\tvalid_0's ndcg@1: 0.9424\tvalid_0's ndcg@2: 0.976076\tvalid_0's ndcg@3: 0.977788\tvalid_0's ndcg@4: 0.978079\tvalid_0's ndcg@5: 0.978098\n", + "[60]\tvalid_0's ndcg@1: 0.942375\tvalid_0's ndcg@2: 0.976067\tvalid_0's ndcg@3: 0.977779\tvalid_0's ndcg@4: 0.97807\tvalid_0's ndcg@5: 0.978089\n", + "[61]\tvalid_0's ndcg@1: 0.942225\tvalid_0's ndcg@2: 0.976043\tvalid_0's ndcg@3: 0.97773\tvalid_0's ndcg@4: 0.978021\tvalid_0's ndcg@5: 0.97804\n", + "[62]\tvalid_0's ndcg@1: 0.942425\tvalid_0's ndcg@2: 0.976117\tvalid_0's ndcg@3: 0.977792\tvalid_0's ndcg@4: 0.978093\tvalid_0's ndcg@5: 0.978112\n", + "[63]\tvalid_0's ndcg@1: 0.942675\tvalid_0's ndcg@2: 0.976193\tvalid_0's ndcg@3: 0.977881\tvalid_0's ndcg@4: 0.978182\tvalid_0's ndcg@5: 0.978201\n", + "[64]\tvalid_0's ndcg@1: 0.942925\tvalid_0's ndcg@2: 0.976254\tvalid_0's ndcg@3: 0.977966\tvalid_0's ndcg@4: 0.978268\tvalid_0's ndcg@5: 0.978287\n", + "[65]\tvalid_0's 
ndcg@1: 0.9431\tvalid_0's ndcg@2: 0.97635\tvalid_0's ndcg@3: 0.978025\tvalid_0's ndcg@4: 0.978337\tvalid_0's ndcg@5: 0.978357\n", + "[66]\tvalid_0's ndcg@1: 0.9434\tvalid_0's ndcg@2: 0.976445\tvalid_0's ndcg@3: 0.978132\tvalid_0's ndcg@4: 0.978445\tvalid_0's ndcg@5: 0.978464\n", + "[67]\tvalid_0's ndcg@1: 0.943275\tvalid_0's ndcg@2: 0.976399\tvalid_0's ndcg@3: 0.978074\tvalid_0's ndcg@4: 0.978397\tvalid_0's ndcg@5: 0.978416\n", + "[68]\tvalid_0's ndcg@1: 0.943325\tvalid_0's ndcg@2: 0.976401\tvalid_0's ndcg@3: 0.978089\tvalid_0's ndcg@4: 0.978412\tvalid_0's ndcg@5: 0.978431\n", + "[69]\tvalid_0's ndcg@1: 0.943675\tvalid_0's ndcg@2: 0.976578\tvalid_0's ndcg@3: 0.97819\tvalid_0's ndcg@4: 0.978546\tvalid_0's ndcg@5: 0.978565\n", + "[70]\tvalid_0's ndcg@1: 0.944025\tvalid_0's ndcg@2: 0.976707\tvalid_0's ndcg@3: 0.97832\tvalid_0's ndcg@4: 0.978675\tvalid_0's ndcg@5: 0.978694\n", + "[71]\tvalid_0's ndcg@1: 0.9442\tvalid_0's ndcg@2: 0.976772\tvalid_0's ndcg@3: 0.978384\tvalid_0's ndcg@4: 0.97874\tvalid_0's ndcg@5: 0.978759\n", + "[72]\tvalid_0's ndcg@1: 0.94425\tvalid_0's ndcg@2: 0.976822\tvalid_0's ndcg@3: 0.978409\tvalid_0's ndcg@4: 0.978765\tvalid_0's ndcg@5: 0.978784\n", + "[73]\tvalid_0's ndcg@1: 0.94445\tvalid_0's ndcg@2: 0.976864\tvalid_0's ndcg@3: 0.978464\tvalid_0's ndcg@4: 0.97883\tvalid_0's ndcg@5: 0.978849\n", + "[74]\tvalid_0's ndcg@1: 0.9446\tvalid_0's ndcg@2: 0.976919\tvalid_0's ndcg@3: 0.978519\tvalid_0's ndcg@4: 0.978885\tvalid_0's ndcg@5: 0.978905\n", + "[75]\tvalid_0's ndcg@1: 0.9446\tvalid_0's ndcg@2: 0.976919\tvalid_0's ndcg@3: 0.978519\tvalid_0's ndcg@4: 0.978885\tvalid_0's ndcg@5: 0.978905\n", + "[76]\tvalid_0's ndcg@1: 0.944625\tvalid_0's ndcg@2: 0.97696\tvalid_0's ndcg@3: 0.978535\tvalid_0's ndcg@4: 0.978901\tvalid_0's ndcg@5: 0.978921\n", + "[77]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 0.976979\tvalid_0's ndcg@3: 0.978554\tvalid_0's ndcg@4: 0.97892\tvalid_0's ndcg@5: 0.978939\n", + "[78]\tvalid_0's ndcg@1: 0.944675\tvalid_0's ndcg@2: 
0.976979\tvalid_0's ndcg@3: 0.978554\tvalid_0's ndcg@4: 0.97892\tvalid_0's ndcg@5: 0.978939\n", + "[79]\tvalid_0's ndcg@1: 0.944525\tvalid_0's ndcg@2: 0.976907\tvalid_0's ndcg@3: 0.978507\tvalid_0's ndcg@4: 0.978863\tvalid_0's ndcg@5: 0.978882\n", + "[80]\tvalid_0's ndcg@1: 0.94455\tvalid_0's ndcg@2: 0.976885\tvalid_0's ndcg@3: 0.97851\tvalid_0's ndcg@4: 0.978865\tvalid_0's ndcg@5: 0.978885\n", + "[81]\tvalid_0's ndcg@1: 0.944725\tvalid_0's ndcg@2: 0.97695\tvalid_0's ndcg@3: 0.978575\tvalid_0's ndcg@4: 0.978919\tvalid_0's ndcg@5: 0.978948\n", + "[82]\tvalid_0's ndcg@1: 0.945225\tvalid_0's ndcg@2: 0.977103\tvalid_0's ndcg@3: 0.978765\tvalid_0's ndcg@4: 0.97911\tvalid_0's ndcg@5: 0.979129\n", + "[83]\tvalid_0's ndcg@1: 0.945125\tvalid_0's ndcg@2: 0.977066\tvalid_0's ndcg@3: 0.978716\tvalid_0's ndcg@4: 0.979071\tvalid_0's ndcg@5: 0.97909\n", + "[84]\tvalid_0's ndcg@1: 0.945225\tvalid_0's ndcg@2: 0.97715\tvalid_0's ndcg@3: 0.978775\tvalid_0's ndcg@4: 0.97912\tvalid_0's ndcg@5: 0.979139\n", + "[85]\tvalid_0's ndcg@1: 0.945025\tvalid_0's ndcg@2: 0.977092\tvalid_0's ndcg@3: 0.978692\tvalid_0's ndcg@4: 0.979047\tvalid_0's ndcg@5: 0.979067\n", + "[86]\tvalid_0's ndcg@1: 0.9452\tvalid_0's ndcg@2: 0.977172\tvalid_0's ndcg@3: 0.97876\tvalid_0's ndcg@4: 0.979115\tvalid_0's ndcg@5: 0.979135\n", + "[87]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977178\tvalid_0's ndcg@3: 0.97879\tvalid_0's ndcg@4: 0.979156\tvalid_0's ndcg@5: 0.979166\n", + "[88]\tvalid_0's ndcg@1: 0.9453\tvalid_0's ndcg@2: 0.977178\tvalid_0's ndcg@3: 0.978815\tvalid_0's ndcg@4: 0.979149\tvalid_0's ndcg@5: 0.979168\n", + "[89]\tvalid_0's ndcg@1: 0.94555\tvalid_0's ndcg@2: 0.977333\tvalid_0's ndcg@3: 0.978933\tvalid_0's ndcg@4: 0.979267\tvalid_0's ndcg@5: 0.979277\n", + "[90]\tvalid_0's ndcg@1: 0.9459\tvalid_0's ndcg@2: 0.977462\tvalid_0's ndcg@3: 0.979062\tvalid_0's ndcg@4: 0.979396\tvalid_0's ndcg@5: 0.979406\n", + "[91]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977481\tvalid_0's ndcg@3: 
0.979081\tvalid_0's ndcg@4: 0.979414\tvalid_0's ndcg@5: 0.979424\n", + "[92]\tvalid_0's ndcg@1: 0.945875\tvalid_0's ndcg@2: 0.977437\tvalid_0's ndcg@3: 0.97905\tvalid_0's ndcg@4: 0.979384\tvalid_0's ndcg@5: 0.979393\n", + "[93]\tvalid_0's ndcg@1: 0.945875\tvalid_0's ndcg@2: 0.977421\tvalid_0's ndcg@3: 0.979046\tvalid_0's ndcg@4: 0.97938\tvalid_0's ndcg@5: 0.97939\n", + "[94]\tvalid_0's ndcg@1: 0.9459\tvalid_0's ndcg@2: 0.977431\tvalid_0's ndcg@3: 0.979068\tvalid_0's ndcg@4: 0.979391\tvalid_0's ndcg@5: 0.979401\n", + "[95]\tvalid_0's ndcg@1: 0.94595\tvalid_0's ndcg@2: 0.977449\tvalid_0's ndcg@3: 0.979074\tvalid_0's ndcg@4: 0.979408\tvalid_0's ndcg@5: 0.979418\n", + "[96]\tvalid_0's ndcg@1: 0.946075\tvalid_0's ndcg@2: 0.977527\tvalid_0's ndcg@3: 0.979127\tvalid_0's ndcg@4: 0.979461\tvalid_0's ndcg@5: 0.97947\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[97]\tvalid_0's ndcg@1: 0.946375\tvalid_0's ndcg@2: 0.977622\tvalid_0's ndcg@3: 0.979222\tvalid_0's ndcg@4: 0.979577\tvalid_0's ndcg@5: 0.979577\n", + "[98]\tvalid_0's ndcg@1: 0.946625\tvalid_0's ndcg@2: 0.977714\tvalid_0's ndcg@3: 0.979339\tvalid_0's ndcg@4: 0.979673\tvalid_0's ndcg@5: 0.979673\n", + "[99]\tvalid_0's ndcg@1: 0.94665\tvalid_0's ndcg@2: 0.977739\tvalid_0's ndcg@3: 0.979352\tvalid_0's ndcg@4: 0.979685\tvalid_0's ndcg@5: 0.979685\n", + "[100]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.97778\tvalid_0's ndcg@3: 0.97938\tvalid_0's ndcg@4: 0.979703\tvalid_0's ndcg@5: 0.979703\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's ndcg@1: 0.946675\tvalid_0's ndcg@2: 0.97778\tvalid_0's ndcg@3: 0.97938\tvalid_0's ndcg@4: 0.979703\tvalid_0's ndcg@5: 0.979703\n" + ] + } + ], + "source": [ + "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", + "# 这一部分与前面的单独训练和验证是分开的\n", + "def get_kfold_users(trn_df, n=5):\n", + " user_ids = trn_df['user_id'].unique()\n", + " user_set = [user_ids[i::n] for i in range(n)]\n", + " return user_set\n", + "\n", + "k_fold = 5\n", + "trn_df = trn_user_item_feats_df_rank_model\n", + "user_set = get_kfold_users(trn_df, n=k_fold)\n", + "\n", + "score_list = []\n", + "score_df = trn_df[['user_id', 'click_article_id','label']]\n", + "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", + "\n", + "# 五折交叉验证,并将中间结果保存用于staking\n", + "for n_fold, valid_user in enumerate(user_set):\n", + " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", + " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", + " \n", + " # 训练集与验证集的用户分组\n", + " train_idx.sort_values(by=['user_id'], inplace=True)\n", + " g_train = train_idx.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", + " \n", + " valid_idx.sort_values(by=['user_id'], inplace=True)\n", + " g_val = valid_idx.groupby(['user_id'], as_index=False).count()[\"label\"].values\n", + " \n", + " # 定义模型\n", + " lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", + " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", + " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16) \n", + " # 训练模型\n", + " lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,\n", + " eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], eval_group= [g_val], \n", + " eval_at=[1, 2, 3, 4, 5], eval_metric=['ndcg', ], early_stopping_rounds=50, )\n", + " \n", + " # 预测验证集结果\n", + " valid_idx['pred_score'] = lgb_ranker.predict(valid_idx[lgb_cols], 
num_iteration=lgb_ranker.best_iteration_)\n", + " \n", + " # 对输出结果进行归一化\n", + " valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))\n", + " \n", + " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", + " valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + " \n", + " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", + " score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", + " \n", + " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", + " if not offline:\n", + " sub_preds += lgb_ranker.predict(tst_user_item_feats_df_rank_model[lgb_cols], lgb_ranker.best_iteration_)\n", + " \n", + "score_df_ = pd.concat(score_list, axis=0)\n", + "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n", + "# 保存训练集交叉验证产生的新特征\n", + "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)\n", + " \n", + "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", + "tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold\n", + "tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))\n", + "tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])\n", + "tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + "\n", + "# 保存测试集交叉验证的新特征\n", + "tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_ranker_feats.csv', index=False)" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1/2\n", - "290964/290964 [==============================] - 55s 189us/sample - loss: 0.4209 - binary_crossentropy: 0.4206 - auc: 0.7842\n", - "Epoch 2/2\n", - "290964/290964 
[==============================] - 52s 178us/sample - loss: 0.3630 - binary_crossentropy: 0.3618 - auc: 0.8478\n" - ] - } - ], - "source": [ - "# 模型训练\n", - "if offline:\n", - " history = model.fit(x_trn, y_trn, verbose=1, epochs=10, validation_data=(x_val, y_val) , batch_size=256)\n", - "else:\n", - " # 也可以使用上面的语句用自己采样出来的验证集\n", - " # history = model.fit(x_trn, y_trn, verbose=1, epochs=3, validation_split=0.3, batch_size=256)\n", - " history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:29:20.436591Z", - "start_time": "2020-11-18T04:28:58.102057Z" - } - }, - "outputs": [ + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:22:52.604397Z", + "start_time": "2020-11-18T04:22:43.253034Z" + } + }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "# 单模型生成提交结果\n", + "rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]\n", + "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", + "submit(rank_results, topk=5, model_name='lgb_ranker')" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "500000/500000 [==============================] - 20s 39us/sample\n" - ] - } - ], - "source": [ - "# 模型预测\n", - "tst_user_item_feats_df_din_model['pred_score'] = model.predict(x_tst, verbose=1, batch_size=256)\n", - "tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'din_rank_score.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:29:34.985535Z", - "start_time": "2020-11-18T04:29:26.264531Z" - } - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "rank_results = tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 
'pred_score']]\n", - "submit(rank_results, topk=5, model_name='din')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-15T06:15:49.490705Z", - "start_time": "2020-11-15T06:15:49.473794Z" - } - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:38:53.760383Z", - "start_time": "2020-11-18T04:29:51.737721Z" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LGB分类模型" + ] }, - "scrolled": true - }, - "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Train on 232681 samples, validate on 58283 samples\n", - "Epoch 1/2\n", - "232681/232681 [==============================] - 44s 189us/sample - loss: 0.2864 - binary_crossentropy: 0.2846 - auc: 0.9008 - val_loss: 0.2830 - val_binary_crossentropy: 0.2813 - val_auc: 0.9072\n", - "Epoch 2/2\n", - "232681/232681 [==============================] - 44s 187us/sample - loss: 0.2832 - binary_crossentropy: 0.2816 - auc: 0.9034 - val_loss: 0.2846 - val_binary_crossentropy: 0.2830 - val_auc: 0.9053\n", - "58283/58283 [==============================] - 2s 36us/sample\n", - "500000/500000 [==============================] - 19s 37us/sample\n", - "Train on 232798 samples, validate on 58166 samples\n", - "Epoch 1/2\n", - "232798/232798 [==============================] - 43s 184us/sample - loss: 0.2818 - binary_crossentropy: 0.2802 - auc: 0.9051 - val_loss: 0.2968 - val_binary_crossentropy: 0.2953 - val_auc: 0.9062\n", - "Epoch 2/2\n", - "232798/232798 [==============================] - 44s 187us/sample - loss: 0.2796 - binary_crossentropy: 0.2782 - auc: 0.9069 - val_loss: 0.2820 - val_binary_crossentropy: 0.2806 - val_auc: 0.9071\n", - "58166/58166 [==============================] - 2s 38us/sample\n", - "500000/500000 [==============================] - 18s 37us/sample\n", - "Train on 232847 samples, validate on 58117 samples\n", 
- "Epoch 1/2\n", - "232847/232847 [==============================] - 43s 185us/sample - loss: 0.2786 - binary_crossentropy: 0.2773 - auc: 0.9080 - val_loss: 0.2761 - val_binary_crossentropy: 0.2749 - val_auc: 0.9113\n", - "Epoch 2/2\n", - "232847/232847 [==============================] - 39s 166us/sample - loss: 0.2766 - binary_crossentropy: 0.2754 - auc: 0.9097 - val_loss: 0.2872 - val_binary_crossentropy: 0.2862 - val_auc: 0.9090\n", - "58117/58117 [==============================] - 2s 34us/sample\n", - "500000/500000 [==============================] - 17s 33us/sample\n", - "Train on 232716 samples, validate on 58248 samples\n", - "Epoch 1/2\n", - "232716/232716 [==============================] - 39s 169us/sample - loss: 0.2763 - binary_crossentropy: 0.2753 - auc: 0.9100 - val_loss: 0.2739 - val_binary_crossentropy: 0.2730 - val_auc: 0.9116\n", - "Epoch 2/2\n", - "232716/232716 [==============================] - 39s 168us/sample - loss: 0.2743 - binary_crossentropy: 0.2735 - auc: 0.9119 - val_loss: 0.2859 - val_binary_crossentropy: 0.2851 - val_auc: 0.9090\n", - "58248/58248 [==============================] - 2s 35us/sample\n", - "500000/500000 [==============================] - 17s 34us/sample\n", - "Train on 232814 samples, validate on 58150 samples\n", - "Epoch 1/2\n", - "232814/232814 [==============================] - 40s 170us/sample - loss: 0.2747 - binary_crossentropy: 0.2739 - auc: 0.9115 - val_loss: 0.2702 - val_binary_crossentropy: 0.2695 - val_auc: 0.9163\n", - "Epoch 2/2\n", - "232814/232814 [==============================] - 40s 170us/sample - loss: 0.2725 - binary_crossentropy: 0.2719 - auc: 0.9132 - val_loss: 0.2751 - val_binary_crossentropy: 0.2745 - val_auc: 0.9151\n", - "58150/58150 [==============================] - 2s 34us/sample\n", - "500000/500000 [==============================] - 17s 34us/sample\n" - ] - } - ], - "source": [ - "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", - "# 这一部分与前面的单独训练和验证是分开的\n", - "def get_kfold_users(trn_df, n=5):\n", - " 
user_ids = trn_df['user_id'].unique()\n", - " user_set = [user_ids[i::n] for i in range(n)]\n", - " return user_set\n", - "\n", - "k_fold = 5\n", - "trn_df = trn_user_item_feats_df_din_model\n", - "user_set = get_kfold_users(trn_df, n=k_fold)\n", - "\n", - "score_list = []\n", - "score_df = trn_df[['user_id', 'click_article_id', 'label']]\n", - "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", - "\n", - "dense_fea = [x for x in dense_fea if x != 'label']\n", - "x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", - "\n", - "# 五折交叉验证,并将中间结果保存用于staking\n", - "for n_fold, valid_user in enumerate(user_set):\n", - " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", - " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", - " \n", - " # 准备训练数据\n", - " x_trn, dnn_feature_columns = get_din_feats_columns(train_idx, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", - " y_trn = train_idx['label'].values\n", - "\n", - " # 准备验证数据\n", - " x_val, dnn_feature_columns = get_din_feats_columns(valid_idx, dense_fea, \n", - " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", - " y_val = valid_idx['label'].values\n", - " \n", - " history = model.fit(x_trn, y_trn, verbose=1, epochs=2, validation_data=(x_val, y_val) , batch_size=256)\n", - " \n", - " # 预测验证集结果\n", - " valid_idx['pred_score'] = model.predict(x_val, verbose=1, batch_size=256) \n", - " \n", - " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", - " valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - " \n", - " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", - " score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", - " \n", - " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", - " if not offline:\n", - " sub_preds += model.predict(x_tst, verbose=1, 
batch_size=256)[:, 0] \n", - " \n", - "score_df_ = pd.concat(score_list, axis=0)\n", - "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n", - "# 保存训练集交叉验证产生的新特征\n", - "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_din_cls_feats.csv', index=False)\n", - " \n", - "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", - "tst_user_item_feats_df_din_model['pred_score'] = sub_preds / k_fold\n", - "tst_user_item_feats_df_din_model['pred_score'] = tst_user_item_feats_df_din_model['pred_score'].transform(lambda x: norm_sim(x))\n", - "tst_user_item_feats_df_din_model.sort_values(by=['user_id', 'pred_score'])\n", - "tst_user_item_feats_df_din_model['pred_rank'] = tst_user_item_feats_df_din_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", - "\n", - "# 保存测试集交叉验证的新特征\n", - "tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_din_cls_feats.csv', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 模型融合" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 加权融合" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:44:27.351996Z", - "start_time": "2020-11-18T04:44:26.561275Z" - } - }, - "outputs": [], - "source": [ - "# 读取多个模型的排序结果文件\n", - "lgb_ranker = pd.read_csv(save_path + 'lgb_ranker_score.csv')\n", - "lgb_cls = pd.read_csv(save_path + 'lgb_cls_score.csv')\n", - "din_ranker = pd.read_csv(save_path + 'din_rank_score.csv')\n", - "\n", - "# 这里也可以换成交叉验证输出的测试结果进行加权融合" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:44:31.593981Z", - "start_time": 
"2020-11-18T04:44:31.589439Z" - } - }, - "outputs": [], - "source": [ - "rank_model = {'lgb_ranker': lgb_ranker, \n", - " 'lgb_cls': lgb_cls, \n", - " 'din_ranker': din_ranker}" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:44:36.135860Z", - "start_time": "2020-11-18T04:44:36.130577Z" - } - }, - "outputs": [], - "source": [ - "def get_ensumble_predict_topk(rank_model, topk=5):\n", - " final_recall = rank_model['lgb_cls'].append(rank_model['din_ranker'])\n", - " rank_model['lgb_ranker']['pred_score'] = rank_model['lgb_ranker']['pred_score'].transform(lambda x: norm_sim(x))\n", - " \n", - " final_recall = final_recall.append(rank_model['lgb_ranker'])\n", - " final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()\n", - " \n", - " submit(final_recall, topk=topk, model_name='ensemble_fuse')" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:44:51.659270Z", - "start_time": "2020-11-18T04:44:40.445659Z" - } - }, - "outputs": [], - "source": [ - "get_ensumble_predict_topk(rank_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Staking" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:44:58.025992Z", - "start_time": "2020-11-18T04:44:56.146962Z" - } - }, - "outputs": [], - "source": [ - "# 读取多个模型的交叉验证生成的结果文件\n", - "# 训练集\n", - "trn_lgb_ranker_feats = pd.read_csv(save_path + 'trn_lgb_ranker_feats.csv')\n", - "trn_lgb_cls_feats = pd.read_csv(save_path + 'trn_lgb_cls_feats.csv')\n", - "trn_din_cls_feats = pd.read_csv(save_path + 'trn_din_cls_feats.csv')\n", - "\n", - "# 测试集\n", - "tst_lgb_ranker_feats = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')\n", - "tst_lgb_cls_feats = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')\n", - "tst_din_cls_feats = 
pd.read_csv(save_path + 'tst_din_cls_feats.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:45:07.701862Z", - "start_time": "2020-11-18T04:45:07.644335Z" - } - }, - "outputs": [], - "source": [ - "# 将多个模型输出的特征进行拼接\n", - "\n", - "finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']]\n", - "finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']]\n", - "\n", - "for idx, trn_model in enumerate([trn_lgb_ranker_feats, trn_lgb_cls_feats, trn_din_cls_feats]):\n", - " for feat in [ 'pred_score', 'pred_rank']:\n", - " col_name = feat + '_' + str(idx)\n", - " finall_trn_ranker_feats[col_name] = trn_model[feat]\n", - "\n", - "for idx, tst_model in enumerate([tst_lgb_ranker_feats, tst_lgb_cls_feats, tst_din_cls_feats]):\n", - " for feat in [ 'pred_score', 'pred_rank']:\n", - " col_name = feat + '_' + str(idx)\n", - " finall_tst_ranker_feats[col_name] = tst_model[feat]" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "ExecuteTime": { - "end_time": "2020-11-18T04:45:15.044242Z", - "start_time": "2020-11-18T04:45:13.138252Z" - } - }, - "outputs": [], - "source": [ - "# 定义一个逻辑回归模型再次拟合交叉验证产生的特征对测试集进行预测\n", - "# 这里需要注意的是,在做交叉验证的时候可以构造多一些与输出预测值相关的特征,来丰富这里简单模型的特征\n", - "from sklearn.linear_model import LogisticRegression\n", - "\n", - "feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1', 'pred_score_2', 'pred_rank_2']\n", - "\n", - "trn_x = finall_trn_ranker_feats[feat_cols]\n", - "trn_y = finall_trn_ranker_feats['label']\n", - "\n", - "tst_x = finall_tst_ranker_feats[feat_cols]\n", - "\n", - "# 定义模型\n", - "lr = LogisticRegression()\n", - "\n", - "# 模型训练\n", - "lr.fit(trn_x, trn_y)\n", - "\n", - "# 模型预测\n", - "finall_tst_ranker_feats['pred_score'] = lr.predict_proba(tst_x)[:, 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "ExecuteTime": { - "end_time": 
"2020-11-18T04:45:29.018764Z", - "start_time": "2020-11-18T04:45:19.423130Z" + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:22:58.259730Z", + "start_time": "2020-11-18T04:22:58.254297Z" + } + }, + "outputs": [], + "source": [ + "# 模型及参数的定义\n", + "lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", + " max_depth=-1, n_estimators=500, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", + " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10) " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:23:11.258774Z", + "start_time": "2020-11-18T04:23:00.861936Z" + } + }, + "outputs": [], + "source": [ + "# 模型训练\n", + "if offline:\n", + " lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'],\n", + " eval_set=[(val_user_item_feats_df_rank_model[lgb_cols], val_user_item_feats_df_rank_model['label'])], \n", + " eval_metric=['auc', ],early_stopping_rounds=50, )\n", + "else:\n", + " lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols], trn_user_item_feats_df_rank_model['label'])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:23:19.591396Z", + "start_time": "2020-11-18T04:23:13.813850Z" + } + }, + "outputs": [], + "source": [ + "# 模型预测\n", + "tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(tst_user_item_feats_df[lgb_cols])[:,1]\n", + "\n", + "# 将这里的排序结果保存一份,用户后面的模型融合\n", + "tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'lgb_cls_score.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:23:32.352931Z", + "start_time": "2020-11-18T04:23:22.346609Z" + } 
+ }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "rank_results = tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']]\n", + "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", + "submit(rank_results, topk=5, model_name='lgb_cls')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:24:11.241196Z", + "start_time": "2020-11-18T04:23:41.377394Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1]\tvalid_0's auc: 0.764896\tvalid_0's binary_logloss: 0.522153\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's auc: 0.767857\tvalid_0's binary_logloss: 0.52057\n", + "[3]\tvalid_0's auc: 0.783096\tvalid_0's binary_logloss: 0.519584\n", + "[4]\tvalid_0's auc: 0.784354\tvalid_0's binary_logloss: 0.518485\n", + "[5]\tvalid_0's auc: 0.790554\tvalid_0's binary_logloss: 0.516886\n", + "[6]\tvalid_0's auc: 0.791954\tvalid_0's binary_logloss: 0.515334\n", + "[7]\tvalid_0's auc: 0.794257\tvalid_0's binary_logloss: 0.514032\n", + "[8]\tvalid_0's auc: 0.795222\tvalid_0's binary_logloss: 0.512516\n", + "[9]\tvalid_0's auc: 0.795417\tvalid_0's binary_logloss: 0.511671\n", + "[10]\tvalid_0's auc: 0.795913\tvalid_0's binary_logloss: 0.510226\n", + "[11]\tvalid_0's auc: 0.798222\tvalid_0's binary_logloss: 0.508858\n", + "[12]\tvalid_0's auc: 0.79825\tvalid_0's binary_logloss: 0.507928\n", + "[13]\tvalid_0's auc: 0.798842\tvalid_0's binary_logloss: 0.50708\n", + "[14]\tvalid_0's auc: 0.798935\tvalid_0's binary_logloss: 0.505752\n", + "[15]\tvalid_0's auc: 0.799543\tvalid_0's binary_logloss: 0.504388\n", + "[16]\tvalid_0's auc: 0.800844\tvalid_0's binary_logloss: 0.503126\n", + "[17]\tvalid_0's auc: 0.800855\tvalid_0's binary_logloss: 0.501809\n", + 
"[18]\tvalid_0's auc: 0.801653\tvalid_0's binary_logloss: 0.500676\n", + "[19]\tvalid_0's auc: 0.801518\tvalid_0's binary_logloss: 0.49987\n", + "[20]\tvalid_0's auc: 0.801662\tvalid_0's binary_logloss: 0.498625\n", + "[21]\tvalid_0's auc: 0.802093\tvalid_0's binary_logloss: 0.498113\n", + "[22]\tvalid_0's auc: 0.803071\tvalid_0's binary_logloss: 0.496933\n", + "[23]\tvalid_0's auc: 0.803222\tvalid_0's binary_logloss: 0.495864\n", + "[24]\tvalid_0's auc: 0.802927\tvalid_0's binary_logloss: 0.494691\n", + "[25]\tvalid_0's auc: 0.802581\tvalid_0's binary_logloss: 0.493543\n", + "[26]\tvalid_0's auc: 0.802965\tvalid_0's binary_logloss: 0.492444\n", + "[27]\tvalid_0's auc: 0.80298\tvalid_0's binary_logloss: 0.491336\n", + "[28]\tvalid_0's auc: 0.803226\tvalid_0's binary_logloss: 0.490275\n", + "[29]\tvalid_0's auc: 0.803436\tvalid_0's binary_logloss: 0.489126\n", + "[30]\tvalid_0's auc: 0.803796\tvalid_0's binary_logloss: 0.48802\n", + "[31]\tvalid_0's auc: 0.803601\tvalid_0's binary_logloss: 0.486988\n", + "[32]\tvalid_0's auc: 0.804416\tvalid_0's binary_logloss: 0.485972\n", + "[33]\tvalid_0's auc: 0.804529\tvalid_0's binary_logloss: 0.484939\n", + "[34]\tvalid_0's auc: 0.804534\tvalid_0's binary_logloss: 0.483927\n", + "[35]\tvalid_0's auc: 0.804819\tvalid_0's binary_logloss: 0.483271\n", + "[36]\tvalid_0's auc: 0.804774\tvalid_0's binary_logloss: 0.482273\n", + "[37]\tvalid_0's auc: 0.805237\tvalid_0's binary_logloss: 0.481639\n", + "[38]\tvalid_0's auc: 0.805546\tvalid_0's binary_logloss: 0.480959\n", + "[39]\tvalid_0's auc: 0.805598\tvalid_0's binary_logloss: 0.479955\n", + "[40]\tvalid_0's auc: 0.806011\tvalid_0's binary_logloss: 0.47903\n", + "[41]\tvalid_0's auc: 0.806664\tvalid_0's binary_logloss: 0.478439\n", + "[42]\tvalid_0's auc: 0.807021\tvalid_0's binary_logloss: 0.477798\n", + "[43]\tvalid_0's auc: 0.80726\tvalid_0's binary_logloss: 0.476829\n", + "[44]\tvalid_0's auc: 0.807157\tvalid_0's binary_logloss: 0.475976\n", + "[45]\tvalid_0's auc: 
0.807788\tvalid_0's binary_logloss: 0.475056\n", + "[46]\tvalid_0's auc: 0.80805\tvalid_0's binary_logloss: 0.474446\n", + "[47]\tvalid_0's auc: 0.808097\tvalid_0's binary_logloss: 0.473576\n", + "[48]\tvalid_0's auc: 0.80815\tvalid_0's binary_logloss: 0.472676\n", + "[49]\tvalid_0's auc: 0.808304\tvalid_0's binary_logloss: 0.471918\n", + "[50]\tvalid_0's auc: 0.808749\tvalid_0's binary_logloss: 0.471481\n", + "[51]\tvalid_0's auc: 0.808972\tvalid_0's binary_logloss: 0.471104\n", + "[52]\tvalid_0's auc: 0.809326\tvalid_0's binary_logloss: 0.470289\n", + "[53]\tvalid_0's auc: 0.809472\tvalid_0's binary_logloss: 0.469508\n", + "[54]\tvalid_0's auc: 0.809505\tvalid_0's binary_logloss: 0.46869\n", + "[55]\tvalid_0's auc: 0.809594\tvalid_0's binary_logloss: 0.467885\n", + "[56]\tvalid_0's auc: 0.809847\tvalid_0's binary_logloss: 0.467356\n", + "[57]\tvalid_0's auc: 0.810262\tvalid_0's binary_logloss: 0.466531\n", + "[58]\tvalid_0's auc: 0.810407\tvalid_0's binary_logloss: 0.46573\n", + "[59]\tvalid_0's auc: 0.810618\tvalid_0's binary_logloss: 0.465205\n", + "[60]\tvalid_0's auc: 0.81066\tvalid_0's binary_logloss: 0.464435\n", + "[61]\tvalid_0's auc: 0.810638\tvalid_0's binary_logloss: 0.463721\n", + "[62]\tvalid_0's auc: 0.810658\tvalid_0's binary_logloss: 0.462982\n", + "[63]\tvalid_0's auc: 0.811106\tvalid_0's binary_logloss: 0.462246\n", + "[64]\tvalid_0's auc: 0.811313\tvalid_0's binary_logloss: 0.461748\n", + "[65]\tvalid_0's auc: 0.811351\tvalid_0's binary_logloss: 0.461038\n", + "[66]\tvalid_0's auc: 0.811433\tvalid_0's binary_logloss: 0.460323\n", + "[67]\tvalid_0's auc: 0.81158\tvalid_0's binary_logloss: 0.459662\n", + "[68]\tvalid_0's auc: 0.811561\tvalid_0's binary_logloss: 0.458988\n", + "[69]\tvalid_0's auc: 0.811748\tvalid_0's binary_logloss: 0.458592\n", + "[70]\tvalid_0's auc: 0.811919\tvalid_0's binary_logloss: 0.457934\n", + "[71]\tvalid_0's auc: 0.812073\tvalid_0's binary_logloss: 0.457508\n", + "[72]\tvalid_0's auc: 0.812273\tvalid_0's 
binary_logloss: 0.457038\n", + "[73]\tvalid_0's auc: 0.812561\tvalid_0's binary_logloss: 0.456439\n", + "[74]\tvalid_0's auc: 0.812633\tvalid_0's binary_logloss: 0.455789\n", + "[75]\tvalid_0's auc: 0.812757\tvalid_0's binary_logloss: 0.455173\n", + "[76]\tvalid_0's auc: 0.812923\tvalid_0's binary_logloss: 0.454533\n", + "[77]\tvalid_0's auc: 0.81295\tvalid_0's binary_logloss: 0.45392\n", + "[78]\tvalid_0's auc: 0.813073\tvalid_0's binary_logloss: 0.453517\n", + "[79]\tvalid_0's auc: 0.813202\tvalid_0's binary_logloss: 0.452932\n", + "[80]\tvalid_0's auc: 0.813611\tvalid_0's binary_logloss: 0.452285\n", + "[81]\tvalid_0's auc: 0.813769\tvalid_0's binary_logloss: 0.45191\n", + "[82]\tvalid_0's auc: 0.814468\tvalid_0's binary_logloss: 0.451455\n", + "[83]\tvalid_0's auc: 0.814656\tvalid_0's binary_logloss: 0.450885\n", + "[84]\tvalid_0's auc: 0.814755\tvalid_0's binary_logloss: 0.450308\n", + "[85]\tvalid_0's auc: 0.814824\tvalid_0's binary_logloss: 0.449739\n", + "[86]\tvalid_0's auc: 0.81499\tvalid_0's binary_logloss: 0.449348\n", + "[87]\tvalid_0's auc: 0.815232\tvalid_0's binary_logloss: 0.448759\n", + "[88]\tvalid_0's auc: 0.815452\tvalid_0's binary_logloss: 0.44823\n", + "[89]\tvalid_0's auc: 0.815593\tvalid_0's binary_logloss: 0.447861\n", + "[90]\tvalid_0's auc: 0.815591\tvalid_0's binary_logloss: 0.447323\n", + "[91]\tvalid_0's auc: 0.815672\tvalid_0's binary_logloss: 0.446796\n", + "[92]\tvalid_0's auc: 0.815875\tvalid_0's binary_logloss: 0.446472\n", + "[93]\tvalid_0's auc: 0.815984\tvalid_0's binary_logloss: 0.445961\n", + "[94]\tvalid_0's auc: 0.816026\tvalid_0's binary_logloss: 0.445439\n", + "[95]\tvalid_0's auc: 0.816172\tvalid_0's binary_logloss: 0.444909\n", + "[96]\tvalid_0's auc: 0.816321\tvalid_0's binary_logloss: 0.444413\n", + "[97]\tvalid_0's auc: 0.816751\tvalid_0's binary_logloss: 0.44405\n", + "[98]\tvalid_0's auc: 0.817226\tvalid_0's binary_logloss: 0.443626\n", + "[99]\tvalid_0's auc: 0.817286\tvalid_0's binary_logloss: 0.443136\n", + 
"[100]\tvalid_0's auc: 0.817391\tvalid_0's binary_logloss: 0.442854\n", + "Did not meet early stopping. Best iteration is:\n", + "[100]\tvalid_0's auc: 0.817391\tvalid_0's binary_logloss: 0.442854\n", + "[1]\tvalid_0's auc: 0.771584\tvalid_0's binary_logloss: 0.527139\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's auc: 0.775446\tvalid_0's binary_logloss: 0.525462\n", + "[3]\tvalid_0's auc: 0.790092\tvalid_0's binary_logloss: 0.524461\n", + "[4]\tvalid_0's auc: 0.791432\tvalid_0's binary_logloss: 0.523322\n", + "[5]\tvalid_0's auc: 0.797482\tvalid_0's binary_logloss: 0.521614\n", + "[6]\tvalid_0's auc: 0.79893\tvalid_0's binary_logloss: 0.520007\n", + "[7]\tvalid_0's auc: 0.800753\tvalid_0's binary_logloss: 0.5187\n", + "[8]\tvalid_0's auc: 0.802197\tvalid_0's binary_logloss: 0.517125\n", + "[9]\tvalid_0's auc: 0.802828\tvalid_0's binary_logloss: 0.516269\n", + "[10]\tvalid_0's auc: 0.803496\tvalid_0's binary_logloss: 0.51474\n", + "[11]\tvalid_0's auc: 0.804972\tvalid_0's binary_logloss: 0.513321\n", + "[12]\tvalid_0's auc: 0.804995\tvalid_0's binary_logloss: 0.512334\n", + "[13]\tvalid_0's auc: 0.80525\tvalid_0's binary_logloss: 0.51151\n", + "[14]\tvalid_0's auc: 0.805026\tvalid_0's binary_logloss: 0.510149\n", + "[15]\tvalid_0's auc: 0.805622\tvalid_0's binary_logloss: 0.508708\n", + "[16]\tvalid_0's auc: 0.806974\tvalid_0's binary_logloss: 0.507384\n", + "[17]\tvalid_0's auc: 0.807045\tvalid_0's binary_logloss: 0.506017\n", + "[18]\tvalid_0's auc: 0.807265\tvalid_0's binary_logloss: 0.504853\n", + "[19]\tvalid_0's auc: 0.807126\tvalid_0's binary_logloss: 0.503972\n", + "[20]\tvalid_0's auc: 0.806948\tvalid_0's binary_logloss: 0.502693\n", + "[21]\tvalid_0's auc: 0.807315\tvalid_0's binary_logloss: 0.502166\n", + "[22]\tvalid_0's auc: 0.808067\tvalid_0's binary_logloss: 0.500948\n", + "[23]\tvalid_0's auc: 0.808226\tvalid_0's binary_logloss: 0.49987\n", + "[24]\tvalid_0's auc: 0.808268\tvalid_0's binary_logloss: 
0.498623\n", + "[25]\tvalid_0's auc: 0.808569\tvalid_0's binary_logloss: 0.497389\n", + "[26]\tvalid_0's auc: 0.809069\tvalid_0's binary_logloss: 0.49624\n", + "[27]\tvalid_0's auc: 0.809312\tvalid_0's binary_logloss: 0.495095\n", + "[28]\tvalid_0's auc: 0.809549\tvalid_0's binary_logloss: 0.494012\n", + "[29]\tvalid_0's auc: 0.809944\tvalid_0's binary_logloss: 0.492834\n", + "[30]\tvalid_0's auc: 0.810047\tvalid_0's binary_logloss: 0.491735\n", + "[31]\tvalid_0's auc: 0.810086\tvalid_0's binary_logloss: 0.490633\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[32]\tvalid_0's auc: 0.810566\tvalid_0's binary_logloss: 0.489595\n", + "[33]\tvalid_0's auc: 0.810539\tvalid_0's binary_logloss: 0.488536\n", + "[34]\tvalid_0's auc: 0.810529\tvalid_0's binary_logloss: 0.487489\n", + "[35]\tvalid_0's auc: 0.810932\tvalid_0's binary_logloss: 0.486775\n", + "[36]\tvalid_0's auc: 0.810769\tvalid_0's binary_logloss: 0.48577\n", + "[37]\tvalid_0's auc: 0.811363\tvalid_0's binary_logloss: 0.485123\n", + "[38]\tvalid_0's auc: 0.811801\tvalid_0's binary_logloss: 0.484413\n", + "[39]\tvalid_0's auc: 0.811987\tvalid_0's binary_logloss: 0.483371\n", + "[40]\tvalid_0's auc: 0.812268\tvalid_0's binary_logloss: 0.482407\n", + "[41]\tvalid_0's auc: 0.813297\tvalid_0's binary_logloss: 0.481742\n", + "[42]\tvalid_0's auc: 0.813453\tvalid_0's binary_logloss: 0.481108\n", + "[43]\tvalid_0's auc: 0.813603\tvalid_0's binary_logloss: 0.480163\n", + "[44]\tvalid_0's auc: 0.813654\tvalid_0's binary_logloss: 0.479239\n", + "[45]\tvalid_0's auc: 0.814267\tvalid_0's binary_logloss: 0.478299\n", + "[46]\tvalid_0's auc: 0.81455\tvalid_0's binary_logloss: 0.477678\n", + "[47]\tvalid_0's auc: 0.81452\tvalid_0's binary_logloss: 0.476766\n", + "[48]\tvalid_0's auc: 0.814925\tvalid_0's binary_logloss: 0.475815\n", + "[49]\tvalid_0's auc: 0.814907\tvalid_0's binary_logloss: 0.47503\n", + "[50]\tvalid_0's auc: 0.815278\tvalid_0's binary_logloss: 0.474588\n", + "[51]\tvalid_0's 
auc: 0.815535\tvalid_0's binary_logloss: 0.474171\n", + "[52]\tvalid_0's auc: 0.815685\tvalid_0's binary_logloss: 0.473335\n", + "[53]\tvalid_0's auc: 0.815787\tvalid_0's binary_logloss: 0.472509\n", + "[54]\tvalid_0's auc: 0.815827\tvalid_0's binary_logloss: 0.471686\n", + "[55]\tvalid_0's auc: 0.815871\tvalid_0's binary_logloss: 0.470838\n", + "[56]\tvalid_0's auc: 0.816238\tvalid_0's binary_logloss: 0.470285\n", + "[57]\tvalid_0's auc: 0.816269\tvalid_0's binary_logloss: 0.469495\n", + "[58]\tvalid_0's auc: 0.816528\tvalid_0's binary_logloss: 0.468654\n", + "[59]\tvalid_0's auc: 0.816706\tvalid_0's binary_logloss: 0.468122\n", + "[60]\tvalid_0's auc: 0.816821\tvalid_0's binary_logloss: 0.467352\n", + "[61]\tvalid_0's auc: 0.816759\tvalid_0's binary_logloss: 0.466622\n", + "[62]\tvalid_0's auc: 0.81682\tvalid_0's binary_logloss: 0.465867\n", + "[63]\tvalid_0's auc: 0.817251\tvalid_0's binary_logloss: 0.465112\n", + "[64]\tvalid_0's auc: 0.817476\tvalid_0's binary_logloss: 0.464589\n", + "[65]\tvalid_0's auc: 0.817613\tvalid_0's binary_logloss: 0.463831\n", + "[66]\tvalid_0's auc: 0.817648\tvalid_0's binary_logloss: 0.463098\n", + "[67]\tvalid_0's auc: 0.817719\tvalid_0's binary_logloss: 0.462414\n", + "[68]\tvalid_0's auc: 0.817814\tvalid_0's binary_logloss: 0.461727\n", + "[69]\tvalid_0's auc: 0.817973\tvalid_0's binary_logloss: 0.461329\n", + "[70]\tvalid_0's auc: 0.818108\tvalid_0's binary_logloss: 0.460674\n", + "[71]\tvalid_0's auc: 0.818347\tvalid_0's binary_logloss: 0.460222\n", + "[72]\tvalid_0's auc: 0.818456\tvalid_0's binary_logloss: 0.45977\n", + "[73]\tvalid_0's auc: 0.818727\tvalid_0's binary_logloss: 0.459157\n", + "[74]\tvalid_0's auc: 0.818988\tvalid_0's binary_logloss: 0.458437\n", + "[75]\tvalid_0's auc: 0.819144\tvalid_0's binary_logloss: 0.457808\n", + "[76]\tvalid_0's auc: 0.819259\tvalid_0's binary_logloss: 0.457159\n", + "[77]\tvalid_0's auc: 0.819343\tvalid_0's binary_logloss: 0.456512\n", + "[78]\tvalid_0's auc: 0.81954\tvalid_0's 
binary_logloss: 0.456045\n", + "[79]\tvalid_0's auc: 0.819687\tvalid_0's binary_logloss: 0.455416\n", + "[80]\tvalid_0's auc: 0.819958\tvalid_0's binary_logloss: 0.454765\n", + "[81]\tvalid_0's auc: 0.820115\tvalid_0's binary_logloss: 0.45436\n", + "[82]\tvalid_0's auc: 0.820536\tvalid_0's binary_logloss: 0.453965\n", + "[83]\tvalid_0's auc: 0.820649\tvalid_0's binary_logloss: 0.453383\n", + "[84]\tvalid_0's auc: 0.820663\tvalid_0's binary_logloss: 0.452804\n", + "[85]\tvalid_0's auc: 0.820809\tvalid_0's binary_logloss: 0.452167\n", + "[86]\tvalid_0's auc: 0.821024\tvalid_0's binary_logloss: 0.451735\n", + "[87]\tvalid_0's auc: 0.821124\tvalid_0's binary_logloss: 0.451167\n", + "[88]\tvalid_0's auc: 0.821243\tvalid_0's binary_logloss: 0.45061\n", + "[89]\tvalid_0's auc: 0.821404\tvalid_0's binary_logloss: 0.450215\n", + "[90]\tvalid_0's auc: 0.821488\tvalid_0's binary_logloss: 0.449656\n", + "[91]\tvalid_0's auc: 0.821538\tvalid_0's binary_logloss: 0.449107\n", + "[92]\tvalid_0's auc: 0.82172\tvalid_0's binary_logloss: 0.448752\n", + "[93]\tvalid_0's auc: 0.821809\tvalid_0's binary_logloss: 0.448188\n", + "[94]\tvalid_0's auc: 0.82184\tvalid_0's binary_logloss: 0.447659\n", + "[95]\tvalid_0's auc: 0.821971\tvalid_0's binary_logloss: 0.447108\n", + "[96]\tvalid_0's auc: 0.822086\tvalid_0's binary_logloss: 0.446596\n", + "[97]\tvalid_0's auc: 0.82247\tvalid_0's binary_logloss: 0.446244\n", + "[98]\tvalid_0's auc: 0.822951\tvalid_0's binary_logloss: 0.445812\n", + "[99]\tvalid_0's auc: 0.822991\tvalid_0's binary_logloss: 0.445329\n", + "[100]\tvalid_0's auc: 0.823174\tvalid_0's binary_logloss: 0.445037\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's auc: 0.823174\tvalid_0's binary_logloss: 0.445037\n", + "[1]\tvalid_0's auc: 0.769525\tvalid_0's binary_logloss: 0.526256\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's auc: 0.775857\tvalid_0's binary_logloss: 0.524594\n", + "[3]\tvalid_0's auc: 0.785307\tvalid_0's binary_logloss: 0.523606\n", + "[4]\tvalid_0's auc: 0.786356\tvalid_0's binary_logloss: 0.522495\n", + "[5]\tvalid_0's auc: 0.793385\tvalid_0's binary_logloss: 0.520812\n", + "[6]\tvalid_0's auc: 0.794014\tvalid_0's binary_logloss: 0.519253\n", + "[7]\tvalid_0's auc: 0.795454\tvalid_0's binary_logloss: 0.517961\n", + "[8]\tvalid_0's auc: 0.79807\tvalid_0's binary_logloss: 0.516363\n", + "[9]\tvalid_0's auc: 0.798756\tvalid_0's binary_logloss: 0.51548\n", + "[10]\tvalid_0's auc: 0.798314\tvalid_0's binary_logloss: 0.514021\n", + "[11]\tvalid_0's auc: 0.799343\tvalid_0's binary_logloss: 0.512678\n", + "[12]\tvalid_0's auc: 0.799573\tvalid_0's binary_logloss: 0.511708\n", + "[13]\tvalid_0's auc: 0.799563\tvalid_0's binary_logloss: 0.510892\n", + "[14]\tvalid_0's auc: 0.800333\tvalid_0's binary_logloss: 0.509532\n", + "[15]\tvalid_0's auc: 0.800672\tvalid_0's binary_logloss: 0.508117\n", + "[16]\tvalid_0's auc: 0.801953\tvalid_0's binary_logloss: 0.506866\n", + "[17]\tvalid_0's auc: 0.802078\tvalid_0's binary_logloss: 0.5055\n", + "[18]\tvalid_0's auc: 0.802449\tvalid_0's binary_logloss: 0.504358\n", + "[19]\tvalid_0's auc: 0.802329\tvalid_0's binary_logloss: 0.503503\n", + "[20]\tvalid_0's auc: 0.802437\tvalid_0's binary_logloss: 0.502233\n", + "[21]\tvalid_0's auc: 0.802653\tvalid_0's binary_logloss: 0.50174\n", + "[22]\tvalid_0's auc: 0.803753\tvalid_0's binary_logloss: 0.50056\n", + "[23]\tvalid_0's auc: 0.803956\tvalid_0's binary_logloss: 0.499496\n", + "[24]\tvalid_0's auc: 0.804231\tvalid_0's binary_logloss: 0.498283\n", + "[25]\tvalid_0's auc: 0.804554\tvalid_0's binary_logloss: 0.497059\n", + "[26]\tvalid_0's auc: 
0.805133\tvalid_0's binary_logloss: 0.495963\n", + "[27]\tvalid_0's auc: 0.805333\tvalid_0's binary_logloss: 0.494842\n", + "[28]\tvalid_0's auc: 0.805644\tvalid_0's binary_logloss: 0.493771\n", + "[29]\tvalid_0's auc: 0.806029\tvalid_0's binary_logloss: 0.492598\n", + "[30]\tvalid_0's auc: 0.806321\tvalid_0's binary_logloss: 0.491474\n", + "[31]\tvalid_0's auc: 0.806201\tvalid_0's binary_logloss: 0.490419\n", + "[32]\tvalid_0's auc: 0.806671\tvalid_0's binary_logloss: 0.489393\n", + "[33]\tvalid_0's auc: 0.806899\tvalid_0's binary_logloss: 0.488331\n", + "[34]\tvalid_0's auc: 0.807105\tvalid_0's binary_logloss: 0.487277\n", + "[35]\tvalid_0's auc: 0.807257\tvalid_0's binary_logloss: 0.486592\n", + "[36]\tvalid_0's auc: 0.80729\tvalid_0's binary_logloss: 0.485607\n", + "[37]\tvalid_0's auc: 0.807752\tvalid_0's binary_logloss: 0.484951\n", + "[38]\tvalid_0's auc: 0.808191\tvalid_0's binary_logloss: 0.484269\n", + "[39]\tvalid_0's auc: 0.808417\tvalid_0's binary_logloss: 0.483242\n", + "[40]\tvalid_0's auc: 0.808761\tvalid_0's binary_logloss: 0.482291\n", + "[41]\tvalid_0's auc: 0.80965\tvalid_0's binary_logloss: 0.48164\n", + "[42]\tvalid_0's auc: 0.810065\tvalid_0's binary_logloss: 0.480962\n", + "[43]\tvalid_0's auc: 0.810209\tvalid_0's binary_logloss: 0.479995\n", + "[44]\tvalid_0's auc: 0.810091\tvalid_0's binary_logloss: 0.479077\n", + "[45]\tvalid_0's auc: 0.810573\tvalid_0's binary_logloss: 0.478185\n", + "[46]\tvalid_0's auc: 0.810924\tvalid_0's binary_logloss: 0.477558\n", + "[47]\tvalid_0's auc: 0.810951\tvalid_0's binary_logloss: 0.476662\n", + "[48]\tvalid_0's auc: 0.811101\tvalid_0's binary_logloss: 0.475745\n", + "[49]\tvalid_0's auc: 0.811269\tvalid_0's binary_logloss: 0.474951\n", + "[50]\tvalid_0's auc: 0.81173\tvalid_0's binary_logloss: 0.474514\n", + "[51]\tvalid_0's auc: 0.811937\tvalid_0's binary_logloss: 0.474114\n", + "[52]\tvalid_0's auc: 0.812136\tvalid_0's binary_logloss: 0.473297\n", + "[53]\tvalid_0's auc: 0.812249\tvalid_0's 
binary_logloss: 0.472497\n", + "[54]\tvalid_0's auc: 0.812121\tvalid_0's binary_logloss: 0.471696\n", + "[55]\tvalid_0's auc: 0.812164\tvalid_0's binary_logloss: 0.470905\n", + "[56]\tvalid_0's auc: 0.812462\tvalid_0's binary_logloss: 0.470384\n", + "[57]\tvalid_0's auc: 0.812613\tvalid_0's binary_logloss: 0.4696\n", + "[58]\tvalid_0's auc: 0.812615\tvalid_0's binary_logloss: 0.468778\n", + "[59]\tvalid_0's auc: 0.812842\tvalid_0's binary_logloss: 0.468211\n", + "[60]\tvalid_0's auc: 0.81312\tvalid_0's binary_logloss: 0.467385\n", + "[61]\tvalid_0's auc: 0.813039\tvalid_0's binary_logloss: 0.466632\n", + "[62]\tvalid_0's auc: 0.812942\tvalid_0's binary_logloss: 0.465933\n", + "[63]\tvalid_0's auc: 0.813274\tvalid_0's binary_logloss: 0.465214\n", + "[64]\tvalid_0's auc: 0.813572\tvalid_0's binary_logloss: 0.464692\n", + "[65]\tvalid_0's auc: 0.813594\tvalid_0's binary_logloss: 0.463925\n", + "[66]\tvalid_0's auc: 0.813719\tvalid_0's binary_logloss: 0.463177\n", + "[67]\tvalid_0's auc: 0.814011\tvalid_0's binary_logloss: 0.462513\n", + "[68]\tvalid_0's auc: 0.813989\tvalid_0's binary_logloss: 0.461843\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[69]\tvalid_0's auc: 0.814218\tvalid_0's binary_logloss: 0.461443\n", + "[70]\tvalid_0's auc: 0.814334\tvalid_0's binary_logloss: 0.460775\n", + "[71]\tvalid_0's auc: 0.814493\tvalid_0's binary_logloss: 0.460332\n", + "[72]\tvalid_0's auc: 0.814663\tvalid_0's binary_logloss: 0.459867\n", + "[73]\tvalid_0's auc: 0.814856\tvalid_0's binary_logloss: 0.459266\n", + "[74]\tvalid_0's auc: 0.815017\tvalid_0's binary_logloss: 0.458585\n", + "[75]\tvalid_0's auc: 0.815186\tvalid_0's binary_logloss: 0.457958\n", + "[76]\tvalid_0's auc: 0.815374\tvalid_0's binary_logloss: 0.457316\n", + "[77]\tvalid_0's auc: 0.81554\tvalid_0's binary_logloss: 0.45665\n", + "[78]\tvalid_0's auc: 0.81569\tvalid_0's binary_logloss: 0.456217\n", + "[79]\tvalid_0's auc: 0.815861\tvalid_0's binary_logloss: 0.455615\n", + 
"[80]\tvalid_0's auc: 0.816443\tvalid_0's binary_logloss: 0.454895\n", + "[81]\tvalid_0's auc: 0.816659\tvalid_0's binary_logloss: 0.454503\n", + "[82]\tvalid_0's auc: 0.817017\tvalid_0's binary_logloss: 0.454149\n", + "[83]\tvalid_0's auc: 0.817162\tvalid_0's binary_logloss: 0.453578\n", + "[84]\tvalid_0's auc: 0.817274\tvalid_0's binary_logloss: 0.452984\n", + "[85]\tvalid_0's auc: 0.817283\tvalid_0's binary_logloss: 0.452416\n", + "[86]\tvalid_0's auc: 0.817339\tvalid_0's binary_logloss: 0.452022\n", + "[87]\tvalid_0's auc: 0.817494\tvalid_0's binary_logloss: 0.45146\n", + "[88]\tvalid_0's auc: 0.817594\tvalid_0's binary_logloss: 0.450926\n", + "[89]\tvalid_0's auc: 0.817771\tvalid_0's binary_logloss: 0.450553\n", + "[90]\tvalid_0's auc: 0.81789\tvalid_0's binary_logloss: 0.449985\n", + "[91]\tvalid_0's auc: 0.817931\tvalid_0's binary_logloss: 0.449439\n", + "[92]\tvalid_0's auc: 0.818138\tvalid_0's binary_logloss: 0.449094\n", + "[93]\tvalid_0's auc: 0.818334\tvalid_0's binary_logloss: 0.448527\n", + "[94]\tvalid_0's auc: 0.818426\tvalid_0's binary_logloss: 0.447989\n", + "[95]\tvalid_0's auc: 0.818676\tvalid_0's binary_logloss: 0.447407\n", + "[96]\tvalid_0's auc: 0.818852\tvalid_0's binary_logloss: 0.446884\n", + "[97]\tvalid_0's auc: 0.81945\tvalid_0's binary_logloss: 0.446455\n", + "[98]\tvalid_0's auc: 0.819861\tvalid_0's binary_logloss: 0.446045\n", + "[99]\tvalid_0's auc: 0.819943\tvalid_0's binary_logloss: 0.445543\n", + "[100]\tvalid_0's auc: 0.820076\tvalid_0's binary_logloss: 0.445258\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's auc: 0.820076\tvalid_0's binary_logloss: 0.445258\n", + "[1]\tvalid_0's auc: 0.770032\tvalid_0's binary_logloss: 0.527241\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's auc: 0.779881\tvalid_0's binary_logloss: 0.525545\n", + "[3]\tvalid_0's auc: 0.791308\tvalid_0's binary_logloss: 0.524508\n", + "[4]\tvalid_0's auc: 0.790788\tvalid_0's binary_logloss: 0.52341\n", + "[5]\tvalid_0's auc: 0.795645\tvalid_0's binary_logloss: 0.521753\n", + "[6]\tvalid_0's auc: 0.797745\tvalid_0's binary_logloss: 0.520131\n", + "[7]\tvalid_0's auc: 0.79931\tvalid_0's binary_logloss: 0.518872\n", + "[8]\tvalid_0's auc: 0.800014\tvalid_0's binary_logloss: 0.517353\n", + "[9]\tvalid_0's auc: 0.800549\tvalid_0's binary_logloss: 0.516487\n", + "[10]\tvalid_0's auc: 0.800261\tvalid_0's binary_logloss: 0.515039\n", + "[11]\tvalid_0's auc: 0.801261\tvalid_0's binary_logloss: 0.513695\n", + "[12]\tvalid_0's auc: 0.801062\tvalid_0's binary_logloss: 0.512735\n", + "[13]\tvalid_0's auc: 0.801155\tvalid_0's binary_logloss: 0.51192\n", + "[14]\tvalid_0's auc: 0.801315\tvalid_0's binary_logloss: 0.510559\n", + "[15]\tvalid_0's auc: 0.80185\tvalid_0's binary_logloss: 0.509147\n", + "[16]\tvalid_0's auc: 0.803029\tvalid_0's binary_logloss: 0.507914\n", + "[17]\tvalid_0's auc: 0.803035\tvalid_0's binary_logloss: 0.506583\n", + "[18]\tvalid_0's auc: 0.803433\tvalid_0's binary_logloss: 0.505441\n", + "[19]\tvalid_0's auc: 0.803717\tvalid_0's binary_logloss: 0.504599\n", + "[20]\tvalid_0's auc: 0.803819\tvalid_0's binary_logloss: 0.503327\n", + "[21]\tvalid_0's auc: 0.803923\tvalid_0's binary_logloss: 0.502782\n", + "[22]\tvalid_0's auc: 0.804939\tvalid_0's binary_logloss: 0.501596\n", + "[23]\tvalid_0's auc: 0.804707\tvalid_0's binary_logloss: 0.500572\n", + "[24]\tvalid_0's auc: 0.804632\tvalid_0's binary_logloss: 0.499367\n", + "[25]\tvalid_0's auc: 0.804756\tvalid_0's binary_logloss: 0.498161\n", + "[26]\tvalid_0's 
auc: 0.805067\tvalid_0's binary_logloss: 0.497061\n", + "[27]\tvalid_0's auc: 0.805119\tvalid_0's binary_logloss: 0.495933\n", + "[28]\tvalid_0's auc: 0.805304\tvalid_0's binary_logloss: 0.494849\n", + "[29]\tvalid_0's auc: 0.805688\tvalid_0's binary_logloss: 0.493677\n", + "[30]\tvalid_0's auc: 0.805822\tvalid_0's binary_logloss: 0.492594\n", + "[31]\tvalid_0's auc: 0.805869\tvalid_0's binary_logloss: 0.49152\n", + "[32]\tvalid_0's auc: 0.807267\tvalid_0's binary_logloss: 0.490435\n", + "[33]\tvalid_0's auc: 0.807301\tvalid_0's binary_logloss: 0.489392\n", + "[34]\tvalid_0's auc: 0.80736\tvalid_0's binary_logloss: 0.488325\n", + "[35]\tvalid_0's auc: 0.807706\tvalid_0's binary_logloss: 0.487654\n", + "[36]\tvalid_0's auc: 0.807758\tvalid_0's binary_logloss: 0.486651\n", + "[37]\tvalid_0's auc: 0.808051\tvalid_0's binary_logloss: 0.486012\n", + "[38]\tvalid_0's auc: 0.808429\tvalid_0's binary_logloss: 0.485355\n", + "[39]\tvalid_0's auc: 0.808663\tvalid_0's binary_logloss: 0.484327\n", + "[40]\tvalid_0's auc: 0.809007\tvalid_0's binary_logloss: 0.483386\n", + "[41]\tvalid_0's auc: 0.809781\tvalid_0's binary_logloss: 0.482745\n", + "[42]\tvalid_0's auc: 0.810071\tvalid_0's binary_logloss: 0.482124\n", + "[43]\tvalid_0's auc: 0.810383\tvalid_0's binary_logloss: 0.481154\n", + "[44]\tvalid_0's auc: 0.810446\tvalid_0's binary_logloss: 0.480243\n", + "[45]\tvalid_0's auc: 0.811148\tvalid_0's binary_logloss: 0.479261\n", + "[46]\tvalid_0's auc: 0.811245\tvalid_0's binary_logloss: 0.478687\n", + "[47]\tvalid_0's auc: 0.811214\tvalid_0's binary_logloss: 0.477812\n", + "[48]\tvalid_0's auc: 0.811408\tvalid_0's binary_logloss: 0.47689\n", + "[49]\tvalid_0's auc: 0.811486\tvalid_0's binary_logloss: 0.476132\n", + "[50]\tvalid_0's auc: 0.811806\tvalid_0's binary_logloss: 0.475718\n", + "[51]\tvalid_0's auc: 0.812017\tvalid_0's binary_logloss: 0.475342\n", + "[52]\tvalid_0's auc: 0.812255\tvalid_0's binary_logloss: 0.474505\n", + "[53]\tvalid_0's auc: 0.812249\tvalid_0's 
binary_logloss: 0.473707\n", + "[54]\tvalid_0's auc: 0.812235\tvalid_0's binary_logloss: 0.47289\n", + "[55]\tvalid_0's auc: 0.812233\tvalid_0's binary_logloss: 0.472091\n", + "[56]\tvalid_0's auc: 0.812492\tvalid_0's binary_logloss: 0.471563\n", + "[57]\tvalid_0's auc: 0.812579\tvalid_0's binary_logloss: 0.47077\n", + "[58]\tvalid_0's auc: 0.812598\tvalid_0's binary_logloss: 0.469992\n", + "[59]\tvalid_0's auc: 0.812885\tvalid_0's binary_logloss: 0.469458\n", + "[60]\tvalid_0's auc: 0.812995\tvalid_0's binary_logloss: 0.468676\n", + "[61]\tvalid_0's auc: 0.812961\tvalid_0's binary_logloss: 0.467939\n", + "[62]\tvalid_0's auc: 0.812919\tvalid_0's binary_logloss: 0.467232\n", + "[63]\tvalid_0's auc: 0.813291\tvalid_0's binary_logloss: 0.466491\n", + "[64]\tvalid_0's auc: 0.813702\tvalid_0's binary_logloss: 0.465945\n", + "[65]\tvalid_0's auc: 0.813803\tvalid_0's binary_logloss: 0.465197\n", + "[66]\tvalid_0's auc: 0.813851\tvalid_0's binary_logloss: 0.4645\n", + "[67]\tvalid_0's auc: 0.814011\tvalid_0's binary_logloss: 0.463814\n", + "[68]\tvalid_0's auc: 0.814027\tvalid_0's binary_logloss: 0.463113\n", + "[69]\tvalid_0's auc: 0.814138\tvalid_0's binary_logloss: 0.462727\n", + "[70]\tvalid_0's auc: 0.814365\tvalid_0's binary_logloss: 0.462077\n", + "[71]\tvalid_0's auc: 0.814432\tvalid_0's binary_logloss: 0.461655\n", + "[72]\tvalid_0's auc: 0.8146\tvalid_0's binary_logloss: 0.461194\n", + "[73]\tvalid_0's auc: 0.815324\tvalid_0's binary_logloss: 0.460477\n", + "[74]\tvalid_0's auc: 0.815411\tvalid_0's binary_logloss: 0.459805\n", + "[75]\tvalid_0's auc: 0.815548\tvalid_0's binary_logloss: 0.459189\n", + "[76]\tvalid_0's auc: 0.815625\tvalid_0's binary_logloss: 0.458525\n", + "[77]\tvalid_0's auc: 0.81562\tvalid_0's binary_logloss: 0.457905\n", + "[78]\tvalid_0's auc: 0.815786\tvalid_0's binary_logloss: 0.45747\n", + "[79]\tvalid_0's auc: 0.815834\tvalid_0's binary_logloss: 0.456884\n", + "[80]\tvalid_0's auc: 0.816475\tvalid_0's binary_logloss: 0.45617\n", + 
"[81]\tvalid_0's auc: 0.816677\tvalid_0's binary_logloss: 0.455787\n", + "[82]\tvalid_0's auc: 0.817255\tvalid_0's binary_logloss: 0.455358\n", + "[83]\tvalid_0's auc: 0.817383\tvalid_0's binary_logloss: 0.454775\n", + "[84]\tvalid_0's auc: 0.817509\tvalid_0's binary_logloss: 0.454176\n", + "[85]\tvalid_0's auc: 0.817572\tvalid_0's binary_logloss: 0.453609\n", + "[86]\tvalid_0's auc: 0.817721\tvalid_0's binary_logloss: 0.453213\n", + "[87]\tvalid_0's auc: 0.817992\tvalid_0's binary_logloss: 0.452586\n", + "[88]\tvalid_0's auc: 0.81808\tvalid_0's binary_logloss: 0.45204\n", + "[89]\tvalid_0's auc: 0.818202\tvalid_0's binary_logloss: 0.451643\n", + "[90]\tvalid_0's auc: 0.818336\tvalid_0's binary_logloss: 0.451081\n", + "[91]\tvalid_0's auc: 0.818347\tvalid_0's binary_logloss: 0.450531\n", + "[92]\tvalid_0's auc: 0.818558\tvalid_0's binary_logloss: 0.450179\n", + "[93]\tvalid_0's auc: 0.818743\tvalid_0's binary_logloss: 0.449647\n", + "[94]\tvalid_0's auc: 0.818789\tvalid_0's binary_logloss: 0.449133\n", + "[95]\tvalid_0's auc: 0.818849\tvalid_0's binary_logloss: 0.44862\n", + "[96]\tvalid_0's auc: 0.81913\tvalid_0's binary_logloss: 0.448072\n", + "[97]\tvalid_0's auc: 0.819526\tvalid_0's binary_logloss: 0.447713\n", + "[98]\tvalid_0's auc: 0.819971\tvalid_0's binary_logloss: 0.447296\n", + "[99]\tvalid_0's auc: 0.819972\tvalid_0's binary_logloss: 0.446814\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[100]\tvalid_0's auc: 0.820086\tvalid_0's binary_logloss: 0.446533\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's auc: 0.820086\tvalid_0's binary_logloss: 0.446533\n", + "[1]\tvalid_0's auc: 0.768646\tvalid_0's binary_logloss: 0.527167\n", + "Training until validation scores don't improve for 50 rounds\n", + "[2]\tvalid_0's auc: 0.779902\tvalid_0's binary_logloss: 0.525481\n", + "[3]\tvalid_0's auc: 0.789868\tvalid_0's binary_logloss: 0.524485\n", + "[4]\tvalid_0's auc: 0.791895\tvalid_0's binary_logloss: 0.523382\n", + "[5]\tvalid_0's auc: 0.795453\tvalid_0's binary_logloss: 0.521759\n", + "[6]\tvalid_0's auc: 0.796672\tvalid_0's binary_logloss: 0.520166\n", + "[7]\tvalid_0's auc: 0.798023\tvalid_0's binary_logloss: 0.518857\n", + "[8]\tvalid_0's auc: 0.799331\tvalid_0's binary_logloss: 0.517297\n", + "[9]\tvalid_0's auc: 0.800181\tvalid_0's binary_logloss: 0.516416\n", + "[10]\tvalid_0's auc: 0.800373\tvalid_0's binary_logloss: 0.514967\n", + "[11]\tvalid_0's auc: 0.801087\tvalid_0's binary_logloss: 0.513631\n", + "[12]\tvalid_0's auc: 0.801122\tvalid_0's binary_logloss: 0.512658\n", + "[13]\tvalid_0's auc: 0.801043\tvalid_0's binary_logloss: 0.511833\n", + "[14]\tvalid_0's auc: 0.801238\tvalid_0's binary_logloss: 0.510461\n", + "[15]\tvalid_0's auc: 0.801847\tvalid_0's binary_logloss: 0.509034\n", + "[16]\tvalid_0's auc: 0.803139\tvalid_0's binary_logloss: 0.507759\n", + "[17]\tvalid_0's auc: 0.803577\tvalid_0's binary_logloss: 0.506361\n", + "[18]\tvalid_0's auc: 0.803834\tvalid_0's binary_logloss: 0.505229\n", + "[19]\tvalid_0's auc: 0.803943\tvalid_0's binary_logloss: 0.504371\n", + "[20]\tvalid_0's auc: 0.80415\tvalid_0's binary_logloss: 0.503102\n", + "[21]\tvalid_0's auc: 0.804446\tvalid_0's binary_logloss: 0.502564\n", + "[22]\tvalid_0's auc: 0.805163\tvalid_0's binary_logloss: 0.501396\n", + "[23]\tvalid_0's auc: 0.805323\tvalid_0's binary_logloss: 0.500327\n", + "[24]\tvalid_0's auc: 0.805314\tvalid_0's binary_logloss: 0.499123\n", + "[25]\tvalid_0's auc: 0.80535\tvalid_0's binary_logloss: 0.497927\n", + "[26]\tvalid_0's 
auc: 0.805864\tvalid_0's binary_logloss: 0.496834\n", + "[27]\tvalid_0's auc: 0.805919\tvalid_0's binary_logloss: 0.495667\n", + "[28]\tvalid_0's auc: 0.806272\tvalid_0's binary_logloss: 0.494606\n", + "[29]\tvalid_0's auc: 0.806599\tvalid_0's binary_logloss: 0.49343\n", + "[30]\tvalid_0's auc: 0.806932\tvalid_0's binary_logloss: 0.492303\n", + "[31]\tvalid_0's auc: 0.806656\tvalid_0's binary_logloss: 0.491249\n", + "[32]\tvalid_0's auc: 0.807436\tvalid_0's binary_logloss: 0.490188\n", + "[33]\tvalid_0's auc: 0.807629\tvalid_0's binary_logloss: 0.489117\n", + "[34]\tvalid_0's auc: 0.807501\tvalid_0's binary_logloss: 0.48808\n", + "[35]\tvalid_0's auc: 0.807885\tvalid_0's binary_logloss: 0.487383\n", + "[36]\tvalid_0's auc: 0.807921\tvalid_0's binary_logloss: 0.48636\n", + "[37]\tvalid_0's auc: 0.808267\tvalid_0's binary_logloss: 0.485724\n", + "[38]\tvalid_0's auc: 0.808563\tvalid_0's binary_logloss: 0.485076\n", + "[39]\tvalid_0's auc: 0.808813\tvalid_0's binary_logloss: 0.484039\n", + "[40]\tvalid_0's auc: 0.809023\tvalid_0's binary_logloss: 0.483091\n", + "[41]\tvalid_0's auc: 0.809782\tvalid_0's binary_logloss: 0.482441\n", + "[42]\tvalid_0's auc: 0.810135\tvalid_0's binary_logloss: 0.48179\n", + "[43]\tvalid_0's auc: 0.810219\tvalid_0's binary_logloss: 0.48082\n", + "[44]\tvalid_0's auc: 0.81031\tvalid_0's binary_logloss: 0.479906\n", + "[45]\tvalid_0's auc: 0.810514\tvalid_0's binary_logloss: 0.479024\n", + "[46]\tvalid_0's auc: 0.810566\tvalid_0's binary_logloss: 0.478437\n", + "[47]\tvalid_0's auc: 0.810611\tvalid_0's binary_logloss: 0.477529\n", + "[48]\tvalid_0's auc: 0.810781\tvalid_0's binary_logloss: 0.476637\n", + "[49]\tvalid_0's auc: 0.81089\tvalid_0's binary_logloss: 0.475883\n", + "[50]\tvalid_0's auc: 0.811266\tvalid_0's binary_logloss: 0.475459\n", + "[51]\tvalid_0's auc: 0.811402\tvalid_0's binary_logloss: 0.475078\n", + "[52]\tvalid_0's auc: 0.811765\tvalid_0's binary_logloss: 0.474246\n", + "[53]\tvalid_0's auc: 0.811891\tvalid_0's 
binary_logloss: 0.473452\n", + "[54]\tvalid_0's auc: 0.811868\tvalid_0's binary_logloss: 0.47263\n", + "[55]\tvalid_0's auc: 0.81192\tvalid_0's binary_logloss: 0.471804\n", + "[56]\tvalid_0's auc: 0.812272\tvalid_0's binary_logloss: 0.471275\n", + "[57]\tvalid_0's auc: 0.812639\tvalid_0's binary_logloss: 0.470396\n", + "[58]\tvalid_0's auc: 0.812764\tvalid_0's binary_logloss: 0.469597\n", + "[59]\tvalid_0's auc: 0.813084\tvalid_0's binary_logloss: 0.469049\n", + "[60]\tvalid_0's auc: 0.813342\tvalid_0's binary_logloss: 0.468244\n", + "[61]\tvalid_0's auc: 0.813302\tvalid_0's binary_logloss: 0.467499\n", + "[62]\tvalid_0's auc: 0.813221\tvalid_0's binary_logloss: 0.466758\n", + "[63]\tvalid_0's auc: 0.813697\tvalid_0's binary_logloss: 0.466017\n", + "[64]\tvalid_0's auc: 0.813985\tvalid_0's binary_logloss: 0.465501\n", + "[65]\tvalid_0's auc: 0.81416\tvalid_0's binary_logloss: 0.464725\n", + "[66]\tvalid_0's auc: 0.814227\tvalid_0's binary_logloss: 0.46398\n", + "[67]\tvalid_0's auc: 0.814397\tvalid_0's binary_logloss: 0.463309\n", + "[68]\tvalid_0's auc: 0.814426\tvalid_0's binary_logloss: 0.462627\n", + "[69]\tvalid_0's auc: 0.814593\tvalid_0's binary_logloss: 0.462244\n", + "[70]\tvalid_0's auc: 0.814789\tvalid_0's binary_logloss: 0.461571\n", + "[71]\tvalid_0's auc: 0.814889\tvalid_0's binary_logloss: 0.461144\n", + "[72]\tvalid_0's auc: 0.815078\tvalid_0's binary_logloss: 0.460684\n", + "[73]\tvalid_0's auc: 0.815439\tvalid_0's binary_logloss: 0.460063\n", + "[74]\tvalid_0's auc: 0.815511\tvalid_0's binary_logloss: 0.459386\n", + "[75]\tvalid_0's auc: 0.815574\tvalid_0's binary_logloss: 0.45877\n", + "[76]\tvalid_0's auc: 0.815634\tvalid_0's binary_logloss: 0.458128\n", + "[77]\tvalid_0's auc: 0.815618\tvalid_0's binary_logloss: 0.457495\n", + "[78]\tvalid_0's auc: 0.81582\tvalid_0's binary_logloss: 0.457057\n", + "[79]\tvalid_0's auc: 0.81594\tvalid_0's binary_logloss: 0.456475\n", + "[80]\tvalid_0's auc: 0.815961\tvalid_0's binary_logloss: 0.455885\n", + 
"[81]\tvalid_0's auc: 0.816153\tvalid_0's binary_logloss: 0.455511\n", + "[82]\tvalid_0's auc: 0.816433\tvalid_0's binary_logloss: 0.455186\n", + "[83]\tvalid_0's auc: 0.816546\tvalid_0's binary_logloss: 0.454625\n", + "[84]\tvalid_0's auc: 0.816586\tvalid_0's binary_logloss: 0.454039\n", + "[85]\tvalid_0's auc: 0.816584\tvalid_0's binary_logloss: 0.453482\n", + "[86]\tvalid_0's auc: 0.816881\tvalid_0's binary_logloss: 0.453048\n", + "[87]\tvalid_0's auc: 0.817029\tvalid_0's binary_logloss: 0.452485\n", + "[88]\tvalid_0's auc: 0.81707\tvalid_0's binary_logloss: 0.451941\n", + "[89]\tvalid_0's auc: 0.817298\tvalid_0's binary_logloss: 0.451544\n", + "[90]\tvalid_0's auc: 0.817343\tvalid_0's binary_logloss: 0.450975\n", + "[91]\tvalid_0's auc: 0.817357\tvalid_0's binary_logloss: 0.450422\n", + "[92]\tvalid_0's auc: 0.817592\tvalid_0's binary_logloss: 0.450109\n", + "[93]\tvalid_0's auc: 0.817729\tvalid_0's binary_logloss: 0.449542\n", + "[94]\tvalid_0's auc: 0.817834\tvalid_0's binary_logloss: 0.448982\n", + "[95]\tvalid_0's auc: 0.81809\tvalid_0's binary_logloss: 0.448398\n", + "[96]\tvalid_0's auc: 0.818269\tvalid_0's binary_logloss: 0.447908\n", + "[97]\tvalid_0's auc: 0.818682\tvalid_0's binary_logloss: 0.447547\n", + "[98]\tvalid_0's auc: 0.819015\tvalid_0's binary_logloss: 0.447165\n", + "[99]\tvalid_0's auc: 0.819016\tvalid_0's binary_logloss: 0.446669\n", + "[100]\tvalid_0's auc: 0.819127\tvalid_0's binary_logloss: 0.446397\n", + "Did not meet early stopping. 
Best iteration is:\n", + "[100]\tvalid_0's auc: 0.819127\tvalid_0's binary_logloss: 0.446397\n" + ] + } + ], + "source": [ + "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", + "# 这一部分与前面的单独训练和验证是分开的\n", + "def get_kfold_users(trn_df, n=5):\n", + " user_ids = trn_df['user_id'].unique()\n", + " user_set = [user_ids[i::n] for i in range(n)]\n", + " return user_set\n", + "\n", + "k_fold = 5\n", + "trn_df = trn_user_item_feats_df_rank_model\n", + "user_set = get_kfold_users(trn_df, n=k_fold)\n", + "\n", + "score_list = []\n", + "score_df = trn_df[['user_id', 'click_article_id', 'label']]\n", + "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", + "\n", + "# 五折交叉验证,并将中间结果保存用于staking\n", + "for n_fold, valid_user in enumerate(user_set):\n", + " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", + " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", + " \n", + " # 模型及参数的定义\n", + " lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,\n", + " max_depth=-1, n_estimators=100, subsample=0.7, colsample_bytree=0.7, subsample_freq=1,\n", + " learning_rate=0.01, min_child_weight=50, random_state=2018, n_jobs= 16, verbose=10) \n", + " # 训练模型\n", + " lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],eval_set=[(valid_idx[lgb_cols], valid_idx['label'])], \n", + " eval_metric=['auc', ],early_stopping_rounds=50, )\n", + " \n", + " # 预测验证集结果\n", + " valid_idx['pred_score'] = lgb_Classfication.predict_proba(valid_idx[lgb_cols], \n", + " num_iteration=lgb_Classfication.best_iteration_)[:,1]\n", + " \n", + " # 对输出结果进行归一化 分类模型输出的值本身就是一个概率值不需要进行归一化\n", + " # valid_idx['pred_score'] = valid_idx[['pred_score']].transform(lambda x: norm_sim(x))\n", + " \n", + " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", + " valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + " \n", + " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", + " 
score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", + " \n", + " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", + " if not offline:\n", + " sub_preds += lgb_Classfication.predict_proba(tst_user_item_feats_df_rank_model[lgb_cols], \n", + " num_iteration=lgb_Classfication.best_iteration_)[:,1]\n", + " \n", + "score_df_ = pd.concat(score_list, axis=0)\n", + "score_df = score_df.merge(score_df_, how='left', on=['user_id', 'click_article_id'])\n", + "# 保存训练集交叉验证产生的新特征\n", + "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)\n", + " \n", + "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", + "tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold\n", + "tst_user_item_feats_df_rank_model['pred_score'] = tst_user_item_feats_df_rank_model['pred_score'].transform(lambda x: norm_sim(x))\n", + "tst_user_item_feats_df_rank_model.sort_values(by=['user_id', 'pred_score'])\n", + "tst_user_item_feats_df_rank_model['pred_rank'] = tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + "\n", + "# 保存测试集交叉验证的新特征\n", + "tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_lgb_cls_feats.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:24:23.074237Z", + "start_time": "2020-11-18T04:24:13.812284Z" + } + }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "rank_results = tst_user_item_feats_df_rank_model[['user_id', 'click_article_id', 'pred_score']]\n", + "rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)\n", + "submit(rank_results, topk=5, model_name='lgb_cls')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { 
+ "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DIN模型" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 用户的历史点击行为列表\n", + "这个是为后面的DIN模型服务的" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:24:30.508213Z", + "start_time": "2020-11-18T04:24:27.426372Z" + } + }, + "outputs": [], + "source": [ + "if offline:\n", + " all_data = pd.read_csv('./data_raw/train_click_log.csv')\n", + "else:\n", + " trn_data = pd.read_csv('./data_raw/train_click_log.csv')\n", + " tst_data = pd.read_csv('./data_raw/testA_click_log.csv')\n", + " all_data = trn_data.append(tst_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:25:28.082071Z", + "start_time": "2020-11-18T04:24:33.649524Z" + } + }, + "outputs": [], + "source": [ + "hist_click =all_data[['user_id', 'click_article_id']].groupby('user_id').agg({list}).reset_index()\n", + "his_behavior_df = pd.DataFrame()\n", + "his_behavior_df['user_id'] = hist_click['user_id']\n", + "his_behavior_df['hist_click_article_id'] = hist_click['click_article_id']" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:25:52.925866Z", + "start_time": "2020-11-18T04:25:52.863922Z" + } + }, + "outputs": [], + "source": [ + "trn_user_item_feats_df_din_model = trn_user_item_feats_df.copy()\n", + "\n", + "if offline:\n", + " val_user_item_feats_df_din_model = val_user_item_feats_df.copy()\n", + "else: \n", + " val_user_item_feats_df_din_model = None\n", + " \n", + "tst_user_item_feats_df_din_model = tst_user_item_feats_df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:00.070681Z", + "start_time": "2020-11-18T04:25:56.417197Z" + } + }, + "outputs": [], + "source": [ + "trn_user_item_feats_df_din_model = 
trn_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')\n", + "\n", + "if offline:\n", + " val_user_item_feats_df_din_model = val_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')\n", + "else:\n", + " val_user_item_feats_df_din_model = None\n", + "\n", + "tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_behavior_df, on='user_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DIN模型简介\n", + "我们下面尝试使用DIN模型, DIN的全称是Deep Interest Network, 这是阿里2018年基于前面的深度学习模型无法表达用户多样化的兴趣而提出的一个模型, 它可以通过考虑【给定的候选广告】和【用户的历史行为】的相关性,来计算用户兴趣的表示向量。具体来说就是通过引入局部激活单元,通过软搜索历史行为的相关部分来关注相关的用户兴趣,并采用加权和来获得有关候选广告的用户兴趣的表示。与候选广告相关性较高的行为会获得较高的激活权重,并支配着用户兴趣。该表示向量在不同广告上有所不同,大大提高了模型的表达能力。所以该模型对于此次新闻推荐的任务也比较适合, 我们在这里通过当前的候选文章与用户历史点击文章的相关性来计算用户对于文章的兴趣。 该模型的结构如下:\n", + "\n", + "![image-20201116201646983](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201116201646983.png)\n", + "\n", + "\n", + "我们这里直接调包来使用这个模型, 关于这个模型的详细细节部分我们会在下一期的推荐系统组队学习中给出。下面说一下该模型如何具体使用:deepctr的函数原型如下:\n", + "> def DIN(dnn_feature_columns, history_feature_list, dnn_use_bn=False,\n", + "> dnn_hidden_units=(200, 80), dnn_activation='relu', att_hidden_size=(80, 40), att_activation=\"dice\",\n", + "> att_weight_normalization=False, l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024,\n", + "> task='binary'):\n", + "> \n", + "> * dnn_feature_columns: 特征列, 包含数据所有特征的列表\n", + "> * history_feature_list: 用户历史行为列, 反映用户历史行为的特征的列表\n", + "> * dnn_use_bn: 是否使用BatchNormalization\n", + "> * dnn_hidden_units: 全连接层网络的层数和每一层神经元的个数, 一个列表或者元组\n", + "> * dnn_activation: 全连接网络的激活单元类型\n", + "> * att_hidden_size: 注意力层的全连接网络的层数和每一层神经元的个数\n", + "> * att_activation: 注意力层的激活单元类型\n", + "> * att_weight_normalization: 是否归一化注意力得分\n", + "> * l2_reg_dnn: 全连接网络的正则化系数\n", + "> * l2_reg_embedding: embedding向量的正则化系数\n", + "> * dnn_dropout: 全连接网络的神经元的失活概率\n", + 
"> * task: 任务, 可以是分类, 也可以是回归\n", + "\n", + "在具体使用的时候, 我们必须要传入特征列和历史行为列, 但是在传入之前, 我们需要进行一下特征列的预处理。具体如下:\n", + "\n", + "1. 首先,我们要处理数据集, 得到数据, 由于我们是基于用户过去的行为去预测用户是否点击当前文章, 所以我们需要把数据的特征列划分成数值型特征, 离散型特征和历史行为特征列三部分, 对于每一部分, DIN模型的处理会有不同。\n", + " 1. 对于离散型特征, 在我们的数据集中就是那些类别型的特征, 比如user_id这种, 这种类别型特征, 我们首先要经过embedding处理得到每个特征的低维稠密型表示, 既然要经过embedding, 那么我们就需要为每一列的类别特征的取值建立一个字典,并指明embedding维度, 所以在使用deepctr的DIN模型准备数据的时候, 我们需要通过SparseFeat函数指明这些类别型特征, 这个函数的传入参数就是列名, 列的唯一取值个数(建立字典用)和embedding维度。\n", + " 2. 对于用户历史行为特征列, 比如文章id, 文章的类别等这种, 同样的我们需要先经过embedding处理, 只不过和上面不一样的地方是,对于这种特征, 我们在得到每个特征的embedding表示之后, 还需要通过一个Attention_layer计算用户的历史行为和当前候选文章的相关性以此得到当前用户的embedding向量, 这个向量就可以基于当前的候选文章与用户过去点击过的历史文章的相似性的程度来反映用户的兴趣, 并且随着用户的不同的历史点击来变化,去动态地模拟用户兴趣的变化过程。这类特征对于每个用户都是一个历史行为序列, 对于每个用户, 历史行为序列长度会不一样, 可能有的用户点击的历史文章多,有的点击的历史文章少, 所以我们还需要把这个长度统一起来, 在为DIN模型准备数据的时候, 我们首先要通过SparseFeat函数指明这些类别型特征, 然后还需要通过VarLenSparseFeat函数再进行序列填充, 使得每个用户的历史序列一样长, 所以这个函数参数中会有个maxlen,来指明序列的最大长度是多少。\n", + " 3. 对于连续型特征列, 我们只需要用DenseFeat函数来指明列名和维度即可。\n", + "2. 
处理完特征列之后, 我们把相应的数据与列进行对应,就得到了最后的数据。\n", + "\n", + "下面根据具体的代码感受一下, 逻辑是这样, 首先我们需要写一个数据准备函数, 在这里面就是根据上面的具体步骤准备数据, 得到数据和特征列, 然后就是建立DIN模型并训练, 最后基于模型进行测试。" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:08.405211Z", + "start_time": "2020-11-18T04:26:04.887013Z" + } + }, + "outputs": [], + "source": [ + "# 导入deepctr\n", + "from deepctr.models import DIN\n", + "from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat, get_feature_names\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "\n", + "from tensorflow.keras import backend as K\n", + "from tensorflow.keras.layers import *\n", + "from tensorflow.keras.models import *\n", + "from tensorflow.keras.callbacks import * \n", + "import tensorflow as tf\n", + "\n", + "import os\n", + "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"2\"" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:13.485712Z", + "start_time": "2020-11-18T04:26:13.476042Z" + } + }, + "outputs": [], + "source": [ + "# 数据准备函数\n", + "def get_din_feats_columns(df, dense_fea, sparse_fea, behavior_fea, his_behavior_fea, emb_dim=32, max_len=100):\n", + " \"\"\"\n", + " 数据准备函数:\n", + " df: 数据集\n", + " dense_fea: 数值型特征列\n", + " sparse_fea: 离散型特征列\n", + " behavior_fea: 用户的候选行为特征列\n", + " his_behavior_fea: 用户的历史行为特征列\n", + " embedding_dim: embedding的维度, 这里为了简单, 统一把离散型特征列采用一样的隐向量维度\n", + " max_len: 用户序列的最大长度\n", + " \"\"\"\n", + " \n", + " sparse_feature_columns = [SparseFeat(feat, vocabulary_size=df[feat].nunique() + 1, embedding_dim=emb_dim) for feat in sparse_fea]\n", + " \n", + " dense_feature_columns = [DenseFeat(feat, 1, ) for feat in dense_fea]\n", + " \n", + " var_feature_columns = [VarLenSparseFeat(SparseFeat(feat, vocabulary_size=df['click_article_id'].nunique() + 1,\n", + " embedding_dim=emb_dim, 
embedding_name='click_article_id'), maxlen=max_len) for feat in hist_behavior_fea]\n", + " \n", + " dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns\n", + " \n", + " # 建立x, x是一个字典的形式\n", + " x = {}\n", + " for name in get_feature_names(dnn_feature_columns):\n", + " if name in his_behavior_fea:\n", + " # 这是历史行为序列\n", + " his_list = [l for l in df[name]]\n", + " x[name] = pad_sequences(his_list, maxlen=max_len, padding='post') # 二维数组\n", + " else:\n", + " x[name] = df[name].values\n", + " \n", + " return x, dnn_feature_columns" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:18.783217Z", + "start_time": "2020-11-18T04:26:18.776795Z" + } + }, + "outputs": [], + "source": [ + "# 把特征分开\n", + "sparse_fea = ['user_id', 'click_article_id', 'category_id', 'click_environment', 'click_deviceGroup', \n", + " 'click_os', 'click_country', 'click_region', 'click_referrer_type', 'is_cat_hab']\n", + "\n", + "behavior_fea = ['click_article_id']\n", + "\n", + "hist_behavior_fea = ['hist_click_article_id']\n", + "\n", + "dense_fea = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min', 'sim_sum', 'sim_mean', 'score',\n", + " 'rank','click_size','time_diff_mean','active_level','user_time_hob1','user_time_hob2',\n", + " 'words_hbo','words_count']" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:25.469810Z", + "start_time": "2020-11-18T04:26:24.779347Z" + } + }, + "outputs": [], + "source": [ + "# dense特征进行归一化, 神经网络训练都需要将数值进行归一化处理\n", + "mm = MinMaxScaler()\n", + "\n", + "# 下面是做一些特殊处理,当在其他的地方出现无效值的时候,不处理无法进行归一化,刚开始可以先把他注释掉,在运行了下面的代码\n", + "# 之后如果发现报错,应该先去想办法处理如何不出现inf之类的值\n", + "# trn_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)\n", + "# tst_user_item_feats_df_din_model.replace([np.inf, -np.inf], 0, inplace=True)\n", + "\n", + "for feat in dense_fea:\n", + 
" trn_user_item_feats_df_din_model[feat] = mm.fit_transform(trn_user_item_feats_df_din_model[[feat]])\n", + " \n", + " if val_user_item_feats_df_din_model is not None:\n", + " val_user_item_feats_df_din_model[feat] = mm.fit_transform(val_user_item_feats_df_din_model[[feat]])\n", + " \n", + " tst_user_item_feats_df_din_model[feat] = mm.fit_transform(tst_user_item_feats_df_din_model[[feat]])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:36.727753Z", + "start_time": "2020-11-18T04:26:28.854705Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:143: calling RandomNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n" + ] + } + ], + "source": [ + "# 准备训练数据\n", + "x_trn, dnn_feature_columns = get_din_feats_columns(trn_user_item_feats_df_din_model, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", + "y_trn = trn_user_item_feats_df_din_model['label'].values\n", + "\n", + "if offline:\n", + " # 准备验证数据\n", + " x_val, dnn_feature_columns = get_din_feats_columns(val_user_item_feats_df_din_model, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", + " y_val = val_user_item_feats_df_din_model['label'].values\n", + " \n", + "dense_fea = [x for x in dense_fea if x != 'label']\n", + "x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:26:45.146318Z", + 
"start_time": "2020-11-18T04:26:40.423914Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1288: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Call initializer instance with the dtype argument instead of passing it to the constructor\n", + "WARNING:tensorflow:From /home/ryluo/anaconda3/lib/python3.6/site-packages/tensorflow/python/autograph/impl/api.py:255: add_dispatch_support..wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", + "Model: \"model\"\n", + "__________________________________________________________________________________________________\n", + "Layer (type) Output Shape Param # Connected to \n", + "==================================================================================================\n", + "user_id (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_article_id (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "category_id (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_environment (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_deviceGroup (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_os (InputLayer) [(None, 1)] 0 \n", + 
"__________________________________________________________________________________________________\n", + "click_country (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_region (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_referrer_type (InputLayer [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "is_cat_hab (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_user_id (Embedding) (None, 1, 32) 1600032 user_id[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_seq_emb_hist_click_artic multiple 525664 click_article_id[0][0] \n", + " hist_click_article_id[0][0] \n", + " click_article_id[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_category_id (Embeddi (None, 1, 32) 7776 category_id[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_click_environment (E (None, 1, 32) 128 click_environment[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_click_deviceGroup (E (None, 1, 32) 160 click_deviceGroup[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_click_os (Embedding) (None, 1, 32) 288 click_os[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_click_country (Embed (None, 1, 32) 384 click_country[0][0] \n", + 
"__________________________________________________________________________________________________\n", + "sparse_emb_click_region (Embedd (None, 1, 32) 928 click_region[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_click_referrer_type (None, 1, 32) 256 click_referrer_type[0][0] \n", + "__________________________________________________________________________________________________\n", + "sparse_emb_is_cat_hab (Embeddin (None, 1, 32) 64 is_cat_hab[0][0] \n", + "__________________________________________________________________________________________________\n", + "no_mask (NoMask) (None, 1, 32) 0 sparse_emb_user_id[0][0] \n", + " sparse_seq_emb_hist_click_article\n", + " sparse_emb_category_id[0][0] \n", + " sparse_emb_click_environment[0][0\n", + " sparse_emb_click_deviceGroup[0][0\n", + " sparse_emb_click_os[0][0] \n", + " sparse_emb_click_country[0][0] \n", + " sparse_emb_click_region[0][0] \n", + " sparse_emb_click_referrer_type[0]\n", + " sparse_emb_is_cat_hab[0][0] \n", + "__________________________________________________________________________________________________\n", + "hist_click_article_id (InputLay [(None, 50)] 0 \n", + "__________________________________________________________________________________________________\n", + "concatenate (Concatenate) (None, 1, 320) 0 no_mask[0][0] \n", + " no_mask[1][0] \n", + " no_mask[2][0] \n", + " no_mask[3][0] \n", + " no_mask[4][0] \n", + " no_mask[5][0] \n", + " no_mask[6][0] \n", + " no_mask[7][0] \n", + " no_mask[8][0] \n", + " no_mask[9][0] \n", + "__________________________________________________________________________________________________\n", + "no_mask_1 (NoMask) (None, 1, 320) 0 concatenate[0][0] \n", + "__________________________________________________________________________________________________\n", + "attention_sequence_pooling_laye (None, 1, 32) 13961 sparse_seq_emb_hist_click_article\n", + " 
sparse_seq_emb_hist_click_article\n", + "__________________________________________________________________________________________________\n", + "concatenate_1 (Concatenate) (None, 1, 352) 0 no_mask_1[0][0] \n", + " attention_sequence_pooling_layer[\n", + "__________________________________________________________________________________________________\n", + "sim0 (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "time_diff0 (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "word_diff0 (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "sim_max (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "sim_min (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "sim_sum (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "sim_mean (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "score (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "rank (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "click_size (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "time_diff_mean (InputLayer) [(None, 1)] 0 \n", + 
"__________________________________________________________________________________________________\n", + "active_level (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "user_time_hob1 (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "user_time_hob2 (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "words_hbo (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "words_count (InputLayer) [(None, 1)] 0 \n", + "__________________________________________________________________________________________________\n", + "flatten (Flatten) (None, 352) 0 concatenate_1[0][0] \n", + "__________________________________________________________________________________________________\n", + "no_mask_3 (NoMask) (None, 1) 0 sim0[0][0] \n", + " time_diff0[0][0] \n", + " word_diff0[0][0] \n", + " sim_max[0][0] \n", + " sim_min[0][0] \n", + " sim_sum[0][0] \n", + " sim_mean[0][0] \n", + " score[0][0] \n", + " rank[0][0] \n", + " click_size[0][0] \n", + " time_diff_mean[0][0] \n", + " active_level[0][0] \n", + " user_time_hob1[0][0] \n", + " user_time_hob2[0][0] \n", + " words_hbo[0][0] \n", + " words_count[0][0] \n", + "__________________________________________________________________________________________________\n", + "no_mask_2 (NoMask) (None, 352) 0 flatten[0][0] \n", + "__________________________________________________________________________________________________\n", + "concatenate_2 (Concatenate) (None, 16) 0 no_mask_3[0][0] \n", + " no_mask_3[1][0] \n", + " no_mask_3[2][0] \n", + " no_mask_3[3][0] \n", + " no_mask_3[4][0] \n", + " no_mask_3[5][0] \n", + " no_mask_3[6][0] \n", + " no_mask_3[7][0] \n", + " no_mask_3[8][0] 
\n", + " no_mask_3[9][0] \n", + " no_mask_3[10][0] \n", + " no_mask_3[11][0] \n", + " no_mask_3[12][0] \n", + " no_mask_3[13][0] \n", + " no_mask_3[14][0] \n", + " no_mask_3[15][0] \n", + "__________________________________________________________________________________________________\n", + "flatten_1 (Flatten) (None, 352) 0 no_mask_2[0][0] \n", + "__________________________________________________________________________________________________\n", + "flatten_2 (Flatten) (None, 16) 0 concatenate_2[0][0] \n", + "__________________________________________________________________________________________________\n", + "no_mask_4 (NoMask) multiple 0 flatten_1[0][0] \n", + " flatten_2[0][0] \n", + "__________________________________________________________________________________________________\n", + "concatenate_3 (Concatenate) (None, 368) 0 no_mask_4[0][0] \n", + " no_mask_4[1][0] \n", + "__________________________________________________________________________________________________\n", + "dnn_1 (DNN) (None, 80) 89880 concatenate_3[0][0] \n", + "__________________________________________________________________________________________________\n", + "dense (Dense) (None, 1) 80 dnn_1[0][0] \n", + "__________________________________________________________________________________________________\n", + "prediction_layer (PredictionLay (None, 1) 1 dense[0][0] \n", + "==================================================================================================\n", + "Total params: 2,239,602\n", + "Trainable params: 2,239,362\n", + "Non-trainable params: 240\n", + "__________________________________________________________________________________________________\n" + ] + } + ], + "source": [ + "# 建立模型\n", + "model = DIN(dnn_feature_columns, behavior_fea)\n", + "\n", + "# 查看模型结构\n", + "model.summary()\n", + "\n", + "# 模型编译\n", + "model.compile('adam', 'binary_crossentropy',metrics=['binary_crossentropy', tf.keras.metrics.AUC()])" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:28:43.885773Z", + "start_time": "2020-11-18T04:26:48.746787Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/2\n", + "290964/290964 [==============================] - 55s 189us/sample - loss: 0.4209 - binary_crossentropy: 0.4206 - auc: 0.7842\n", + "Epoch 2/2\n", + "290964/290964 [==============================] - 52s 178us/sample - loss: 0.3630 - binary_crossentropy: 0.3618 - auc: 0.8478\n" + ] + } + ], + "source": [ + "# 模型训练\n", + "if offline:\n", + " history = model.fit(x_trn, y_trn, verbose=1, epochs=10, validation_data=(x_val, y_val) , batch_size=256)\n", + "else:\n", + " # 也可以使用上面的语句用自己采样出来的验证集\n", + " # history = model.fit(x_trn, y_trn, verbose=1, epochs=3, validation_split=0.3, batch_size=256)\n", + " history = model.fit(x_trn, y_trn, verbose=1, epochs=2, batch_size=256)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:29:20.436591Z", + "start_time": "2020-11-18T04:28:58.102057Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "500000/500000 [==============================] - 20s 39us/sample\n" + ] + } + ], + "source": [ + "# 模型预测\n", + "tst_user_item_feats_df_din_model['pred_score'] = model.predict(x_tst, verbose=1, batch_size=256)\n", + "tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']].to_csv(save_path + 'din_rank_score.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:29:34.985535Z", + "start_time": "2020-11-18T04:29:26.264531Z" + } + }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "rank_results = tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score']]\n", + "submit(rank_results, topk=5, model_name='din')" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-15T06:15:49.490705Z", + "start_time": "2020-11-15T06:15:49.473794Z" + } + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:38:53.760383Z", + "start_time": "2020-11-18T04:29:51.737721Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train on 232681 samples, validate on 58283 samples\n", + "Epoch 1/2\n", + "232681/232681 [==============================] - 44s 189us/sample - loss: 0.2864 - binary_crossentropy: 0.2846 - auc: 0.9008 - val_loss: 0.2830 - val_binary_crossentropy: 0.2813 - val_auc: 0.9072\n", + "Epoch 2/2\n", + "232681/232681 [==============================] - 44s 187us/sample - loss: 0.2832 - binary_crossentropy: 0.2816 - auc: 0.9034 - val_loss: 0.2846 - val_binary_crossentropy: 0.2830 - val_auc: 0.9053\n", + "58283/58283 [==============================] - 2s 36us/sample\n", + "500000/500000 [==============================] - 19s 37us/sample\n", + "Train on 232798 samples, validate on 58166 samples\n", + "Epoch 1/2\n", + "232798/232798 [==============================] - 43s 184us/sample - loss: 0.2818 - binary_crossentropy: 0.2802 - auc: 0.9051 - val_loss: 0.2968 - val_binary_crossentropy: 0.2953 - val_auc: 0.9062\n", + "Epoch 2/2\n", + "232798/232798 [==============================] - 44s 187us/sample - loss: 0.2796 - binary_crossentropy: 0.2782 - auc: 0.9069 - val_loss: 0.2820 - val_binary_crossentropy: 0.2806 - val_auc: 0.9071\n", + "58166/58166 [==============================] - 2s 38us/sample\n", + "500000/500000 [==============================] - 18s 37us/sample\n", + "Train on 232847 samples, validate on 58117 samples\n", + "Epoch 1/2\n", + "232847/232847 [==============================] - 43s 185us/sample - loss: 0.2786 - binary_crossentropy: 0.2773 - auc: 0.9080 - val_loss: 0.2761 - 
val_binary_crossentropy: 0.2749 - val_auc: 0.9113\n", + "Epoch 2/2\n", + "232847/232847 [==============================] - 39s 166us/sample - loss: 0.2766 - binary_crossentropy: 0.2754 - auc: 0.9097 - val_loss: 0.2872 - val_binary_crossentropy: 0.2862 - val_auc: 0.9090\n", + "58117/58117 [==============================] - 2s 34us/sample\n", + "500000/500000 [==============================] - 17s 33us/sample\n", + "Train on 232716 samples, validate on 58248 samples\n", + "Epoch 1/2\n", + "232716/232716 [==============================] - 39s 169us/sample - loss: 0.2763 - binary_crossentropy: 0.2753 - auc: 0.9100 - val_loss: 0.2739 - val_binary_crossentropy: 0.2730 - val_auc: 0.9116\n", + "Epoch 2/2\n", + "232716/232716 [==============================] - 39s 168us/sample - loss: 0.2743 - binary_crossentropy: 0.2735 - auc: 0.9119 - val_loss: 0.2859 - val_binary_crossentropy: 0.2851 - val_auc: 0.9090\n", + "58248/58248 [==============================] - 2s 35us/sample\n", + "500000/500000 [==============================] - 17s 34us/sample\n", + "Train on 232814 samples, validate on 58150 samples\n", + "Epoch 1/2\n", + "232814/232814 [==============================] - 40s 170us/sample - loss: 0.2747 - binary_crossentropy: 0.2739 - auc: 0.9115 - val_loss: 0.2702 - val_binary_crossentropy: 0.2695 - val_auc: 0.9163\n", + "Epoch 2/2\n", + "232814/232814 [==============================] - 40s 170us/sample - loss: 0.2725 - binary_crossentropy: 0.2719 - auc: 0.9132 - val_loss: 0.2751 - val_binary_crossentropy: 0.2745 - val_auc: 0.9151\n", + "58150/58150 [==============================] - 2s 34us/sample\n", + "500000/500000 [==============================] - 17s 34us/sample\n" + ] + } + ], + "source": [ + "# 五折交叉验证,这里的五折交叉是以用户为目标进行五折划分\n", + "# 这一部分与前面的单独训练和验证是分开的\n", + "def get_kfold_users(trn_df, n=5):\n", + " user_ids = trn_df['user_id'].unique()\n", + " user_set = [user_ids[i::n] for i in range(n)]\n", + " return user_set\n", + "\n", + "k_fold = 5\n", + "trn_df = 
trn_user_item_feats_df_din_model\n", + "user_set = get_kfold_users(trn_df, n=k_fold)\n", + "\n", + "score_list = []\n", + "score_df = trn_df[['user_id', 'click_article_id', 'label']]\n", + "sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])\n", + "\n", + "dense_fea = [x for x in dense_fea if x != 'label']\n", + "x_tst, dnn_feature_columns = get_din_feats_columns(tst_user_item_feats_df_din_model, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", + "\n", + "# 五折交叉验证,并将中间结果保存用于staking\n", + "for n_fold, valid_user in enumerate(user_set):\n", + " train_idx = trn_df[~trn_df['user_id'].isin(valid_user)] # add slide user\n", + " valid_idx = trn_df[trn_df['user_id'].isin(valid_user)]\n", + " \n", + " # 准备训练数据\n", + " x_trn, dnn_feature_columns = get_din_feats_columns(train_idx, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", + " y_trn = train_idx['label'].values\n", + "\n", + " # 准备验证数据\n", + " x_val, dnn_feature_columns = get_din_feats_columns(valid_idx, dense_fea, \n", + " sparse_fea, behavior_fea, hist_behavior_fea, max_len=50)\n", + " y_val = valid_idx['label'].values\n", + " \n", + " history = model.fit(x_trn, y_trn, verbose=1, epochs=2, validation_data=(x_val, y_val) , batch_size=256)\n", + " \n", + " # 预测验证集结果\n", + " valid_idx['pred_score'] = model.predict(x_val, verbose=1, batch_size=256) \n", + " \n", + " valid_idx.sort_values(by=['user_id', 'pred_score'])\n", + " valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + " \n", + " # 将验证集的预测结果放到一个列表中,后面进行拼接\n", + " score_list.append(valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])\n", + " \n", + " # 如果是线上测试,需要计算每次交叉验证的结果相加,最后求平均\n", + " if not offline:\n", + " sub_preds += model.predict(x_tst, verbose=1, batch_size=256)[:, 0] \n", + " \n", + "score_df_ = pd.concat(score_list, axis=0)\n", + "score_df = score_df.merge(score_df_, how='left', on=['user_id', 
'click_article_id'])\n", + "# 保存训练集交叉验证产生的新特征\n", + "score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank', 'label']].to_csv(save_path + 'trn_din_cls_feats.csv', index=False)\n", + " \n", + "# 测试集的预测结果,多次交叉验证求平均,将预测的score和对应的rank特征保存,可以用于后面的staking,这里还可以构造其他更多的特征\n", + "tst_user_item_feats_df_din_model['pred_score'] = sub_preds / k_fold\n", + "tst_user_item_feats_df_din_model['pred_score'] = tst_user_item_feats_df_din_model['pred_score'].transform(lambda x: norm_sim(x))\n", + "tst_user_item_feats_df_din_model.sort_values(by=['user_id', 'pred_score'])\n", + "tst_user_item_feats_df_din_model['pred_rank'] = tst_user_item_feats_df_din_model.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')\n", + "\n", + "# 保存测试集交叉验证的新特征\n", + "tst_user_item_feats_df_din_model[['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(save_path + 'tst_din_cls_feats.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 模型融合" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 加权融合" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:44:27.351996Z", + "start_time": "2020-11-18T04:44:26.561275Z" + } + }, + "outputs": [], + "source": [ + "# 读取多个模型的排序结果文件\n", + "lgb_ranker = pd.read_csv(save_path + 'lgb_ranker_score.csv')\n", + "lgb_cls = pd.read_csv(save_path + 'lgb_cls_score.csv')\n", + "din_ranker = pd.read_csv(save_path + 'din_rank_score.csv')\n", + "\n", + "# 这里也可以换成交叉验证输出的测试结果进行加权融合" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:44:31.593981Z", + "start_time": "2020-11-18T04:44:31.589439Z" + } + }, + "outputs": [], + "source": [ + "rank_model = {'lgb_ranker': lgb_ranker, \n", + " 'lgb_cls': lgb_cls, \n", + " 
'din_ranker': din_ranker}" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:44:36.135860Z", + "start_time": "2020-11-18T04:44:36.130577Z" + } + }, + "outputs": [], + "source": [ + "def get_ensumble_predict_topk(rank_model, topk=5):\n", + " final_recall = rank_model['lgb_cls'].append(rank_model['din_ranker'])\n", + " rank_model['lgb_ranker']['pred_score'] = rank_model['lgb_ranker']['pred_score'].transform(lambda x: norm_sim(x))\n", + " \n", + " final_recall = final_recall.append(rank_model['lgb_ranker'])\n", + " final_recall = final_recall.groupby(['user_id', 'click_article_id'])['pred_score'].sum().reset_index()\n", + " \n", + " submit(final_recall, topk=topk, model_name='ensemble_fuse')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:44:51.659270Z", + "start_time": "2020-11-18T04:44:40.445659Z" + } + }, + "outputs": [], + "source": [ + "get_ensumble_predict_topk(rank_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Staking" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:44:58.025992Z", + "start_time": "2020-11-18T04:44:56.146962Z" + } + }, + "outputs": [], + "source": [ + "# 读取多个模型的交叉验证生成的结果文件\n", + "# 训练集\n", + "trn_lgb_ranker_feats = pd.read_csv(save_path + 'trn_lgb_ranker_feats.csv')\n", + "trn_lgb_cls_feats = pd.read_csv(save_path + 'trn_lgb_cls_feats.csv')\n", + "trn_din_cls_feats = pd.read_csv(save_path + 'trn_din_cls_feats.csv')\n", + "\n", + "# 测试集\n", + "tst_lgb_ranker_feats = pd.read_csv(save_path + 'tst_lgb_ranker_feats.csv')\n", + "tst_lgb_cls_feats = pd.read_csv(save_path + 'tst_lgb_cls_feats.csv')\n", + "tst_din_cls_feats = pd.read_csv(save_path + 'tst_din_cls_feats.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": 
"2020-11-18T04:45:07.701862Z", + "start_time": "2020-11-18T04:45:07.644335Z" + } + }, + "outputs": [], + "source": [ + "# 将多个模型输出的特征进行拼接\n", + "\n", + "finall_trn_ranker_feats = trn_lgb_ranker_feats[['user_id', 'click_article_id', 'label']]\n", + "finall_tst_ranker_feats = tst_lgb_ranker_feats[['user_id', 'click_article_id']]\n", + "\n", + "for idx, trn_model in enumerate([trn_lgb_ranker_feats, trn_lgb_cls_feats, trn_din_cls_feats]):\n", + " for feat in [ 'pred_score', 'pred_rank']:\n", + " col_name = feat + '_' + str(idx)\n", + " finall_trn_ranker_feats[col_name] = trn_model[feat]\n", + "\n", + "for idx, tst_model in enumerate([tst_lgb_ranker_feats, tst_lgb_cls_feats, tst_din_cls_feats]):\n", + " for feat in [ 'pred_score', 'pred_rank']:\n", + " col_name = feat + '_' + str(idx)\n", + " finall_tst_ranker_feats[col_name] = tst_model[feat]" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:45:15.044242Z", + "start_time": "2020-11-18T04:45:13.138252Z" + } + }, + "outputs": [], + "source": [ + "# 定义一个逻辑回归模型再次拟合交叉验证产生的特征对测试集进行预测\n", + "# 这里需要注意的是,在做交叉验证的时候可以构造多一些与输出预测值相关的特征,来丰富这里简单模型的特征\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "feat_cols = ['pred_score_0', 'pred_rank_0', 'pred_score_1', 'pred_rank_1', 'pred_score_2', 'pred_rank_2']\n", + "\n", + "trn_x = finall_trn_ranker_feats[feat_cols]\n", + "trn_y = finall_trn_ranker_feats['label']\n", + "\n", + "tst_x = finall_tst_ranker_feats[feat_cols]\n", + "\n", + "# 定义模型\n", + "lr = LogisticRegression()\n", + "\n", + "# 模型训练\n", + "lr.fit(trn_x, trn_y)\n", + "\n", + "# 模型预测\n", + "finall_tst_ranker_feats['pred_score'] = lr.predict_proba(tst_x)[:, 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "ExecuteTime": { + "end_time": "2020-11-18T04:45:29.018764Z", + "start_time": "2020-11-18T04:45:19.423130Z" + } + }, + "outputs": [], + "source": [ + "# 预测结果重新排序, 及生成提交结果\n", + "rank_results = 
finall_tst_ranker_feats[['user_id', 'click_article_id', 'pred_score']]\n", + "submit(rank_results, topk=5, model_name='ensumble_staking')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 总结\n", + "本章主要学习了三个排序模型,包括LGB的Rank, LGB的Classifier还有深度学习的DIN模型, 当然,对于这三个模型的原理部分,我们并没有给出详细的介绍, 请大家课下自己探索原理,也欢迎大家把自己的探索与所学分享出来,我们一块学习和进步。最后,我们进行了简单的模型融合策略,包括简单的加权和Stacking。\n", + "\n", + "关于Datawhale: Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", + "\n", + "![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" + ] } - }, - "outputs": [], - "source": [ - "# 预测结果重新排序, 及生成提交结果\n", - "rank_results = finall_tst_ranker_feats[['user_id', 'click_article_id', 'pred_score']]\n", - "submit(rank_results, topk=5, model_name='ensumble_staking')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 总结\n", - "本章主要学习了三个排序模型,包括LGB的Rank, LGB的Classifier还有深度学习的DIN模型, 当然,对于这三个模型的原理部分,我们并没有给出详细的介绍, 请大家课下自己探索原理,也欢迎大家把自己的探索与所学分享出来,我们一块学习和进步。最后,我们进行了简单的模型融合策略,包括简单的加权和Stacking。\n", - "\n", - "关于Datawhale: Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale:\n", - "\n", - "![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - 
"pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "latex_envs": { - "LaTeX_envs_menu_present": true, - "autoclose": false, - "autocomplete": true, - "bibliofile": "biblio.bib", - "cite_by": "apalike", - "current_citInitial": 1, - "eqLabelWithNumbers": true, - "eqNumInitial": 1, - "hotkeys": { - "equation": "Ctrl-E", - "itemize": "Ctrl-I" - }, - "labels_anchors": false, - "latex_user_defs": false, - "report_style_numbering": false, - "user_envs_cfg": false - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": { - "height": "calc(100% - 180px)", - "left": "10px", - "top": "150px", - "width": "170px" - }, - "toc_section_display": true, - "toc_window_display": true - }, - "varInspector": { - "cols": { - "lenName": 16, - "lenType": 16, - "lenVar": 40 - }, - "kernels_config": { - "python": { - "delete_cmd_postfix": "", - "delete_cmd_prefix": "del ", - "library": "var_list.py", - "varRefreshCmd": "print(var_dic_list())" + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" }, - "r": { - "delete_cmd_postfix": ") ", - "delete_cmd_prefix": "rm(", - "library": "var_list.r", - "varRefreshCmd": "cat(var_dic_list()) " + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + 
"report_style_numbering": false, + "user_envs_cfg": false + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "170px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false } - }, - "types_to_exclude": [ - "module", - "function", - "builtin_function_or_method", - "instance", - "_Feature" - ], - "window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.1 \350\265\233\351\242\230\347\220\206\350\247\243+Baseline.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.1 \350\265\233\351\242\230\347\220\206\350\247\243+Baseline.md" index 645152157..5c3930fe0 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.1 
\350\265\233\351\242\230\347\220\206\350\247\243+Baseline.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.1 \350\265\233\351\242\230\347\220\206\350\247\243+Baseline.md" @@ -377,7 +377,7 @@ submit(tst_recall, topk=5, model_name='itemcf_baseline') **关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale: -![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) +![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.2 \346\225\260\346\215\256\345\210\206\346\236\220.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.2 \346\225\260\346\215\256\345\210\206\346\236\220.md" index 173d95002..5584973fa 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.2 \346\225\260\346\215\256\345\210\206\346\236\220.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.2 \346\225\260\346\215\256\345\210\206\346\236\220.md" @@ -66,7 +66,7 @@ trn_click = trn_click.merge(item_df, how='left', on=['click_article_id']) trn_click.head() ``` 
-![image-20201119112706647](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112706647.png) +![image-20201119112706647](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112706647.png) **train_click_log.csv文件数据中每个字段的含义** @@ -86,7 +86,7 @@ trn_click.head() trn_click.info() ``` -![image-20201119112622939](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112622939.png) +![image-20201119112622939](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112622939.png) @@ -94,7 +94,7 @@ trn_click.info() trn_click.describe() ``` -![image-20201119112649376](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112649376.png) +![image-20201119112649376](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112649376.png) ```python @@ -133,7 +133,7 @@ plt.tight_layout() plt.show() ``` -![在这里插入图片描述](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/20201118000820300.png) +![在这里插入图片描述](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/20201118000820300.png) **从点击时间clik_timestamp来看,分布较为平均,可不做特殊处理。由于时间戳是13位的,后续将时间格式转换成10位方便计算。** @@ -149,14 +149,14 @@ tst_click = tst_click.merge(item_df, how='left', on=['click_article_id']) tst_click.head() ``` -![image-20201119112952261](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112952261.png) +![image-20201119112952261](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112952261.png) ```python tst_click.describe() ``` -![image-20201119113015529](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113015529.png) +![image-20201119113015529](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113015529.png) **我们可以看出训练集和测试集的用户是完全不一样的** @@ -187,14 +187,14 @@ tst_click.groupby('user_id')['click_article_id'].count().min() # 注意测试集 item_df.head().append(item_df.tail()) ``` -![image-20201119113118388](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113118388.png) +![image-20201119113118388](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113118388.png) 
```python item_df['words_count'].value_counts() ``` -![image-20201119113147240](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113147240.png) +![image-20201119113147240](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113147240.png) ```python @@ -219,7 +219,7 @@ item_df.shape # 364047篇文章 item_emb_df.head() ``` -![image-20201119113253455](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113253455.png) +![image-20201119113253455](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113253455.png) ```python item_emb_df.shape @@ -245,21 +245,21 @@ user_click_count = user_click_merge.groupby(['user_id', 'click_article_id'])['cl user_click_count[:10] ``` -![image-20201119113334727](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113334727.png) +![image-20201119113334727](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113334727.png) ```python user_click_count[user_click_count['count']>7] ``` -![image-20201119113351807](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113351807.png) +![image-20201119113351807](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113351807.png) ```python user_click_count['count'].unique() ``` -![image-20201119113429769](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113429769.png) +![image-20201119113429769](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113429769.png) ```python @@ -267,7 +267,7 @@ user_click_count['count'].unique() user_click_count.loc[:,'count'].value_counts() ``` -![image-20201119113414785](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113414785.png) +![image-20201119113414785](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113414785.png) **可以看出:有1605541(约占99.2%)的用户未重复阅读过文章,仅有极少数用户重复点击过某篇文章。 这个也可以单独制作成特征** @@ -301,15 +301,15 @@ for _, user_df in sample_users.groupby('user_id'): plot_envs(user_df, cols, 2, 3) ``` 
-![image-20201119113624424](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113624424.png) +![image-20201119113624424](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113624424.png) -![image-20201119113637746](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113637746.png) +![image-20201119113637746](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113637746.png) -![image-20201119113652132](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113652132.png) +![image-20201119113652132](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113652132.png) -![image-20201119113702034](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113702034.png) +![image-20201119113702034](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113702034.png) -![image-20201119113714135](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113714135.png) +![image-20201119113714135](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113714135.png) **可以看出绝大多数数的用户的点击环境是比较固定的。思路:可以基于这些环境的统计特征来代表该用户本身的属性** @@ -322,7 +322,7 @@ plt.plot(user_click_item_count) ``` -![image-20201119113759490](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113759490.png) +![image-20201119113759490](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113759490.png) **可以根据用户的点击文章次数看出用户的活跃度** @@ -332,7 +332,7 @@ plt.plot(user_click_item_count) plt.plot(user_click_item_count[:50]) ``` -![image-20201119113825586](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113825586.png) +![image-20201119113825586](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113825586.png) **点击次数排前50的用户的点击次数都在100次以上。思路:我们可以定义点击次数大于等于100次的用户为活跃用户,这是一种简单的处理思路, 判断用户活跃度,更加全面的是再结合上点击时间,后面我们会基于点击次数和点击时间两个方面来判断用户活跃度。** @@ -342,7 +342,7 @@ plt.plot(user_click_item_count[:50]) plt.plot(user_click_item_count[25000:50000]) ``` -![image-20201119113844946](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113844946.png) 
+![image-20201119113844946](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113844946.png) **可以看出点击次数小于等于两次的用户非常的多,这些用户可以认为是非活跃用户** @@ -358,14 +358,14 @@ item_click_count = sorted(user_click_merge.groupby('click_article_id')['user_id' plt.plot(item_click_count) ``` -![image-20201119113912912](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113912912.png) +![image-20201119113912912](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113912912.png) ```python plt.plot(item_click_count[:100]) ``` -![image-20201119113930745](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113930745.png) +![image-20201119113930745](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113930745.png) **可以看出点击次数最多的前100篇新闻,点击次数大于1000次** @@ -374,7 +374,7 @@ plt.plot(item_click_count[:100]) plt.plot(item_click_count[:20]) ``` -![image-20201119113958254](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113958254.png) +![image-20201119113958254](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119113958254.png) **点击次数最多的前20篇新闻,点击次数大于2500。思路:可以定义这些新闻为热门新闻, 这个也是简单的处理方式,后面我们也是根据点击次数和时间进行文章热度的一个划分。** @@ -383,7 +383,7 @@ plt.plot(item_click_count[:20]) plt.plot(item_click_count[3500:]) ``` -![image-20201119114017762](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114017762.png) +![image-20201119114017762](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114017762.png) **可以发现很多新闻只被点击过一两次。思路:可以定义这些新闻是冷门新闻。** @@ -397,7 +397,7 @@ union_item = tmp.groupby(['click_article_id','next_item'])['click_timestamp'].ag union_item[['count']].describe() ``` -![image-20201119114044351](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114044351.png) +![image-20201119114044351](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114044351.png) **由统计数据可以看出,平均共现次数2.88,最高为1687。** @@ -411,14 +411,14 @@ y = union_item['count'] plt.scatter(x, y) ``` 
-![image-20201119114106223](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114106223.png) +![image-20201119114106223](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114106223.png) ```python plt.plot(union_item['count'].values[40000:]) ``` -![image-20201119114122557](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114122557.png) +![image-20201119114122557](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114122557.png) **大概有70000个pair至少共现一次。** @@ -432,7 +432,7 @@ plt.plot(union_item['count'].values[40000:]) plt.plot(user_click_merge['category_id'].value_counts().values) ``` -![image-20201119114144058](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114144058.png) +![image-20201119114144058](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114144058.png) ```python @@ -440,7 +440,7 @@ plt.plot(user_click_merge['category_id'].value_counts().values) plt.plot(user_click_merge['category_id'].value_counts().values[150:]) ``` -![image-20201119114201764](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114201764.png) +![image-20201119114201764](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114201764.png) ```python @@ -455,7 +455,7 @@ user_click_merge['words_count'].describe() plt.plot(user_click_merge['words_count'].values) ``` -![image-20201119114241194](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114241194.png) +![image-20201119114241194](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114241194.png) @@ -469,7 +469,7 @@ plt.plot(sorted(user_click_merge.groupby('user_id')['category_id'].nunique(), re ``` -![image-20201119114300286](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114300286.png) +![image-20201119114300286](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114300286.png) **从上图中可以看出有一小部分用户阅读类型是极其广泛的,大部分人都处在20个新闻类型以下。** @@ -478,7 +478,7 @@ plt.plot(sorted(user_click_merge.groupby('user_id')['category_id'].nunique(), re 
user_click_merge.groupby('user_id')['category_id'].nunique().reset_index().describe() ``` -![image-20201119114318523](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114318523.png) +![image-20201119114318523](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114318523.png) ### 用户查看文章的长度的分布 @@ -490,7 +490,7 @@ plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), rever ``` -![image-20201119114337448](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114337448.png) +![image-20201119114337448](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114337448.png) @@ -504,7 +504,7 @@ plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), rever plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), reverse=True)[1000:45000]) ``` -![image-20201119114355195](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114355195.png) +![image-20201119114355195](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114355195.png) **可以发现大多数人都是看250字以下的文章** @@ -514,7 +514,7 @@ plt.plot(sorted(user_click_merge.groupby('user_id')['words_count'].mean(), rever user_click_merge.groupby('user_id')['words_count'].mean().reset_index().describe() ``` -![image-20201119114418911](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114418911.png) +![image-20201119114418911](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114418911.png) @@ -536,7 +536,7 @@ user_click_merge = user_click_merge.sort_values('click_timestamp') user_click_merge.head() ``` -![image-20201119114447904](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114447904.png) +![image-20201119114447904](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114447904.png) ```python @@ -558,7 +558,7 @@ mean_diff_click_time = user_click_merge.groupby('user_id')['click_timestamp', 'c plt.plot(sorted(mean_diff_click_time.values, reverse=True)) ``` 
-![image-20201119114505086](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114505086.png) +![image-20201119114505086](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119114505086.png) **从上图可以发现不同用户点击文章的时间差是有差异的。** @@ -573,7 +573,7 @@ mean_diff_created_time = user_click_merge.groupby('user_id')['click_timestamp', plt.plot(sorted(mean_diff_created_time.values, reverse=True)) ``` -![image-20201119122227666](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122227666.png) +![image-20201119122227666](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122227666.png) **从图中可以发现用户先后点击文章,文章的创建时间也是有差异的** @@ -602,7 +602,7 @@ sub_user_info = user_click_merge[user_click_merge['user_id'].isin(sub_user_ids)] sub_user_info.head() ``` -![image-20201119122251274](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122251274.png) +![image-20201119122251274](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122251274.png) ```python @@ -625,7 +625,7 @@ for _, user_df in sub_user_info.groupby('user_id'): ``` -![image-20201119122310969](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122310969.png) +![image-20201119122310969](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119122310969.png) @@ -654,5 +654,5 @@ for _, user_df in sub_user_info.groupby('user_id'): **关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale: -![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) +![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) diff --git "a/docs/\347\254\254\344\272\214\347\253\240 
\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.3 \345\244\232\350\267\257\345\217\254\345\233\236.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.3 \345\244\232\350\267\257\345\217\254\345\233\236.md" index 323cf46fe..9bf554093 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.3 \345\244\232\350\267\257\345\217\254\345\233\236.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.3 \345\244\232\350\267\257\345\217\254\345\233\236.md" @@ -2,7 +2,7 @@ 所谓的“多路召回”策略,就是指采用不同的策略、特征或简单模型,分别召回一部分候选集,然后把候选集混合在一起供后续排序模型使用,可以明显的看出,“多路召回策略”是在“计算速度”和“召回率”之间进行权衡的结果。其中,各种简单策略保证候选集的快速召回,从不同角度设计的策略保证召回率接近理想的状态,不至于损伤排序效果。如下图是多路召回的一个示意图,在多路召回中,每个策略之间毫不相关,所以一般可以写并发多线程同时进行,这样可以更加高效。 -image-20201119132726873 +image-20201119132726873 上图只是一个多路召回的例子,也就是说可以使用多种不同的策略来获取用户排序的候选商品集合,而具体使用哪些召回策略其实是与业务强相关的 ,针对不同的任务就会有对于该业务真实场景下需要考虑的召回规则。例如新闻推荐,召回规则可以是“热门视频”、“导演召回”、“演员召回”、“最近上映“、”流行趋势“、”类型召回“等等。 @@ -1344,4 +1344,4 @@ final_recall_items_dict_rank = combine_recall_results(user_multi_recall_dict, we **关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale: -![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) +![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) diff --git 
"a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.4 \347\211\271\345\276\201\345\267\245\347\250\213.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.4 \347\211\271\345\276\201\345\267\245\347\250\213.md" index 197765e8b..e5e267f0e 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.4 \347\211\271\345\276\201\345\267\245\347\250\213.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.4 \347\211\271\345\276\201\345\267\245\347\250\213.md" @@ -193,7 +193,7 @@ Word2Vec主要思想是:一个词的上下文可以很好的表达出词的语 - skip-gram:已知中心词预测周围词。 - cbow:已知周围词预测中心词。 -![image-20201106225233086](http://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20201106225233086.png) +![image-20201106225233086](https://ryluo.oss-cn-chengdu.aliyuncs.com/Javaimage-20201106225233086.png) 在使用gensim训练word2vec的时候,有几个比较重要的参数 - size: 表示词向量的维度。 @@ -985,5 +985,5 @@ tst_user_item_feats_df.to_csv(save_path + 'tst_user_item_feats_df.csv', index=Fa **关于Datawhale:** Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale: -![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) +![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) diff --git "a/docs/\347\254\254\344\272\214\347\253\240 
\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.5 \346\216\222\345\272\217\346\250\241\345\236\213+\346\250\241\345\236\213\350\236\215\345\220\210.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.5 \346\216\222\345\272\217\346\250\241\345\236\213+\346\250\241\345\236\213\350\236\215\345\220\210.md" index 9fef3fda5..0e8f45abe 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.5 \346\216\222\345\272\217\346\250\241\345\236\213+\346\250\241\345\236\213\350\236\215\345\220\210.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.1\347\253\236\350\265\233\345\256\236\350\267\265/markdown/2.5 \346\216\222\345\272\217\346\250\241\345\236\213+\346\250\241\345\236\213\350\236\215\345\220\210.md" @@ -407,7 +407,7 @@ tst_user_item_feats_df_din_model = tst_user_item_feats_df_din_model.merge(his_be 我们下面尝试使用DIN模型, DIN的全称是Deep Interest Network, 这是阿里2018年基于前面的深度学习模型无法表达用户多样化的兴趣而提出的一个模型, 它可以通过考虑【给定的候选广告】和【用户的历史行为】的相关性,来计算用户兴趣的表示向量。具体来说就是通过引入局部激活单元,通过软搜索历史行为的相关部分来关注相关的用户兴趣,并采用加权和来获得有关候选广告的用户兴趣的表示。与候选广告相关性较高的行为会获得较高的激活权重,并支配着用户兴趣。该表示向量在不同广告上有所不同,大大提高了模型的表达能力。所以该模型对于此次新闻推荐的任务也比较适合, 我们在这里通过当前的候选文章与用户历史点击文章的相关性来计算用户对于文章的兴趣。 该模型的结构如下: -![image-20201116201646983](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201116201646983.png) +![image-20201116201646983](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201116201646983.png) 我们这里直接调包来使用这个模型, 关于这个模型的详细细节部分我们会在下一期的推荐系统组队学习中给出。下面说一下该模型如何具体使用:deepctr的函数原型如下: @@ -949,4 +949,4 @@ submit(rank_results, topk=5, model_name='ensumble_staking') 关于Datawhale: 
Datawhale是一个专注于数据科学与AI领域的开源组织,汇集了众多领域院校和知名企业的优秀学习者,聚合了一群有开源精神和探索精神的团队成员。Datawhale 以“for the learner,和学习者一起成长”为愿景,鼓励真实地展现自我、开放包容、互信互助、敢于试错和勇于担当。同时 Datawhale 用开源的理念去探索开源内容、开源学习和开源方案,赋能人才培养,助力人才成长,建立起人与人,人与知识,人与企业和人与未来的联结。 本次数据挖掘路径学习,专题知识将在天池分享,详情可关注Datawhale: -![image-20201119112159065](http://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) \ No newline at end of file +![image-20201119112159065](https://ryluo.oss-cn-chengdu.aliyuncs.com/abc/image-20201119112159065.png) \ No newline at end of file diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.3 Redis\345\237\272\347\241\200.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.3 Redis\345\237\272\347\241\200.md" index 2d79c1fbf..9153d9f15 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.3 Redis\345\237\272\347\241\200.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.3 Redis\345\237\272\347\241\200.md" @@ -20,7 +20,7 @@ sudo apt-get install redis-server 下载完成的结果 -![image-20211030164414594](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164414594.png) +![image-20211030164414594](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164414594.png) **启动Redis服务:** @@ -30,7 +30,7 @@ sudo apt-get install redis-server service redis-server status ``` 
-![image-20211030164432589](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164432589.png) +![image-20211030164432589](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164432589.png) 检查当前进程,查看redis是否启动。(ps: 可以看到redis服务正在监听6379端口) @@ -38,7 +38,7 @@ service redis-server status ps -aux|grep redis-server ``` -![image-20211030164448713](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164448713.png) +![image-20211030164448713](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164448713.png) 或者进入redis客户端,与服务器进行通信,当输入ping命令,如果返回 PONG 表示Redis已成功安装。 @@ -46,7 +46,7 @@ ps -aux|grep redis-server redis-cli ``` -![image-20211030164455928](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164455928.png) +![image-20211030164455928](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211030164455928.png) 上面的127.0.0.1 是redis服务器的 IP 地址,6379 是 Redis 服务器运行的端口。 diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.4 scrapy\345\237\272\347\241\200\345\217\212\346\226\260\351\227\273\347\210\254\345\217\226\345\256\236\346\210\230.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.4 scrapy\345\237\272\347\241\200\345\217\212\346\226\260\351\227\273\347\210\254\345\217\226\345\256\236\346\210\230.md" index 8a74c546e..dc29a96f1 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.4 scrapy\345\237\272\347\241\200\345\217\212\346\226\260\351\227\273\347\210\254\345\217\226\345\256\236\346\210\230.md" +++ 
"b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.4 scrapy\345\237\272\347\241\200\345\217\212\346\226\260\351\227\273\347\210\254\345\217\226\345\256\236\346\210\230.md" @@ -129,7 +129,7 @@ class QuotesSpider(scrapy.Spider): 因为新闻爬取项目和新闻推荐系统是放在一起的,为了方便提前学习,下面直接给出项目的目录结构以及重要文件中的代码实现,最终的项目将会和新闻推荐系统一起开源出来 -image-20211103214124327 +image-20211103214124327 1. **创建一个scrapy项目:** @@ -164,7 +164,7 @@ class SinanewsItem(scrapy.Item): 这里需要注意的一点,这里在爬取新闻的时候选择的是一个比较简洁的展示网站进行爬取的,相比直接去最新的新浪新闻观光爬取新闻简单很多,简洁的网站大概的链接:https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1 -image-20211103213354334 +image-20211103213354334 ```python # -*- coding: utf-8 -*- @@ -497,7 +497,7 @@ sh run_scrapy_sina.sh 最终查看数据库中的数据: -image-20211103214611171 +image-20211103214611171 ### 参考资料 diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.5 \350\207\252\345\212\250\345\214\226\346\236\204\345\273\272\347\224\250\346\210\267\345\217\212\347\211\251\346\226\231\347\224\273\345\203\217.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.5 \350\207\252\345\212\250\345\214\226\346\236\204\345\273\272\347\224\250\346\210\267\345\217\212\347\211\251\346\226\231\347\224\273\345\203\217.md" index 4cc60eda6..bd9d70acc 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.5 
\350\207\252\345\212\250\345\214\226\346\236\204\345\273\272\347\224\250\346\210\267\345\217\212\347\211\251\346\226\231\347\224\273\345\203\217.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.1.5 \350\207\252\345\212\250\345\214\226\346\236\204\345\273\272\347\224\250\346\210\267\345\217\212\347\211\251\346\226\231\347\224\273\345\203\217.md" @@ -1,4 +1,4 @@ -![image-20211203145147649](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211203145147649.png) +![image-20211203145147649](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211203145147649.png) # 自动化构建用户及物料画像 @@ -19,13 +19,13 @@ 首先说一下新物料添加到物料库的逻辑是什么,新物料添加到物料库这件事情肯定是发生在新闻爬取之后的,然后要将新物料添加到物料库还需要对新物料做一些简单的画像处理,目前我们定义的画像字段如下(处理后的画像存储在Mongodb): -image-20211203150212960 +image-20211203150212960 具体的逻辑就是遍历今天爬取的所有文章,然后通过文章的title来判断这篇文章是否已经在物料库中(新闻网站有可能有些相同的文章会出现在多天)来去重。然后再根据我们定义的一些字段,给画像相应的字段初始化,最后就是存入画像物料池中。 关于旧物料画像的更新,这里就需要先了解一下旧物料哪些字段会被用户的行为更新。下面是新闻列表展示页,我们会发现前端会展示新闻的阅读、喜欢及收藏次数。而用户的交互(阅读、点赞和收藏)会改变这些值。 -image-20211203150835056 +image-20211203150835056 为了能够实时的在前端显示新闻的这些动态行为信息,我们提前将新闻的动态信息存储到了redis中,线上获取的时候是直接从redis中获取新闻的数据,并且如果用户对新闻产生了交互,那么这些动态信息就会被更新,我们也是直接更新redis中的值,这样做主要是为了能够让前端可以实时的获取的新闻最新的动态画像信息。 @@ -175,9 +175,9 @@ if __name__ == "__main__": 上面的内容说完了物料的更新,接下来介绍一下对于更新完的物料是如何添加到redis数据库中去的。关于新闻内容在redis中的存储,我们将新闻的信息拆成了两部分,一部分是新闻不会发生变化的属性(例如,创建时间、标题、新闻内容等),还有一部分是物料的动态属性,在redis中存储的key的标识分别为:static_news_detail:news_id和dynamic_news_detail:news_id 下面是redis中存储的真实内容 -image-20211203153841222 +image-20211203153841222 -image-20211203153958220 +image-20211203153958220 这么做的目的是为了线上实时更改物料动态信息的时候更加高效一点。当需要获取某篇新闻的详细信息的时候需要查这两份数据并将数据这两部分数据拼起来最终才发送给前端展示。这部分的代码逻辑如下: @@ -306,11 +306,11 @@ if __name__ == "__main__": 由于我们系统中将所有注册过的用户都放到了一个表里面(新、老用户),所以每次更新画像的话只需要遍历一遍注册表中的所有用户。再说具体的画像构建逻辑之前,得先了解一下用户画像中包含哪些字段,下面是直接从mongo中查出来的 
-image-20211203163848668 +image-20211203163848668 从上面可以看出,主要是用户的基本信息和用户历史信息相关的一些标签,对于用户的基本属性特征这个可以直接从注册表中获取,那么对于跟用户历史阅读相关的信息,需要统计用户历史的所有阅读、喜欢和收藏的新闻详细信息。为了得到跟用户历史兴趣相关的信息,我们需要对用户的历史阅读、喜欢和收藏这几个历史记录给存起来,其实这些信息都可以从日志信息中获取得到,但是这里有个工程上的事情得先说明一下,先看下面这个图,对于每个用户点进一篇新闻的详情页 -image-20211203164332062 +image-20211203164332062 最底部有个喜欢和收藏,这个前端展示的结果是从后端获取的数据,那就意味着后端需要维护一个用户历史点击及收藏过的文章列表,这里我们使用了mysql来存储,主要是怕redis不够用。其实这两个表不仅仅可以用来前端展示用的,还可以用来分析用户的画像,这都给我们整理好了用户历史喜欢和收藏了。 @@ -622,7 +622,7 @@ echo " " **crontab定时任务:** -![image-20211203172613512](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211203172613512.png) +![image-20211203172613512](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211203172613512.png) 将定时任务拆解一下: diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.2.3 \345\211\215\345\220\216\347\253\257\344\272\244\344\272\222.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.2.3 \345\211\215\345\220\216\347\253\257\344\272\244\344\272\222.md" index e251e6515..e736c0fa1 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.2.3 \345\211\215\345\220\216\347\253\257\344\272\244\344\272\222.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.2.3 \345\211\215\345\220\216\347\253\257\344\272\244\344\272\222.md" @@ -6,7 +6,7 @@ 
下面主要展现的是项目的整体部分,主要分为推荐页,热门页以及新闻详情页。 -image-20211203154557244image-20211203155028564image-20211203155058020 +image-20211203154557244image-20211203155028564image-20211203155058020 diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.3.1 \346\216\250\350\215\220\347\263\273\347\273\237\346\265\201\347\250\213\347\232\204\346\236\204\345\273\272.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.3.1 \346\216\250\350\215\220\347\263\273\347\273\237\346\265\201\347\250\213\347\232\204\346\236\204\345\273\272.md" index 3beac83f1..b2cce14f1 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.3.1 \346\216\250\350\215\220\347\263\273\347\273\237\346\265\201\347\250\213\347\232\204\346\236\204\345\273\272.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.3.1 \346\216\250\350\215\220\347\263\273\347\273\237\346\265\201\347\250\213\347\232\204\346\236\204\345\273\272.md" @@ -1,6 +1,6 @@ -![](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片Untitled.png) +![](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片Untitled.png) 本篇文章主要是讲解推荐系统流程构建,主要包括Offline和Online两个部分。 diff --git "a/docs/\347\254\254\344\272\214\347\253\240 
\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.5.1 DSSM\345\217\254\345\233\236.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.5.1 DSSM\345\217\254\345\233\236.md" index 166009600..d354407c5 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.5.1 DSSM\345\217\254\345\233\236.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/docs/2.2.5.1 DSSM\345\217\254\345\233\236.md" @@ -12,7 +12,7 @@ DSSM(Deep Structured Semantic Model)是由微软研究院于CIKM在2013年提出 ### **DSSM 模型结构** -![image-20220224100424897](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20220224100424897.png) +![image-20220224100424897](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20220224100424897.png) 上图是DSSM模型的结构,该网络结构比较简单,是一个由几层DNN组成网络,我们将要搜索文本(Query)和要匹配的文本(Document)的 embedding 输入到网络,网络输出为 128 维的向量,然后通过向量之间计算余弦相似度来计算向量之间距离,可以看作每一个 query 和 document 之间相似分数,然后在做 softmax。 @@ -28,7 +28,7 @@ DSSM(Deep Structured Semantic Model)是由微软研究院于CIKM在2013年提出 该模型主要是将上述模型中的两个“塔”改为独立的 user 和 item 两个子网络,大概结构如下: -![img](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片v2-f7ecbf1faf7899c6e2999182055470fb_720w.jpg) +![img](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片v2-f7ecbf1faf7899c6e2999182055470fb_720w.jpg) 
其结构非常简单,如上图所示,左侧是用户塔,右侧是Item塔。在用户侧结构中,其输入为用户侧特征(用户画像信息、统计属性以及历史行为序列等);在用户侧结构中,其输入为Item相关特征(Item基本信息、属性信息等)。对于这两个塔本身,则是经典的DNN模型,在训练过程中,其输入由特征OneHot到特征Embedding,再经过几层DNN隐层,两个塔分别输出user embedding和item embedding,最后这两个embedding做内积或者Cosine相似度计算,使得user和item在embedding映射到共同维度的语义空间中。 @@ -38,7 +38,7 @@ DSSM(Deep Structured Semantic Model)是由微软研究院于CIKM在2013年提出 该模型主要的改进是在user塔和Item塔的特征Embedding层上,各自加入一个SENet模块,借助SENet网络用来动态地学习特征的重要性,根据得到的特征权重与对应特征的embedding相乘,进而达到放大重要特征或抑制无效特征的目的,模型大致结构如下所示: -![img](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片v2-8766fee1b442ed17111d5822033f960f_720w.jpg) +![img](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片v2-8766fee1b442ed17111d5822033f960f_720w.jpg) 其模型和朴素DSSM模型的区别在于多加了一个SENet网络,该网络主要是将特征的 embedding 通过 Squeeze 和Excitation 两个阶段得到一个权重向量,在用该向量与特征的embeding对应为相乘,挑选出最要特征之后在进入到朴素的DSSM网络中。 而 SENet 之所以起作用的原因,张俊林老师的解释是 SENet 可以突出那些对高层 User embedding 和 Item embedding 的特征交叉起重要作用的特征,更有利于表达两侧的特征交互,避免单侧无效特征经过DNN双塔非线性融合时带来的噪声,同时又带有非线性的作用。关于SENet网络详细内容可以查看[原文](https://arxiv.org/abs/1709.01507) @@ -48,7 +48,7 @@ DSSM(Deep Structured Semantic Model)是由微软研究院于CIKM在2013年提出 该模型是Youtube于2019年在RecSys发表的一篇工作,这个模型从结构上来看是最普通的双塔。左边是user塔,输入包括两部分,第一部分是user当前正在观看的视频的特征,第二部分user的特征是用户历史行为的统计量,例如用户最近观看的N条视频的id embedding均值,这两部分融合起来一起输入user侧的输入。右边是item塔,将候选视频的特征作为输入,计算item的 embedding。之后也是再计算两侧embedding的相似度,进行学习。 模型的大致结构如下所示: -![image-20220224100307472](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20220224100307472.png) +![image-20220224100307472](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20220224100307472.png) 对于该模型,重点并不在于结构上的改变,而是对于负采样问题。因为召回的过程可以被视为是一个多分类问题,模型的输出层选择softmax计算后再计算交叉熵损失。但问题是当候选item特别多的时候,无法对所有的item进行softmax,因此通常的做法是随机从全量item中采样出一个batch的item进行softmax。但是使用batch内的样本作为彼此负样本会带来非常大的偏置问题,即对于热门的样本,被当作负样本的概率更高,因此该模型的贡献在于如何减小batch内负采样所带来的偏置问题? 
关于paper的详细内容可以查看[原文](https://dl.acm.org/doi/10.1145/3298689.3346996) diff --git "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/readme.md" "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/readme.md" index 2632563b5..2aa49bd87 100644 --- "a/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/readme.md" +++ "b/docs/\347\254\254\344\272\214\347\253\240 \346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/2.2\346\226\260\351\227\273\346\216\250\350\215\220\347\263\273\347\273\237\345\256\236\346\210\230/readme.md" @@ -68,4 +68,4 @@ github上给出了参考资料,其实也是用来作为查询的,因为每 如果大家最终在学习完本次的组队学习内容,可以理解下面这张流程图的话,那基本上就很不错了。因为内容真的比较多,而且比较偏向实战,如果要真的弄懂里面的详细流程需要大家花不少时间在看源码上面。 -![image-20211203193754525](http://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211203193754525.png) +![image-20211203193754525](https://ryluo.oss-cn-chengdu.aliyuncs.com/图片image-20211203193754525.png) diff --git a/readme.md b/readme.md index bca478302..ab880d26d 100644 --- a/readme.md +++ b/readme.md @@ -19,7 +19,7 @@ 为了方便学习和交流,**我们建立了FunRec学习社区(微信群+知识星球)**,微信群方便大家平时日常交流和讨论,知识星球方便沉淀内容。由于我们的内容面向的人群主要是学生,所以**知识星球永久免费**,感兴趣的可以加入星球讨论(加入星球的同学先看置定的必读帖)!**FunRec学习社区内部会不定期分享(FunRec社区中爱分享的同学)技术总结、个人管理等内容,[跟技术相关的分享内容都放在了B站](https://space.bilibili.com/431850986/channel/collectiondetail?sid=339597)上面**。由于微信群的二维码只有7天内有效,所以直接加下面这个微信,备注:**Fun-Rec**,会被拉到Fun-Rec交流群,如果觉得微信群比较吵建议直接加知识星球!。
-image-20220408193745249 +image-20220408193745249
**注意:不建议直接在github上面阅读(公式图片容易解析错误),推荐点击上面的在线阅读或者离线下载下来之后使用markdown工具(如typora)查看!** @@ -136,15 +136,15 @@ [2.1 竞赛实践(天池入门赛-新闻推荐)](https://tianchi.aliyun.com/competition/entrance/531842/forum)
- image-20211213165802957 - image-20211213165847593 + image-20211213165802957 + image-20211213165847593
**2.2 新闻推荐系统实践前端展示和后端逻辑(项目没有任何商用价值仅供入门者学习)**
- image-20211205142026937 - Fun-Rec新闻推荐系统 + image-20211205142026937 + Fun-Rec新闻推荐系统