kmeans

cr-mao · Sep 18, 2024 · 384a8fc · 384a8fc
1 parent a88d13b
commit 384a8fc
Show file tree

Hide file tree

Showing 13 changed files with 2,659 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,8 @@
-# 我的ai相关学习笔记
+# machine learning
 
-- ai相关数学知识
+数据分析、挖掘， 机器学习等学习笔记
+
+- 数学基础
 - 数据分析与可视化
 - 机器学习
 - python
@@ -88,17 +90,25 @@ jupyter notebook ,numpy,pandas,matplotlib
     - [逻辑回归理论、公式](machinelearning/逻辑回归.md)
     - [sigmoid函数](machinelearning/logisticRegression/01-WhatIsLogisticRegression/01-What-is-Logistic-Regression.ipynb)
     - [实现逻辑回归](machinelearning/logisticRegression/02-ImplementLogisticRegression/implementLogisticRegression.ipynb)
-
+    - [决策边界](machinelearning/logisticRegression/03-DecisionBoundary/Decision-Boundary.ipynb)
+    - [添加多项式](machinelearning/logisticRegression/04-PolynomialFeaturesInLogisticRegression/polynomialFeaturesInLogisticRegression.ipynb)
+    - [scikit-learn中的逻辑回归](machinelearning/logisticRegression/05-logisticRegressionInScikitLearn/logisticRegressionInScikitLearn.ipynb)
+    - [解决多分类问题](machinelearning/logisticRegression/06-OvrAndOvo/ovrAndOvo.ipynb)
+- 评价分类结果 
+  - [实现混淆矩阵，精准率和召回率](machinelearning/classificationPerformanceMeasures/01-implementConfusionMatrixPrecisionAndRecall/Implement-Confusion-Matrix-Precision-and-Recall.ipynb)
+  - [F1 score](machinelearning/classificationPerformanceMeasures/02-F1Score/F1Score.ipynb)
+  - [精准度和召回率的平衡](machinelearning/classificationPerformanceMeasures/03-PrecisionRecallTradeoff/precisionRecallTradeoff.ipynb)
+
+- k-means
+  - [k-means理论](machinelearning/Kmeans.md)
+  - [特征降维、kmeans实践](machinelearning/kmeans/kmeans.ipynb)
 - 朴素贝叶斯
-
-### 案例
-
-#### 推荐系统相关
-
-- [推荐系统快速入门](machinelearning/推荐系统入门.md)
-- [用户口味、余弦相似性](machinelearning/recommand/01consine_simiartiy/consine_similarty.ipynb)
-- [用户消费能力、标准化欧式距离](machinelearning/recommand/02distance/distance.ipynb)
-- [NearestNeighbors、余弦相似性找出最相似的用户](machinelearning/recommand/03NearestNeighborsAndConsineSimiarity/NearestNeighbors_and_consine_simiarity.ipynb)
+- 神经网络
+- 推荐系统相关
+  - [推荐系统快速入门](machinelearning/推荐系统入门.md)
+  - [用户口味、余弦相似性](machinelearning/recommand/01consine_simiartiy/consine_similarty.ipynb)
+  - [用户消费能力、标准化欧式距离](machinelearning/recommand/02distance/distance.ipynb)
+  - [NearestNeighbors、余弦相似性找出最相似的用户](machinelearning/recommand/03NearestNeighborsAndConsineSimiarity/NearestNeighbors_and_consine_simiarity.ipynb)
 
 ## links
 

diff --git a/datahandling/09-NumpyTest/numpy_test.ipynb b/datahandling/09-NumpyTest/numpy_test.ipynb
@@ -0,0 +1,291 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2024-09-13T09:25:05.185328Z",
+     "start_time": "2024-09-13T09:25:05.174999Z"
+    }
+   },
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "persontype = np.dtype({\n",
+    "    'names':['name', 'chinese', 'english', 'math'],\n",
+    "    'formats':['S32', 'i', 'i', 'i']})\n",
+    "\n",
+    "peoples = np.array([(\"ZhangFei\",32,75,100),(\"GuanYu\",24,85,96),\n",
+    "       (\"ZhaoYun\",28,85,92),(\"HuangZhong\",29,65,85)],dtype=persontype)\n",
+    "\n",
+    "print(peoples)"
+   ],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(b'ZhangFei', 32, 75, 100) (b'GuanYu', 24, 85,  96)\n",
+      " (b'ZhaoYun', 28, 85,  92) (b'HuangZhong', 29, 65,  85)]\n"
+     ]
+    }
+   ],
+   "execution_count": 2
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-13T09:25:31.525656Z",
+     "start_time": "2024-09-13T09:25:31.522645Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "print(peoples[:][\"chinese\"])",
+   "id": "9efaccd6aa05aeb7",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[32 24 28 29]\n"
+     ]
+    }
+   ],
+   "execution_count": 3
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-13T09:26:04.726637Z",
+     "start_time": "2024-09-13T09:26:04.721970Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "print(np.average(peoples[:][\"chinese\"]))",
+   "id": "851dd0377e1cd22d",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "28.25\n"
+     ]
+    }
+   ],
+   "execution_count": 4
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-13T09:26:25.509957Z",
+     "start_time": "2024-09-13T09:26:25.505608Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "print(np.amin(peoples[:][\"chinese\"]))",
+   "id": "1d73901d0834b9a1",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "24\n"
+     ]
+    }
+   ],
+   "execution_count": 5
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-13T09:26:31.508430Z",
+     "start_time": "2024-09-13T09:26:31.504455Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "print(np.amax(peoples[:][\"chinese\"]))",
+   "id": "12ca85a02cc88ad0",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "32\n"
+     ]
+    }
+   ],
+   "execution_count": 6
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-13T09:27:08.461523Z",
+     "start_time": "2024-09-13T09:27:08.458620Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "print(np.var(peoples[:][\"chinese\"]))",
+   "id": "fd4dd57ccd300ab7",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "8.1875\n"
+     ]
+    }
+   ],
+   "execution_count": 8
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-13T09:27:14.380705Z",
+     "start_time": "2024-09-13T09:27:14.377686Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "print(np.std(peoples[:][\"chinese\"]))\n",
+   "id": "61a4a51025df52cf",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2.8613807855648994\n"
+     ]
+    }
+   ],
+   "execution_count": 9
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-13T09:29:10.950024Z",
+     "start_time": "2024-09-13T09:29:10.936537Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "persontype = np.dtype({'names':['name','chinese','english','math','total'],'formats':['S32','i','i','i','i']})\n",
+    "peoples[:]['total'] = peoples[:]['chinese']+peoples[:]['english']+peoples[:]['math']\n"
+   ],
+   "id": "ff16709efce98a81",
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "no field of name total",
+     "output_type": "error",
+     "traceback": [
+      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[0;31mValueError\u001B[0m                                Traceback (most recent call last)",
+      "Cell \u001B[0;32mIn[12], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m \u001B[43mpeoples\u001B[49m\u001B[43m[\u001B[49m\u001B[43m:\u001B[49m\u001B[43m]\u001B[49m\u001B[43m[\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mtotal\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m]\u001B[49m \u001B[38;5;241m=\u001B[39m peoples[:][\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mchinese\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;241m+\u001B[39mpeoples[:][\u001B[38;5;124m'\u001B[39m\u001B[38;5;124menglish\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;241m+\u001B[39mpeoples[:][\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mmath\u001B[39m\u001B[38;5;124m'\u001B[39m]\n",
+      "\u001B[0;31mValueError\u001B[0m: no field of name total"
+     ]
+    }
+   ],
+   "execution_count": 12
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-13T09:44:17.954933Z",
+     "start_time": "2024-09-13T09:44:17.943792Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# 学员成绩数据\n",
+    "scores = np.array([[80, 85, 90],\n",
+    "                   [75, 88, 93],\n",
+    "                   [85, 82, 87],\n",
+    "                   [90, 95, 88],\n",
+    "                   [88, 92, 90]])\n",
+    "\n",
+    "# 计算平均成绩\n",
+    "avg_scores = np.mean(scores, axis=0)\n",
+    "\n",
+    "# 计算最小成绩\n",
+    "min_scores = np.min(scores, axis=0)\n",
+    "\n",
+    "# 计算最大成绩\n",
+    "max_scores = np.max(scores, axis=0)\n",
+    "\n",
+    "# 计算方差\n",
+    "var_scores = np.var(scores, axis=0)\n",
+    "\n",
+    "# 计算标准差\n",
+    "std_scores = np.std(scores, axis=0)\n",
+    "\n",
+    "# 计算总成绩并排序\n",
+    "total_scores = np.sum(scores, axis=1)\n",
+    "ranking = np.argsort(-total_scores) + 1\n",
+    "\n",
+    "# 输出结果\n",
+    "print(\"平均成绩：\", avg_scores)\n",
+    "print(\"最小成绩：\", min_scores)\n",
+    "print(\"最大成绩：\", max_scores)\n",
+    "print(\"方差：\", var_scores)\n",
+    "print(\"标准差：\", std_scores)\n",
+    "print(\"\\n总成绩排序：\")\n",
+    "for rank, total_score in zip(ranking, total_scores):\n",
+    "    print(f\"第{rank}名，总成绩：{total_score}\")"
+   ],
+   "id": "8198d2785e589c97",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "平均成绩： [83.6 88.4 89.6]\n",
+      "最小成绩： [75 82 87]\n",
+      "最大成绩： [90 95 93]\n",
+      "方差： [29.84 21.84  4.24]\n",
+      "标准差： [5.46260011 4.67332858 2.05912603]\n",
+      "\n",
+      "总成绩排序：\n",
+      "第4名，总成绩：255\n",
+      "第5名，总成绩：256\n",
+      "第2名，总成绩：254\n",
+      "第1名，总成绩：273\n",
+      "第3名，总成绩：270\n"
+     ]
+    }
+   ],
+   "execution_count": 13
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "",
+   "id": "6176fcc1dc633cf0"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/datahandling/code/pandas_base.py b/datahandling/code/pandas_base.py
@@ -111,7 +111,7 @@
 
 """从Series字典来创建DataFrame"""
 #创建一个字典，字典的每个值都是通过pd.Series格式化为一维数组并指定index
-d = {'one':pd.Series([1,2,3],index=['a','b','c']),'two':pd.Series([1,2,3,4],index=['a','b','c','d'])}
+d = {'one':pd.Series ([1,2,3],index=['a','b','c']),'two':pd.Series([1,2,3,4],index=['a','b','c','d'])}
 #使用DataFrame格式化
 df = pd.DataFrame(d)
 #返回二维列表

diff --git a/datahandling/数据领域中的专业术语.md b/datahandling/数据领域中的专业术语.md
@@ -8,7 +8,13 @@
 
 ### MSE
 
-用来衡量统计学上的距离，然比比较两个结果的不同程度
+用来衡量统计学上的距离，然后比较两个结果的不同程度。
+
+均方误差（Mean Squared Error）：误差的平方和除以样本数 n，即 $MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$
+
+### SSE
+
+误差平方和（Sum of Squared Errors）：所有误差的平方累加，即 $SSE = \sum_{i=1}^{n}(y_i - \hat{y}_i)^2 = n \cdot MSE$
 
 ### MAE
 
@@ -146,4 +152,9 @@
 4. 模型评估
    - 是否预测正确数值
    - 是否拟合了足够的信息
-   - 可以可视化数据，探究数据特征
+   - 可以可视化数据，探究数据特征
+
+
+### 数据挖掘的流程
+
+在数据挖掘中，有几个非常重要的任务，就是分类、聚类、预测和关联分析