{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# 数据预处理的三大神器 Numpy, Pandas, Matplotlib\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"'''\n",
"数据集说明:\n",
"1、数据文件保存在当前目录下\\r\\n\n",
"2、文件名为data.csv\\n\n",
"3、改文件存储着部分客户的国籍、年龄、收入信息(隐去了客户身份),以及是否购买的标签信息\\n\n",
"4、对应上述信息,文件中columns为:Country,Age,Salary,Purchased,其中Purchased是标签列\\n\n",
"5、数据文件中字段分隔符为tab符“\\t”\n",
"'''\n",
"file = \"./data.csv\"\n",
"dataset = pd.read_csv(file,sep=\"\\t\")#请在空格处指定文件分隔符"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Country | \n",
" Age | \n",
" Salary | \n",
" Purchased | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" France | \n",
" 44.0 | \n",
" 72000.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" Spain | \n",
" 27.0 | \n",
" 48000.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" Germany | \n",
" 30.0 | \n",
" 54000.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" Spain | \n",
" 38.0 | \n",
" 61000.0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" Germany | \n",
" 40.0 | \n",
" NaN | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Country Age Salary Purchased\n",
"0 France 44.0 72000.0 0\n",
"1 Spain 27.0 48000.0 1\n",
"2 Germany 30.0 54000.0 0\n",
"3 Spain 38.0 61000.0 0\n",
"4 Germany 40.0 NaN 1"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#========================================================\n",
"#题目1:请在下方空格填写对应代码,查看数据集列标签\n",
"dataset.______ "
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Age | \n",
" Salary | \n",
" Purchased | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 18.000000 | \n",
" 18.000000 | \n",
" 20.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 33.222222 | \n",
" 77888.888889 | \n",
" 0.550000 | \n",
"
\n",
" \n",
" std | \n",
" 10.212769 | \n",
" 61330.242889 | \n",
" 0.510418 | \n",
"
\n",
" \n",
" min | \n",
" 18.000000 | \n",
" 5000.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 24.000000 | \n",
" 52250.000000 | \n",
" 0.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 35.000000 | \n",
" 66500.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 39.500000 | \n",
" 84500.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" max | \n",
" 52.000000 | \n",
" 285000.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Age Salary Purchased\n",
"count 18.000000 18.000000 20.000000\n",
"mean 33.222222 77888.888889 0.550000\n",
"std 10.212769 61330.242889 0.510418\n",
"min 18.000000 5000.000000 0.000000\n",
"25% 24.000000 52250.000000 0.000000\n",
"50% 35.000000 66500.000000 1.000000\n",
"75% 39.500000 84500.000000 1.000000\n",
"max 52.000000 285000.000000 1.000000"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#========================================================\n",
"#题目2:请在下方空格填写对应代码,查看数据集的基本情况\n",
"dataset.______"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"数据集缺失值情况统计:\n",
" Age 2\n",
"Salary 2\n",
"dtype: int64\n"
]
}
],
"source": [
"# 统计数据集中每一列包含缺失值的数量\n",
"print('数据集缺失值情况统计:\\n', dataset.isnull().sum()[dataset.isnull().sum() != 0])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"数据集缺失值情况统计:\n",
" Salary 2\n",
"dtype: int64\n"
]
}
],
"source": [
"#从上面的结果可以看出,在Age和Salary中存在空值\n",
"#========================================================\n",
"#题目3:请在下方空格填写对应代码,对Age进行均值填充\n",
"dataset['Age'].fillna(_______________), inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"数据集缺失值情况统计:\n",
" Series([], dtype: int64)\n"
]
}
],
"source": [
"#题目4:对Salary进行中位数填充 易\n",
"dataset['Salary'].fillna(_____________), inplace=True) "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"#题目5:下面方法实现了最大最小标准化,请在下方空格处填写代码,实现最大最小标准化计算\n",
"x_std = ____________________# 请在空格处填写最大最小标准化公式实现"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"#题目6:下面方法实现了z-score标准化,请在下方空格处填写代码,实现z-score标准化计算\n",
"x_std = __________________________ # 请在空格处填写最大最小标准化公式实现 "
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"18.0\n",
"52.0\n",
"自定义最大最小标准化:\n",
" 0 0.764706\n",
"1 0.264706\n",
"2 0.352941\n",
"3 0.588235\n",
"4 0.647059\n",
"5 0.500000\n",
"6 0.447712\n",
"7 0.447712\n",
"8 0.941176\n",
"9 0.647059\n",
"10 0.500000\n",
"11 0.117647\n",
"12 0.588235\n",
"13 0.058824\n",
"14 0.058824\n",
"15 0.352941\n",
"16 0.147059\n",
"17 1.000000\n",
"18 0.000000\n",
"19 0.529412\n",
"Name: Age, dtype: float64\n"
]
}
],
"source": [
"#题目7:请调用自定义函数,对Age进行最大最小标准化\n",
"age_std = ______________________________"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"自定义z-score标准化:\n",
" 0 -0.083853\n",
"1 -0.507531\n",
"2 -0.401611\n",
"3 -0.278039\n",
"4 -0.180946\n",
"5 -0.330998\n",
"6 -0.436918\n",
"7 0.039720\n",
"8 0.110333\n",
"9 0.410438\n",
"10 -1.213660\n",
"11 -0.789983\n",
"12 -0.030893\n",
"13 0.145639\n",
"14 -0.419264\n",
"15 0.763503\n",
"16 -1.266620\n",
"17 0.975342\n",
"18 -0.180946\n",
"19 3.676287\n",
"Name: Salary, dtype: float64\n"
]
}
],
"source": [
"#题目8:请调用自定义函数,对Salary进行z-score标准化\n",
"salary_std = _________________________"
]
},
{
"cell_type": "code",
"execution_count": 535,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sklearn的z-score标准化:\n",
" [[-0.08385289]\n",
" [-0.50753067]\n",
" [-0.40161123]\n",
" [-0.27803854]\n",
" [-0.18094572]\n",
" [-0.33099827]\n",
" [-0.43691771]\n",
" [ 0.03971979]\n",
" [ 0.11033276]\n",
" [ 0.41043785]\n",
" [-1.21366031]\n",
" [-0.78998253]\n",
" [-0.03089317]\n",
" [ 0.14563924]\n",
" [-0.41926447]\n",
" [ 0.76350267]\n",
" [-1.26662003]\n",
" [ 0.97534156]\n",
" [-0.18094572]\n",
" [ 3.6762874 ]]\n"
]
}
],
"source": [
"#题目9:请调用sklearn,对Salary进行z-score标准化\n",
"z_score_std = ____________"
]
},
{
"cell_type": "code",
"execution_count": 536,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Country | \n",
" Purchased | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" France | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" Spain | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" Germany | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" Spain | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" Germany | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Country Purchased\n",
"0 France 0\n",
"1 Spain 1\n",
"2 Germany 0\n",
"3 Spain 0\n",
"4 Germany 1"
]
},
"execution_count": 536,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#前面代码我们对Age和Salary做了不同的标准化,结果存储在age_std和salary_std。\n",
"#题目10:要求从原始数据集dataset中删除Age和Salary两列\n",
"dataset.drop(____,axis= __,inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 537,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Country | \n",
" Purchased | \n",
" Age_std | \n",
" Salary_std | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" France | \n",
" 0 | \n",
" 0.764706 | \n",
" -0.083853 | \n",
"
\n",
" \n",
" 1 | \n",
" Spain | \n",
" 1 | \n",
" 0.264706 | \n",
" -0.507531 | \n",
"
\n",
" \n",
" 2 | \n",
" Germany | \n",
" 0 | \n",
" 0.352941 | \n",
" -0.401611 | \n",
"
\n",
" \n",
" 3 | \n",
" Spain | \n",
" 0 | \n",
" 0.588235 | \n",
" -0.278039 | \n",
"
\n",
" \n",
" 4 | \n",
" Germany | \n",
" 1 | \n",
" 0.647059 | \n",
" -0.180946 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Country Purchased Age_std Salary_std\n",
"0 France 0 0.764706 -0.083853\n",
"1 Spain 1 0.264706 -0.507531\n",
"2 Germany 0 0.352941 -0.401611\n",
"3 Spain 0 0.588235 -0.278039\n",
"4 Germany 1 0.647059 -0.180946"
]
},
"execution_count": 537,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#题目11:将标准化后的age_std和salary_std两个字段添加到dataset中,指定column名为‘Age_std’和‘Salary_std’\n",
"___________\n",
"___________ "
]
},
{
"cell_type": "code",
"execution_count": 538,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[0. 0. 1. 0. 0. 0.]\n",
" [0. 0. 0. 0. 0. 1.]\n",
" [0. 0. 0. 1. 0. 0.]\n",
" [0. 0. 0. 0. 0. 1.]\n",
" [0. 0. 0. 1. 0. 0.]\n",
" [0. 0. 1. 0. 0. 0.]\n",
" [0. 0. 0. 0. 0. 1.]\n",
" [0. 0. 1. 0. 0. 0.]\n",
" [0. 0. 0. 1. 0. 0.]\n",
" [1. 0. 0. 0. 0. 0.]\n",
" [0. 0. 0. 0. 1. 0.]\n",
" [0. 0. 0. 0. 0. 1.]\n",
" [0. 1. 0. 0. 0. 0.]\n",
" [1. 0. 0. 0. 0. 0.]\n",
" [0. 0. 0. 1. 0. 0.]\n",
" [1. 0. 0. 0. 0. 0.]\n",
" [0. 0. 0. 0. 1. 0.]\n",
" [0. 0. 0. 0. 0. 1.]\n",
" [0. 1. 0. 0. 0. 0.]\n",
" [1. 0. 0. 0. 0. 0.]]\n"
]
}
],
"source": [
"#数据集中国籍Country字段为字符串,即类别数据,模型训练前需要对字符串类别类字段进行数值化编码\n",
"#题目12:请调用sklearn相关方法对Country进行one-hot编码\n",
"\n",
"onehot_label=________________________________________ # 请在空格中填写实现代码"
]
},
{
"cell_type": "code",
"execution_count": 539,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Country | \n",
" Purchased | \n",
" Age_std | \n",
" Salary_std | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" France | \n",
" 0 | \n",
" 0.764706 | \n",
" -0.083853 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" Spain | \n",
" 1 | \n",
" 0.264706 | \n",
" -0.507531 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 2 | \n",
" Germany | \n",
" 0 | \n",
" 0.352941 | \n",
" -0.401611 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" Spain | \n",
" 0 | \n",
" 0.588235 | \n",
" -0.278039 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 4 | \n",
" Germany | \n",
" 1 | \n",
" 0.647059 | \n",
" -0.180946 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Country Purchased Age_std Salary_std 0 1 2 3 4 5\n",
"0 France 0 0.764706 -0.083853 0.0 0.0 1.0 0.0 0.0 0.0\n",
"1 Spain 1 0.264706 -0.507531 0.0 0.0 0.0 0.0 0.0 1.0\n",
"2 Germany 0 0.352941 -0.401611 0.0 0.0 0.0 1.0 0.0 0.0\n",
"3 Spain 0 0.588235 -0.278039 0.0 0.0 0.0 0.0 0.0 1.0\n",
"4 Germany 1 0.647059 -0.180946 0.0 0.0 0.0 1.0 0.0 0.0"
]
},
"execution_count": 539,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#上面代码对Country字段进行了one-hot编码\n",
"#题目13:将onehot编码后的Country数字编码添加到dateset中,请在空格处指定按列进行合并,即扩展新列\n",
"dataset = pd.concat([dataset,country_code],axis=_____)"
]
},
{
"cell_type": "code",
"execution_count": 540,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2 5 3 5 3 2 5 2 3 0 4 5 1 0 3 0 4 5 1 0]\n"
]
}
],
"source": [
"#数据集中国籍Country字段为字符串,即类别数据,模型训练前需要对字符串类别类字段进行数值化编码\n",
"#题目14:请调用sklearn相关方法对Country进行数字化编码LabelEncoder\n",
"labels=_______________________________________"
]
},
{
"cell_type": "code",
"execution_count": 541,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Country | \n",
" Purchased | \n",
" Age_std | \n",
" Salary_std | \n",
" country_code | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" France | \n",
" 0 | \n",
" 0.764706 | \n",
" -0.083853 | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" Spain | \n",
" 1 | \n",
" 0.264706 | \n",
" -0.507531 | \n",
" 5 | \n",
"
\n",
" \n",
" 2 | \n",
" Germany | \n",
" 0 | \n",
" 0.352941 | \n",
" -0.401611 | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" Spain | \n",
" 0 | \n",
" 0.588235 | \n",
" -0.278039 | \n",
" 5 | \n",
"
\n",
" \n",
" 4 | \n",
" Germany | \n",
" 1 | \n",
" 0.647059 | \n",
" -0.180946 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Country Purchased Age_std Salary_std country_code\n",
"0 France 0 0.764706 -0.083853 2\n",
"1 Spain 1 0.264706 -0.507531 5\n",
"2 Germany 0 0.352941 -0.401611 3\n",
"3 Spain 0 0.588235 -0.278039 5\n",
"4 Germany 1 0.647059 -0.180946 3"
]
},
"execution_count": 541,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#上面代码对Country字段进行了LabelEncoder编码\n",
"#题目15:将LabelEncoder编码后的Country数字编码添加到dateset中,请在空格处指定按列进行合并,即扩展新列\n",
"dataset = pd.concat([dataset,country_code],axis=_____)"
]
},
{
"cell_type": "code",
"execution_count": 542,
"metadata": {},
"outputs": [],
"source": [
"# 选择某种编码后的数据集进行下一步处理\n",
"dataset = dataset_label\n",
"# dataset = dataset_one_hot"
]
},
{
"cell_type": "code",
"execution_count": 543,
"metadata": {},
"outputs": [],
"source": [
"#删除Country列\n",
"dataset = dataset.drop('Country', axis=1) #两种答案"
]
},
{
"cell_type": "code",
"execution_count": 544,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Purchased | \n",
" Age_std | \n",
" Salary_std | \n",
" country_code | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0.764706 | \n",
" -0.083853 | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 0.264706 | \n",
" -0.507531 | \n",
" 5 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0.352941 | \n",
" -0.401611 | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0.588235 | \n",
" -0.278039 | \n",
" 5 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 0.647059 | \n",
" -0.180946 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Purchased Age_std Salary_std country_code\n",
"0 0 0.764706 -0.083853 2\n",
"1 1 0.264706 -0.507531 5\n",
"2 0 0.352941 -0.401611 3\n",
"3 0 0.588235 -0.278039 5\n",
"4 1 0.647059 -0.180946 3"
]
},
"execution_count": 544,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 545,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"特征集:\n",
" [[ 0.76470588 -0.08385289 2. ]\n",
" [ 0.26470588 -0.50753067 5. ]\n",
" [ 0.35294118 -0.40161123 3. ]\n",
" [ 0.58823529 -0.27803854 5. ]\n",
" [ 0.64705882 -0.18094572 3. ]\n",
" [ 0.5 -0.33099827 2. ]\n",
" [ 0.44771242 -0.43691771 5. ]\n",
" [ 0.44771242 0.03971979 2. ]\n",
" [ 0.94117647 0.11033276 3. ]\n",
" [ 0.64705882 0.41043785 0. ]\n",
" [ 0.5 -1.21366031 4. ]\n",
" [ 0.11764706 -0.78998253 5. ]\n",
" [ 0.58823529 -0.03089317 1. ]\n",
" [ 0.05882353 0.14563924 0. ]\n",
" [ 0.05882353 -0.41926447 3. ]\n",
" [ 0.35294118 0.76350267 0. ]\n",
" [ 0.14705882 -1.26662003 4. ]\n",
" [ 1. 0.97534156 5. ]\n",
" [ 0. -0.18094572 1. ]\n",
" [ 0.52941176 3.6762874 0. ]]\n",
"标签集:\n",
" [0 1 0 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1 0 1]\n"
]
}
],
"source": [
"#前面代码已经将类别型特征Country转换成数字编码特征country_code,\n",
"#题目16:请根据上述代码打印的dataset最新的结构,在下面空格处使用iloc对数据集进行切片,将特征列放入X,将标签列放入y\n",
"X = dataset.iloc[:,____:].values\n",
"y = dataset.iloc[:,0].values"
]
},
{
"cell_type": "code",
"execution_count": 546,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Purchased | \n",
" Age_std | \n",
" Salary_std | \n",
" country_code | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0.764706 | \n",
" -0.083853 | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 0.264706 | \n",
" -0.507531 | \n",
" 5 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0.352941 | \n",
" -0.401611 | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0.588235 | \n",
" -0.278039 | \n",
" 5 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 0.647059 | \n",
" -0.180946 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Purchased Age_std Salary_std country_code\n",
"0 0 0.764706 -0.083853 2\n",
"1 1 0.264706 -0.507531 5\n",
"2 0 0.352941 -0.401611 3\n",
"3 0 0.588235 -0.278039 5\n",
"4 1 0.647059 -0.180946 3"
]
},
"execution_count": 546,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.head()"
]
},
{
"cell_type": "code",
"execution_count": 547,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"特征集:\n",
" [[ 0.76470588 -0.08385289 2. ]\n",
" [ 0.26470588 -0.50753067 5. ]\n",
" [ 0.35294118 -0.40161123 3. ]\n",
" [ 0.58823529 -0.27803854 5. ]\n",
" [ 0.64705882 -0.18094572 3. ]\n",
" [ 0.5 -0.33099827 2. ]\n",
" [ 0.44771242 -0.43691771 5. ]\n",
" [ 0.44771242 0.03971979 2. ]\n",
" [ 0.94117647 0.11033276 3. ]\n",
" [ 0.64705882 0.41043785 0. ]\n",
" [ 0.5 -1.21366031 4. ]\n",
" [ 0.11764706 -0.78998253 5. ]\n",
" [ 0.58823529 -0.03089317 1. ]\n",
" [ 0.05882353 0.14563924 0. ]\n",
" [ 0.05882353 -0.41926447 3. ]\n",
" [ 0.35294118 0.76350267 0. ]\n",
" [ 0.14705882 -1.26662003 4. ]\n",
" [ 1. 0.97534156 5. ]\n",
" [ 0. -0.18094572 1. ]\n",
" [ 0.52941176 3.6762874 0. ]]\n",
"标签集:\n",
" [0 1 0 0 1 1 0 1 0 1 1 0 1 0 1 1 0 1 0 1]\n"
]
}
],
"source": [
"#前面代码已经将类别型特征Country转换成数字编码特征country_code,\n",
"#题目17:请根据上述代码打印的dataset最新的结构,在下面空格处使用iloc对数据集进行切片,将特征列放入X,将标签列放入y\n",
"X = dataset.loc[:,______:].values\n",
"y = dataset.loc[:,'Purchased'].values"
]
},
{
"cell_type": "code",
"execution_count": 548,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"训练集大小: 14\n",
"测试集大小: 6\n"
]
}
],
"source": [
"#拆分训练数据集和测试数据集\n",
"#题目18:调用sklearn函数实现对训练集和测试集的拆分,指定训练集:测试集为7:3\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, __________ = 0.3, random_state = 0)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"说明:本文件使用完整的一套示例代码,重点考察数据预处理相关知识,文件描述中包含18个代码填空试题,可以根据需要选取不同填空位置组合成不同试卷。"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}