参考资料:

* 黑马程序员相关教程
## 创建
import numpy as np
import pandas as pd

# Build a DataFrame from a NumPy array (row/column labels default to 0..n-1)
t = pd.DataFrame(np.arange(12).reshape(3, 4))
print(t)
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Specify the DataFrame's index and columns explicitly
t1 = np.arange(12).reshape(3, 4)
t2 = pd.DataFrame(t1, index=list("123"), columns=list("abcd"))
print(t2)
#    a  b   c   d
# 1  0  1   2   3
# 2  4  5   6   7
# 3  8  9  10  11

# Build a DataFrame from a list of dicts; a key missing from a dict shows up
# as NaN in that row (which is why the numeric columns print as floats)
d1 = {'name': "张三", 'age': 18, 'del': 10010}
d2 = {'name': '李四', 'del': 10086}
d3 = {'age': 17}
L = [d1, d2, d3]
t = pd.DataFrame(L)
print(t)
#   name   age      del
# 0   张三  18.0  10010.0
# 1   李四   NaN  10086.0
# 2  NaN  17.0      NaN
## 读取csv文件
# Load the CSV file into a DataFrame and print it
# (pandas elides middle rows/columns with "..." when the frame is large)
t = pd.read_csv("./train.csv")
print(t)
#      PassengerId  Survived  Pclass  ...     Fare  Cabin  Embarked
# 0              1         0       3  ...   7.2500    NaN         S
# 1              2         1       1  ...  71.2833    C85         C
# 2              3         1       3  ...   7.9250    NaN         S
# 3              4         1       1  ...  53.1000   C123         S
# 4              5         0       3  ...   8.0500    NaN         S
# ..           ...       ...     ...  ...      ...    ...       ...
# 886          887         0       2  ...  13.0000    NaN         S
# 887          888         1       1  ...  30.0000    B42         S
# 888          889         0       3  ...  23.4500    NaN         S
# 889          890         1       1  ...  30.0000   C148         C
# 890          891         0       3  ...   7.7500    NaN         Q
#
# [891 rows x 12 columns]
## DataFrame的属性
t = pd.read_csv("./train.csv")

print(t.index)  # row index of the DataFrame
# RangeIndex(start=0, stop=891, step=1)

print(t.columns)  # column index of the DataFrame
# Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
#        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
#       dtype='object')

print(t.shape)  # shape of the DataFrame as (rows, columns)
# (891, 12)

print(t.dtypes)  # data type of each column
# PassengerId      int64
# Survived         int64
# Pclass           int64
# Name            object
# Sex             object
# Age            float64
# SibSp            int64
# Parch            int64
# Ticket          object
# Fare           float64
# Cabin           object
# Embarked        object
# dtype: object

print(t.ndim)  # number of dimensions
# 2

print(t.head())  # first rows of the DataFrame (5 by default)
#    PassengerId  Survived  Pclass  ...     Fare  Cabin  Embarked
# 0            1         0       3  ...   7.2500    NaN         S
# 1            2         1       1  ...  71.2833    C85         C
# 2            3         1       3  ...   7.9250    NaN         S
# 3            4         1       1  ...  53.1000   C123         S
# 4            5         0       3  ...   8.0500    NaN         S
#
# [5 rows x 12 columns]

print(t.tail())  # last rows of the DataFrame (5 by default)
#      PassengerId  Survived  Pclass  ...   Fare  Cabin  Embarked
# 886          887         0       2  ...  13.00    NaN         S
# 887          888         1       1  ...  30.00    B42         S
# 888          889         0       3  ...  23.45    NaN         S
# 889          890         1       1  ...  30.00   C148         C
# 890          891         0       3  ...   7.75    NaN         Q
#
# [5 rows x 12 columns]

print(t.describe())  # summary statistics of the numeric columns
#        PassengerId    Survived      Pclass  ...       SibSp       Parch        Fare
# count   891.000000  891.000000  891.000000  ...  891.000000  891.000000  891.000000
# mean    446.000000    0.383838    2.308642  ...    0.523008    0.381594   32.204208
# std     257.353842    0.486592    0.836071  ...    1.102743    0.806057   49.693429
# min       1.000000    0.000000    1.000000  ...    0.000000    0.000000    0.000000
# 25%     223.500000    0.000000    2.000000  ...    0.000000    0.000000    7.910400
# 50%     446.000000    0.000000    3.000000  ...    0.000000    0.000000   14.454200
# 75%     668.500000    1.000000    3.000000  ...    1.000000    0.000000   31.000000
# max     891.000000    1.000000    3.000000  ...    8.000000    6.000000  512.329200
#
# [8 rows x 7 columns]
## DataFrame的排序
t = pd.read_csv("train.csv")

# Sort by the "Fare" column in DESCENDING order (ascending=False),
# so the most expensive tickets come first
print(t.sort_values(by="Fare", ascending=False))
#      PassengerId  Survived  Pclass  ...      Fare        Cabin  Embarked
# 258          259         1       1  ...  512.3292          NaN         C
# 737          738         1       1  ...  512.3292         B101         C
# 679          680         1       1  ...  512.3292  B51 B53 B55         C
# 88            89         1       1  ...  263.0000  C23 C25 C27         S
# 27            28         0       1  ...  263.0000  C23 C25 C27         S
# ..           ...       ...     ...  ...       ...          ...       ...
# 633          634         0       1  ...    0.0000          NaN         S
# 413          414         0       2  ...    0.0000          NaN         S
# 822          823         0       1  ...    0.0000          NaN         S
# 732          733         0       2  ...    0.0000          NaN         S
# 674          675         0       2  ...    0.0000          NaN         S
#
# [891 rows x 12 columns]
## 索引

### 行索引
t = pd.DataFrame(np.arange(12).reshape(3, 4))

# Slicing with [] selects ROWS; the result is still a DataFrame
print(t[:2])
print(type(t[:2]))
#    0  1  2  3
# 0  0  1  2  3
# 1  4  5  6  7
# <class 'pandas.core.frame.DataFrame'>
### 列索引
t = pd.DataFrame(np.arange(12).reshape(3, 4))

# A single label inside [] selects a COLUMN; the result is a Series
# whose name is the column label
print(t[1])
print(type(t[1]))
# 0    1
# 1    5
# 2    9
# Name: 1, dtype: int32
# <class 'pandas.core.series.Series'>
# (the dtype prints as int64 where NumPy's default integer is 64-bit)
### 通过标签索引
t = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=list("abc"),
    columns=list("WXYZ"),
)
print(t)
#    W  X   Y   Z
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11

# One row by label, all columns -> Series
print(t.loc['a', :])
# W    0
# X    1
# Y    2
# Z    3
# Name: a, dtype: int32

# One cell by (row label, column label) -> scalar
print(t.loc['b', 'X'])
# 5

# Label slices INCLUDE the label on the right-hand side of the colon
print(t.loc['a':'c', 'W':'X'])
#    W  X
# a  0  1
# b  4  5
# c  8  9
### 通过位置索引
t = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=list("abc"),
    columns=list("WXYZ"),
)
print(t)
#    W  X   Y   Z
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11

# One row by integer position, all columns -> Series
print(t.iloc[0, :])
# W    0
# X    1
# Y    2
# Z    3
# Name: a, dtype: int32

# One cell by (row position, column position) -> scalar
print(t.iloc[1, 1])
# 5

# Position slices EXCLUDE the position on the right-hand side of the colon
print(t.iloc[0:2, 0:1])
#    W
# a  0
# b  4
### 布尔索引
t = pd.read_csv("train.csv")

# Combine element-wise conditions with & and wrap each one in parentheses;
# only the rows where the combined mask is True are kept
print(t[(t["Fare"] > 200) & (t["Embarked"] == 'S')])
#      PassengerId  Survived  Pclass  ...      Fare        Cabin  Embarked
# 27            28         0       1  ...  263.0000  C23 C25 C27         S
# 88            89         1       1  ...  263.0000  C23 C25 C27         S
# 341          342         1       1  ...  263.0000  C23 C25 C27         S
# 438          439         0       1  ...  263.0000  C23 C25 C27         S
# 527          528         0       1  ...  221.7792          C95         S
# 689          690         1       1  ...  211.3375           B5         S
# 730          731         1       1  ...  211.3375           B5         S
# 779          780         1       1  ...  211.3375           B3         S
#
# [8 rows x 12 columns]
## 缺失数据处理
# Build the frame with a float dtype up front: NaN is a float, and assigning it
# into integer columns relies on silent upcasting, which pandas has deprecated.
t = pd.DataFrame(
    np.arange(12, dtype=float).reshape(3, 4),
    index=list("abc"),
    columns=list("WXYZ"),
)
t.loc['b', 'W':'X'] = np.nan  # blank out W and X in row b
t.loc['c', :] = np.nan        # blank out all of row c
print(t)
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  NaN  NaN  6.0  7.0
# c  NaN  NaN  NaN  NaN
### 判断数据是否为NaN
# Uses the frame `t` (with NaN values) built in the missing-data example

# True where a value is NaN
print(t.isnull())
#        W      X      Y      Z
# a  False  False  False  False
# b   True   True  False  False
# c   True   True   True   True

# True where a value is NOT NaN
print(t.notnull())
#        W      X      Y      Z
# a   True   True   True   True
# b  False  False   True   True
# c  False  False  False  False
### 删除NaN所在的行列
# Uses the frame `t` (with NaN values) built in the missing-data example

# how="any": drop a row if ANY of its values is NaN
print(t.dropna(how="any"))
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0

# how="all": drop a row only if ALL of its values are NaN
print(t.dropna(how="all"))
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  NaN  NaN  6.0  7.0
### 填充NaN
# Uses the frame `t` (with NaN values) built in the missing-data example

# Replace every NaN with a constant
print(t.fillna(0))
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  0.0  0.0  6.0  7.0
# c  0.0  0.0  0.0  0.0

# Replace NaN with the mean of its own column (mean() skips NaN values)
print(t.fillna(t.mean()))
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  0.0  1.0  6.0  7.0
# c  0.0  1.0  4.0  5.0

# Fill a single column with that column's mean
print(t["W"].fillna(t["W"].mean()))
# a    0.0
# b    0.0
# c    0.0
# Name: W, dtype: float64

技术
下载桌面版
GitHub
百度网盘(提取码:draw)
Gitee
云服务器优惠
阿里云优惠券
腾讯云优惠券
华为云优惠券
站点信息
问题反馈
邮箱:[email protected]
QQ群:766591547
关注微信