交易者论坛

回帖：数据清理
# pandas data-cleaning cheat sheet.
# Each statement is a standalone example. The snippets expect the caller to
# have `pd`/`np` imported and the frames/placeholders (`df`, `dd`, `data`,
# `col`, `col1`, `col2`, `daily_index`, `df_work_day`) already defined.

# --- Rounding: keep N decimals; ties are rounded half to even ---
df.round(2)  # every numeric column
df.round({'A': 1, 'C': 2})  # per-column precision

# --- Selecting columns and rows ---
# Bracket access and attribute access refer to the same column
# (this line is a self-assignment that demonstrates both spellings).
df['Name'] = df.Name
df[df.index == 'Jude']  # filtering on the index must go through .index
df[df[col] > 0.5]  # rows whose `col` value is greater than 0.5

# Several conditions: wrap each one in parentheses and combine with & / |.
df[(df['team'] == 'A') & (df['Q1'] > 80) & df.utype.isin(['老客', '老访客'])]

# Rows whose `order` value is missing.
df[df.order.isnull()]

# Equivalent of SQL "WHERE team IN (...)". isin() takes ONE list-like
# argument; passing the values as separate positional arguments raises.
df[df.team.isin(['A', 'B'])]
df[(df.team == 'B') & (df.Q1 == 17)]
df[~(df['team'] == 'A') | (df['Q1'] > 80)]  # NOT / OR: team is not 'A', or Q1 > 80
df[df.Name.str.contains('张')]  # substring match

# --- Sorting ---
df.sort_values(col1)  # sort by col1, ascending by default
df[col1].sort_values()  # sort only that column; the result is a Series
df.sort_values(col2, ascending=False)  # sort by col2, descending
# Sort by col1 ascending first, then by col2 descending.
df.sort_values([col1, col2], ascending=[True, False])

# --- Encoding and plotting ---
df2 = pd.get_dummies(df, prefix='t_')  # expand categorical columns into indicator columns
# Plot after moving a column into the index. set_index() requires the key
# column(s).
# NOTE(review): the original snippet was `s.set_index().plot()` with no key;
# `df` and `col1` are placeholders here - confirm the intended object/column.
df.set_index(col1).plot()

# --- Multi-level index ---
dd.set_index(['utype', 'site_id', 'p_day'], inplace=True)
dd.sort_index(inplace=True)  # sort by the index
dd.loc['新访客', 2, '2019-06-22'].plot.barh()  # give the index levels in order inside .loc

# --- Slicing ---
# First 100 rows. A bare integer such as df[100] is treated as a column
# label, not as a row position.
df[:100]
# Keep only the listed columns (all rows from index label 0 onward).
df1 = df.loc[0:, ['设计师ID', '姓名']]

# --- Binning ---
# Bucket ages into 5 intervals defined by explicit (not equal-width) edges
# and attach a label to each interval.
ages = np.array([1, 5, 10, 40, 36, 12, 58, 62, 77, 89, 100, 18, 20, 25, 30, 32])
pd.cut(ages, [0, 5, 20, 30, 50, 100], labels=[u"婴儿", u"青年", u"中年", u"壮年", u"老年"])

daily_index.difference(df_work_day.index)  # entries of daily_index absent from the other index

# --- Converting to plain Python objects ---
df.index.name  # name of the index (str)
df.columns.tolist()
df.values.tolist()
df.总人口.values.tolist()

# --- Applying functions ---
data.apply(np.mean)  # apply np.mean to every column
data.apply(np.max, axis=1)  # apply np.max to every row

# --- Adding / removing columns and rows ---
df.insert(1, 'three', 12, allow_duplicates=False)  # insert a column: (position, name, value)
df.pop('class')  # remove a column (the removed column is returned)
# Append one row. DataFrame.append() was removed in pandas 2.0 and never
# modified the frame in place, so build the result with pd.concat and bind it.
df = pd.concat([df, pd.DataFrame({'one': 2, 'two': 3, 'three': 4.4}, index=['f'])], sort=True)

白山回帖于21-5-21 18:34

下一楼›：# 指定新列iris.assign(sepal_ratio=iris['SepalWidth'] / iris['SepalLength'] ..(白山)
‹上一楼：数据清理df.columns = ['a','b','c'] # 重命名列名df.columns = df.col ..(白山)

查看全部回帖(13)

«返回主帖