import pandas as pd df = pd.DataFrame({'key1':['a', 'a', 'b', 'b', 'a'], 'key2':['one', 'two', 'one', 'two', 'one'], 'data1':np.random.randn(5), 'data2':np.random.randn(5)}) df data1 data2 key1 key2 0-0.4106730.519378 a one 1-2.1207930.199074 a two 20.642216-0.143671 b one 30.975133-0.592994 b two 4-1.017495-0.530459 a one
grouped = df['data1'].groupby(df['key1'])
grouped.mean() key1 a -1.182987 b 0.808674 dtype: float64
means = df['data1'].groupby([df['key1'], df['key2']]).mean() means.unstack() key2 one two key1 a -0.714084-2.120793 b 0.6422160.975133
选取两个dataframe中列公共的元素
有df1.ID, df2.ID 两列,我需要选取两列中都有的元素。 可以用如下方法:
1 2 3 4 5 6 7
import pandas as pd m_df=pd.merge(df1,df2,how="inner",on="ID") ind=m_df.ID df1.set_index('ID',inplace=True) df2.set_index('ID',inplace=True) common_df1=df1.loc[ind,:] common_df2=df2.loc[ind,:]
a b 2011-01-0100:00:001.883381-0.416629 2011-01-0101:00:000.149948-1.782170 2011-01-0102:00:00-0.4076040.314168 2011-01-0103:00:001.452354 NaN 2011-01-0104:00:00-1.224869-0.947457 2011-01-0105:00:000.4983260.070416 2011-01-0106:00:000.401665 NaN 2011-01-0107:00:00-0.0197660.533641 2011-01-0108:00:00-1.101303-1.408561 2011-01-0109:00:001.671795-0.764629
import numpy as np index = df['b'].index[df['b'].apply(np.isnan)]
df['a'].ix[index[0]] >>> 1.452354
df_index = df.index.values.tolist() [df_index.index(i) for i in index] >>> [3, 6]
print"Drop all rows that have any NaN values:" print"Data size after filtering:" print df.dropna().shape print df.dropna().head(10)
print"Drop only if all columns are NaN:" print"Data size after filtering:" print df.dropna(how='all').shape print df.dropna(how='all').head(10)
print"Drop rows who do not have at least six values that are not NaN" print"Data size after filtering:" print df.dropna(thresh=6).shape print df.dropna(thresh=6).head(10)
print"Drop only if NaN in specific column:" print"Data size after filtering:" print df.dropna(subset=['closePrice']).shape print df.dropna(subset=['closePrice']).head(10)