Copyimport pandas as pdimport numpy as npurl = ('https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv')tips = pd.read_csv(url)output = tips.head()
Output:
Copytotal_bill tip sex smoker day time size0 16.99 1.01 Female No Sun Dinner 21 10.34 1.66 Male No Sun Dinner 32 21.01 3.50 Male No Sun Dinner 33 23.68 3.31 Male No Sun Dinner 24 24.59 3.61 Female No Sun Dinner 4
sql 语句: SELECT total_bill, tip, smoker, time FROM tips LIMIT 5;
。
Copyoutput = tips[['total_bill', 'tip', 'smoker', 'time']].head(5)
Output:
Copytotal_bill tip smoker time0 16.99 1.01 No Dinner1 10.34 1.66 No Dinner2 21.01 3.50 No Dinner3 23.68 3.31 No Dinner4 24.59 3.61 No Dinner
sql 语句: SELECT * FROM tips WHERE time = 'Dinner' LIMIT 5;
Copyoutput = tips[tips['time'] == 'Dinner'].head(5)# 或者output = tips.query("time == 'Dinner'").head(5)
Output:
Copytotal_bill tip sex smoker day time size0 16.99 1.01 Female No Sun Dinner 21 10.34 1.66 Male No Sun Dinner 32 21.01 3.50 Male No Sun Dinner 33 23.68 3.31 Male No Sun Dinner 24 24.59 3.61 Female No Sun Dinner 4
==
、 大于 >
、 大于等于 >=
、小于等于 <=
、不等于 !=
==
sql 语句:SELECT * FROM tips WHERE time = 'Dinner';
。
Copyoutput = tips[(tips['time'] == 'Dinner')]
>
sql 语句:SELECT * FROM tips WHERE tip > 5.00;
。
Copyoutput = tips[(tips['tip'] > 5.00)]
>=
sql 语句:SELECT * FROM tips WHERE tip >= 5.00;
。
Copyoutput = tips[(tips['size'] >= 5)]
<=
sql 语句:SELECT * FROM tips WHERE tip <= 5.00;
。
Copyoutput = tips[(tips['size'] <= 5)]
!=
sql 语句:SELECT * FROM tips WHERE tip <> 5.00;
。
Copyoutput = tips[(tips['size'] != 5)]
&
、或 |
、非 -
&
sql 语句:SELECT * FROM tips WHERE time = 'Dinner' AND tip > 5.00;
Copyoutput = tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
|
sql 语句:SELECT * FROM tips WHERE size >= 5 OR total_bill > 45;
。
Copyoutput = tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
-
sql 语句:SELECT * FROM tips WHERE not (size <> 5 AND size > 4);
Copyoutput = df[-((df['size'] != 5) & (df['size'] > 4))]
这里重新定义一个包含 NaN
数据的 DataFrame
。
Copyframe = pd.DataFrame({ 'col1': ['A', 'B', np.NaN, 'C', 'D'], 'col2': ['F', np.NaN, 'G', 'H', 'I'] })output = frame
Output:
Copycol1 col20 A F1 B NaN2 NaN G3 C H4 D I
sql 语句:SELECT * FROM frame WHERE col2 IS NULL;
。
Copyoutput = frame[frame['col2'].isna()]
Output:
Copycol1 col21 B NaN
sql 语句:SELECT * FROM frame WHERE col1 IS NOT NULL;
。
Copyoutput = frame[frame['col1'].notna()]
Output:
Copycol1 col20 A F1 B NaN3 C H4 D I
sql 语句:SELECT * FROM tips WHERE siez in (5, 6);
。
Copyoutput = tips[tips['size'].isin([2, 5])]
sql 语句:SELECT * FROM tips WHERE time like 'Din%';
。
Copyoutput = tips[tips.time.str.contains('Din*')]
sql 语句:SELECT sex, count(*) FROM tips GROUP BY sex;
Copyoutput = tips.groupby('sex').size()# 获取相应的结果output['Male']output['Female']
Copyoutput = tips.groupby('sex').count()# 获取相应的结果output['tip']['Female']
Copyoutput = tips.groupby('sex')['total_bill'].count()# 获取相应的结果output['Male']output['Female']
sql 语句:SELECT day, AVG(tip), COUNT(*) FROM tips GROUP BY day;
Copyoutput = tips.groupby('day').agg({'tip': np.mean, 'day': np.size})# 获取相应的结果output['day']['Fri']output['tip']['Fri']
sql 语句:SELECT smoker, day, COUNT(*), AVG(tip) FROM tips GROUP BY smoker, day;
Copyoutput = tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]})# 获取相应的结果output['tip']['size']['No']['Fri']
sql 语句:SELECT tip, count(distinct sex) FROM tips GROUP BY tip;
Copyoutput = tips.groupby('tip').agg({'sex': pd.Series.nunique})
定义两个 DataFrame。
Copydf1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], 'value': np.random.randn(4)})df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], 'value': np.random.randn(4)})
sql 语句:SELECT * FROM df1 INNER JOIN df2 ON df1.key = df2.key;
Copyoutput = pd.merge(df1, df2, on='key')# 或indexed_df2 = df2.set_index('key')pd.merge(df1, indexed_df2, left_on='key', right_index=True)
sql 语句:SELECT * FROM df1 LEFT OUTER JOIN df2 ON df1.key = df2.key;
Copyoutput = pd.merge(df1, df2, on='key', how='left')# 或output = df1.join(df2, on='key', how='left')
sql 语句:SELECT * FROM df1 RIGHT OUTER JOIN df2 ON df1.key = df2.key;
Copyoutput = pd.merge(df1, df2, on='key', how='right')
sql 语句:SELECT * FROM df1 FULL OUTER JOIN df2 ON df1.key = df2.key;
Copyoutput = pd.merge(df1, df2, on='key', how='outer')
Copydf1 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'], 'rank': range(1, 4)})df2 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'], 'rank': [1, 4, 5]})
sql 语句:SELECT city, rank FROM df1 UNION ALL SELECT city, rank FROM df2;
Copyoutput = pd.concat([df1, df2])
sql 语句:SELECT city, rank FROM df1 UNION SELECT city, rank FROM df2;
Copyoutput = pd.concat([df1, df2]).drop_duplicates()
sql 语句:SELECT DISTINCT sex FROM tips;
Copyoutput = tips.drop_duplicates(subset=['sex'], keep='first', inplace=False)
sql 语句:SELECT total_bill AS total, sex AS xes FROM tips;
Copyoutput = tips.rename(columns={'total_bill': 'total', 'sex': 'xes'}, inplace=False)
sql 语句:SELECT * FROM tips ORDER BY tip DESC LIMIT 10 OFFSET 5;
Copyoutput = tips.nlargest(10 + 5, columns='tip').tail(10)
sql 语句:
CopySELECT * FROM ( SELECT t.*, ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn FROM tips t)WHERE rn < 3ORDER BY day, rn;
Copyoutput = tips.assign(rn=tips.sort_values(['total_bill'], ascending=False). groupby(['day']).cumcount() + 1). query('rn < 3'). sort_values(['day', 'rn'])
sql 语句:UPDATE tips SET tip = tip*2 WHERE tip < 2;
Copyoutput = tips.loc[tips['tip'] < 2, 'tip'] *= 2
sql 语句:DELETE FROM tips WHERE tip > 9;
Copyoutput = tips = tips.loc[tips['tip'] <= 9]
联系客服