Pandas Cheat Sheet for SQL Experts
If you know SQL, you already understand data manipulation. This guide maps pandas operations to their SQL equivalents so you can leverage your existing knowledge.
Quick Setup
import pandas as pd
import numpy as np
# Sample data for examples
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
'age': [25, 30, 35, 28, 32],
'department': ['Sales', 'Engineering', 'Sales', 'HR', 'Engineering'],
'salary': [50000, 80000, 60000, 55000, 85000]
})
SELECT: Choosing Columns
SQL
SELECT name, age
FROM employees;
Pandas
# Single column
df['name']
# Multiple columns
df[['name', 'age']]
# All columns
df
SQL
SELECT *
FROM employees;
Pandas
df # or df.copy()
SQL
SELECT name,
age * 2 AS double_age
FROM employees;
Pandas
df.assign(double_age=df['age'] * 2)[['name', 'double_age']]
# Or
df['double_age'] = df['age'] * 2
df[['name', 'double_age']]
WHERE: Filtering Rows
SQL
SELECT *
FROM employees
WHERE age > 30;
Pandas
df[df['age'] > 30]
# Or with query()
df.query('age > 30')
SQL
SELECT *
FROM employees
WHERE department = 'Sales'
AND age > 25;
Pandas
df[(df['department'] == 'Sales') & (df['age'] > 25)]
# Or with query()
df.query("department == 'Sales' and age > 25")
SQL
SELECT *
FROM employees
WHERE department IN ('Sales', 'HR');
Pandas
df[df['department'].isin(['Sales', 'HR'])]
# Or with query()
df.query("department in ['Sales', 'HR']")
& (and), | (or), ~ (not) in pandas, NOT and/or.
Always use parentheses around conditions!
ORDER BY: Sorting
SQL
SELECT *
FROM employees
ORDER BY age DESC;
Pandas
df.sort_values('age', ascending=False)
# Multiple columns
df.sort_values(['department', 'age'],
ascending=[True, False])
LIMIT / OFFSET: Row Selection
SQL
SELECT *
FROM employees
LIMIT 10;
Pandas
df.head(10)
# Or slicing
df[:10]
SQL
SELECT *
FROM employees
LIMIT 10 OFFSET 20;
Pandas
df.iloc[20:30]
GROUP BY: Aggregations
SQL
SELECT department,
COUNT(*) as count,
AVG(salary) as avg_salary
FROM employees
GROUP BY department;
Pandas
df.groupby('department').agg({
'name': 'count',
'salary': 'mean'
}).rename(columns={'name': 'count', 'salary': 'avg_salary'})
# Or simpler
df.groupby('department')['salary'].mean()
SQL
SELECT department,
MIN(age),
MAX(age),
SUM(salary)
FROM employees
GROUP BY department;
Pandas
df.groupby('department').agg({
'age': ['min', 'max'],
'salary': 'sum'
})
SQL
SELECT department,
COUNT(*)
FROM employees
GROUP BY department
HAVING COUNT(*) > 1;
Pandas
df.groupby('department').size().reset_index(name='count')
.query('count > 1')
# Or
df.groupby('department').filter(lambda x: len(x) > 1)
JOIN: Combining DataFrames
# Setup for join examples
df2 = pd.DataFrame({
'department': ['Sales', 'Engineering', 'Marketing'],
'budget': [100000, 200000, 150000]
})
SQL
SELECT e.*, d.budget
FROM employees e
INNER JOIN departments d
ON e.department = d.department;
Pandas
df.merge(df2, on='department', how='inner')
# Or with different column names
df.merge(df2, left_on='dept', right_on='department')
SQL
SELECT *
FROM employees e
LEFT JOIN departments d
ON e.department = d.department;
Pandas
df.merge(df2, on='department', how='left')
- INNER JOIN →
how='inner' - LEFT JOIN →
how='left' - RIGHT JOIN →
how='right' - FULL OUTER JOIN →
how='outer'
UNION / CONCAT: Stacking Data
SQL
SELECT * FROM employees_2023
UNION ALL
SELECT * FROM employees_2024;
Pandas
pd.concat([df_2023, df_2024], ignore_index=True)
# UNION (remove duplicates)
pd.concat([df_2023, df_2024]).drop_duplicates()
DISTINCT: Unique Values
SQL
SELECT DISTINCT department
FROM employees;
Pandas
df['department'].unique()
# As DataFrame
df[['department']].drop_duplicates()
# Count unique
df['department'].nunique()
CASE WHEN: Conditional Logic
SQL
SELECT name,
CASE
WHEN age < 30 THEN 'Young'
WHEN age < 40 THEN 'Middle'
ELSE 'Senior'
END as age_group
FROM employees;
Pandas
df['age_group'] = pd.cut(df['age'],
bins=[0, 30, 40, 100],
labels=['Young', 'Middle', 'Senior'])
# Or with np.select
conditions = [
df['age'] < 30,
df['age'] < 40,
]
choices = ['Young', 'Middle']
df['age_group'] = np.select(conditions, choices, default='Senior')
Window Functions: Ranking & Rolling
SQL
SELECT name, salary,
RANK() OVER (ORDER BY salary DESC) as rank
FROM employees;
Pandas
df['rank'] = df['salary'].rank(ascending=False, method='min')
SQL
SELECT name, salary,
RANK() OVER (
PARTITION BY department
ORDER BY salary DESC
) as dept_rank
FROM employees;
Pandas
df['dept_rank'] = df.groupby('department')['salary']
.rank(ascending=False, method='min')
SQL
SELECT date, value,
AVG(value) OVER (
ORDER BY date
ROWS BETWEEN 2 PRECEDING
AND CURRENT ROW
) as moving_avg
FROM metrics;
Pandas
df['moving_avg'] = df['value'].rolling(window=3).mean()
NULL Handling
SQL
SELECT *
FROM employees
WHERE age IS NULL;
Pandas
df[df['age'].isna()]
# IS NOT NULL
df[df['age'].notna()]
SQL
SELECT COALESCE(age, 0)
FROM employees;
Pandas
df['age'].fillna(0)
# Multiple strategies
df.fillna({'age': 0, 'salary': df['salary'].mean()})
Subqueries
SQL
SELECT *
FROM employees
WHERE salary > (
SELECT AVG(salary)
FROM employees
);
Pandas
avg_salary = df['salary'].mean()
df[df['salary'] > avg_salary]
# Or inline
df[df['salary'] > df['salary'].mean()]
SQL
SELECT *
FROM employees e1
WHERE salary = (
SELECT MAX(salary)
FROM employees e2
WHERE e1.department = e2.department
);
Pandas
max_salaries = df.groupby('department')['salary'].transform('max')
df[df['salary'] == max_salaries]
Common String Operations
SQL
SELECT UPPER(name) FROM employees;
SELECT LOWER(name) FROM employees;
SELECT CONCAT(first_name, ' ', last_name);
SELECT name FROM employees WHERE name LIKE 'A%';
Pandas
df['name'].str.upper()
df['name'].str.lower()
df['first_name'] + ' ' + df['last_name']
df[df['name'].str.startswith('A')]
df[df['name'].str.contains('pattern', regex=True)]
INSERT / UPDATE / DELETE Equivalents
SQL
INSERT INTO employees
VALUES ('Frank', 40, 'IT', 70000);
Pandas
new_row = pd.DataFrame([{
'name': 'Frank',
'age': 40,
'department': 'IT',
'salary': 70000
}])
df = pd.concat([df, new_row], ignore_index=True)
SQL
UPDATE employees
SET salary = salary * 1.1
WHERE department = 'Sales';
Pandas
df.loc[df['department'] == 'Sales', 'salary'] *= 1.1
SQL
DELETE FROM employees
WHERE age > 60;
Pandas
df = df[df['age'] <= 60]
# Or drop by index
df = df.drop(df[df['age'] > 60].index)
Performance Tips
1. Avoid Iterating Rows
❌ for i, row in df.iterrows(): ...
✅ Use vectorized operations: df['new'] = df['col1'] + df['col2']
2. Use query() for Complex Filters
❌ df[(df['a'] > 5) & (df['b'] == 'x') & (df['c'] < 10)]
✅ df.query("a > 5 and b == 'x' and c < 10")
3. Set Index for Frequent Lookups
If you frequently filter by a column, set it as index:
✅ df.set_index('id', inplace=True)
4. Use Categorical for Repeated Strings
✅ df['department'] = df['department'].astype('category')
Saves memory and speeds up groupby operations!
Common Gotchas
Chain Assignment Warning
❌ df[df['age'] > 30]['salary'] = 100000
✅ df.loc[df['age'] > 30, 'salary'] = 100000
Copy vs. View
Most operations return views, not copies. Use .copy() when you need independence:
✅ df_subset = df[df['age'] > 30].copy()
Index Alignment
Operations align on index, not position:
df1 + df2 aligns by index, not row number!
Reset index if needed: df.reset_index(drop=True)
Practice Exercises
Exercise 1: Complex Query
Translate to pandas:
SELECT department, AVG(salary) as avg_sal
FROM employees
WHERE age > 25
GROUP BY department
HAVING AVG(salary) > 60000
ORDER BY avg_sal DESC;
Solution
df[df['age'] > 25].groupby('department')['salary']
.mean()
.rename('avg_sal')
.reset_index()
.query('avg_sal > 60000')
.sort_values('avg_sal', ascending=False)
Exercise 2: Window Function
Add a column with each employee's salary percentile within their department.
Solution
df['dept_percentile'] = df.groupby('department')['salary']
.rank(pct=True)