import pandas as pd
import numpy as np
from pandas import Series, DataFrame66 Intro to PD
66.1 Series
66.1.1 Creation
obj = pd.Series([4, 7, -5, 3])
type(obj)
obj0 4
1 7
2 -5
3 3
dtype: int64
# Array Representation
obj.array
type(obj.array)
# Index
obj.index
type(obj.index)pandas.core.indexes.range.RangeIndex
Series with index
obj2 = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])
obj2
# Index
obj2.indexIndex(['d', 'b', 'a', 'c'], dtype='object')
From Dict
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
obj3 = pd.Series(sdata)
obj3Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
Back to Dict, List, etc.
obj3.to_dict()
obj3.to_list()[35000, 71000, 16000, 5000]
66.1.2 Subset Series
# Position
obj2[0]
# Index
obj2["d"]
# list of indices
obj2[["d", "a"]]
# Logical
obj2[obj2 > 0]
# Assign
obj2["c"] = 10
obj2/var/folders/70/7wmmf6t55cb84bfx9g1c1k1m0000gn/T/ipykernel_68990/748481342.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
obj2[0]
d 4
b 7
a -5
c 10
dtype: int64
66.1.3 Vectorized Operation
obj2 * 2d 8
b 14
a -10
c 20
dtype: int64
Series is as a fixed-length, ordered dictionary
"b" in obj2True
66.1.4 Missing Value
x = pd.Series([1, 2, None])
x
# Check NA
x.isna()
x.notna()0 True
1 True
2 False
dtype: bool
66.1.5 Auto-align arithmatic
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(sdata, index=states)obj3
obj4California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
Data alignment features
obj3 + obj4California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
66.1.6 name Attribute
obj4.name = "population"
obj4.index.name = "state"
obj4state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
66.2 DataFrame
See Comparison with R data.frame
it can be thought of as a dictionary of Series all sharing the same index.
66.2.1 Creation
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
"year": [2000, 2001, 2002, 2001, 2002, 2003],
"pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
type(frame)
frame| state | year | pop | |
|---|---|---|---|
| 0 | Ohio | 2000 | 1.5 |
| 1 | Ohio | 2001 | 1.7 |
| 2 | Ohio | 2002 | 3.6 |
| 3 | Nevada | 2001 | 2.4 |
| 4 | Nevada | 2002 | 2.9 |
| 5 | Nevada | 2003 | 3.2 |
With specified column arrangement
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2
# Column names
frame2.columns.to_list()['year', 'state', 'pop', 'debt']
# Shape
frame.shape
# Head
frame.head()
# Tail
frame.tail(3)| state | year | pop | |
|---|---|---|---|
| 3 | Nevada | 2001 | 2.4 |
| 4 | Nevada | 2002 | 2.9 |
| 5 | Nevada | 2003 | 3.2 |
Nested dictionary
populations = {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
"Nevada": {2001: 2.4, 2002: 2.9}}
pd.DataFrame(populations)| Ohio | Nevada | |
|---|---|---|
| 2000 | 1.5 | NaN |
| 2001 | 1.7 | 2.4 |
| 2002 | 3.6 | 2.9 |
66.2.2 Rows & Colums
frame.shape(6, 3)
frame.columnsIndex(['state', 'year', 'pop'], dtype='object')
66.2.3 Subset
frame2.head()| year | state | pop | debt | |
|---|---|---|---|---|
| 0 | 2000 | Ohio | 1.5 | NaN |
| 1 | 2001 | Ohio | 1.7 | NaN |
| 2 | 2002 | Ohio | 3.6 | NaN |
| 3 | 2001 | Nevada | 2.4 | NaN |
| 4 | 2002 | Nevada | 2.9 | NaN |
66.2.4 Select -> [[
R
select(df, col1, col2)Py
df[['col1', 'col2']]Select 1 column
# Quote or unquote
frame2["state"]
frame2["state"] is frame2.state
type(frame2.state) # Seriespandas.core.series.Series
Select multiple column
frame2sub = frame2[["state", "pop"]]
type(frame2sub) # DF
frame2sub| state | pop | |
|---|---|---|
| 0 | Ohio | 1.5 |
| 1 | Ohio | 1.7 |
| 2 | Ohio | 3.6 |
| 3 | Nevada | 2.4 |
| 4 | Nevada | 2.9 |
| 5 | Nevada | 3.2 |
66.2.5 Slice
R
slice(df, 1:10)Py
df.iloc[:9].iloc: integer location
frame.iloc[:3] # First 3 rows| state | year | pop | |
|---|---|---|---|
| 0 | Ohio | 2000 | 1.5 |
| 1 | Ohio | 2001 | 1.7 |
| 2 | Ohio | 2002 | 3.6 |
# Rows, Colums
frame.iloc[[0,1], [0, 2]]| state | pop | |
|---|---|---|
| 0 | Ohio | 1.5 |
| 1 | Ohio | 1.7 |
.loc: Location by Labels
frame.loc[[0, 1], ["state", "year"]]| state | year | |
|---|---|---|
| 0 | Ohio | 2000 |
| 1 | Ohio | 2001 |
Slicing also works
frame.loc[0:2, "state":"pop"]| state | year | pop | |
|---|---|---|---|
| 0 | Ohio | 2000 | 1.5 |
| 1 | Ohio | 2001 | 1.7 |
| 2 | Ohio | 2002 | 3.6 |
66.2.6 Filter -> query
R
filter(df, col1 == 1, col2 == 1)Py
df.query('col1 == 1 & col2 == 1')frame2
frame2[frame2.state == "Ohio"]
# Or
frame2.query('state == "Ohio"')
frame2.query('pop > 1.5')| year | state | pop | debt | |
|---|---|---|---|---|
| 1 | 2001 | Ohio | 1.7 | NaN |
| 2 | 2002 | Ohio | 3.6 | NaN |
| 3 | 2001 | Nevada | 2.4 | NaN |
| 4 | 2002 | Nevada | 2.9 | NaN |
| 5 | 2003 | Nevada | 3.2 | NaN |
66.2.7 Mutate -> assign
R: mutate(df, c=a-b)
Py: df.assign(c=df['a']-df['b'])
frame2.assign(pop2 = frame2["pop"] * 2)| year | state | pop | debt | pop2 | |
|---|---|---|---|---|---|
| 0 | 2000 | Ohio | 1.5 | NaN | 3.0 |
| 1 | 2001 | Ohio | 1.7 | NaN | 3.4 |
| 2 | 2002 | Ohio | 3.6 | NaN | 7.2 |
| 3 | 2001 | Nevada | 2.4 | NaN | 4.8 |
| 4 | 2002 | Nevada | 2.9 | NaN | 5.8 |
| 5 | 2003 | Nevada | 3.2 | NaN | 6.4 |
frame2["debt"] = np.arange(6.)
frame2| year | state | pop | debt | |
|---|---|---|---|---|
| 0 | 2000 | Ohio | 1.5 | 0.0 |
| 1 | 2001 | Ohio | 1.7 | 1.0 |
| 2 | 2002 | Ohio | 3.6 | 2.0 |
| 3 | 2001 | Nevada | 2.4 | 3.0 |
| 4 | 2002 | Nevada | 2.9 | 4.0 |
| 5 | 2003 | Nevada | 3.2 | 5.0 |
Assign by index of series
val = pd.Series([-1.2, -1.5, -1.7], index=[2, 4, 5])
val2 -1.2
4 -1.5
5 -1.7
dtype: float64
frame2.debt = val
frame2| year | state | pop | debt | |
|---|---|---|---|---|
| 0 | 2000 | Ohio | 1.5 | NaN |
| 1 | 2001 | Ohio | 1.7 | NaN |
| 2 | 2002 | Ohio | 3.6 | -1.2 |
| 3 | 2001 | Nevada | 2.4 | NaN |
| 4 | 2002 | Nevada | 2.9 | -1.5 |
| 5 | 2003 | Nevada | 3.2 | -1.7 |
frame2["pop2"] = frame2["pop"] * 2
frame2| year | state | pop | debt | pop2 | |
|---|---|---|---|---|---|
| 0 | 2000 | Ohio | 1.5 | NaN | 3.0 |
| 1 | 2001 | Ohio | 1.7 | NaN | 3.4 |
| 2 | 2002 | Ohio | 3.6 | -1.2 | 7.2 |
| 3 | 2001 | Nevada | 2.4 | NaN | 4.8 |
| 4 | 2002 | Nevada | 2.9 | -1.5 | 5.8 |
| 5 | 2003 | Nevada | 3.2 | -1.7 | 6.4 |
66.2.8 Rename
rename(df, col_one = col1)df.rename(columns={'col1': 'col_one'})frame2.rename(columns = {'year': 'yr'})| yr | state | pop | debt | pop2 | |
|---|---|---|---|---|---|
| 0 | 2000 | Ohio | 1.5 | NaN | 3.0 |
| 1 | 2001 | Ohio | 1.7 | NaN | 3.4 |
| 2 | 2002 | Ohio | 3.6 | -1.2 | 7.2 |
| 3 | 2001 | Nevada | 2.4 | NaN | 4.8 |
| 4 | 2002 | Nevada | 2.9 | -1.5 | 5.8 |
| 5 | 2003 | Nevada | 3.2 | -1.7 | 6.4 |
66.2.9 Summarize
frame2.describe()| year | pop | debt | pop2 | |
|---|---|---|---|---|
| count | 6.000000 | 6.000000 | 3.000000 | 6.000000 |
| mean | 2001.500000 | 2.550000 | -1.466667 | 5.100000 |
| std | 1.048809 | 0.836062 | 0.251661 | 1.672124 |
| min | 2000.000000 | 1.500000 | -1.700000 | 3.000000 |
| 25% | 2001.000000 | 1.875000 | -1.600000 | 3.750000 |
| 50% | 2001.500000 | 2.650000 | -1.500000 | 5.300000 |
| 75% | 2002.000000 | 3.125000 | -1.350000 | 6.250000 |
| max | 2003.000000 | 3.600000 | -1.200000 | 7.200000 |
66.2.10 Group by, Summarize
frame2.head()| year | state | pop | debt | pop2 | |
|---|---|---|---|---|---|
| 0 | 2000 | Ohio | 1.5 | NaN | 3.0 |
| 1 | 2001 | Ohio | 1.7 | NaN | 3.4 |
| 2 | 2002 | Ohio | 3.6 | -1.2 | 7.2 |
| 3 | 2001 | Nevada | 2.4 | NaN | 4.8 |
| 4 | 2002 | Nevada | 2.9 | -1.5 | 5.8 |
frame2.groupby('state').agg({'pop': 'mean'})| pop | |
|---|---|
| state | |
| Nevada | 2.833333 |
| Ohio | 2.266667 |
66.2.11 Count
frame2.value_counts("year")year
2001 2
2002 2
2000 1
2003 1
Name: count, dtype: int64
frame2.value_counts(["year", "state"], sort=True)year state
2000 Ohio 1
2001 Nevada 1
Ohio 1
2002 Nevada 1
Ohio 1
2003 Nevada 1
Name: count, dtype: int64