import pandas as pd
import numpy as np
from pandas import Series, DataFrame
42 Intro to PD
42.1 Series
42.1.1 Creation
= pd.Series([4, 7, -5, 3])
obj type(obj)
obj
0 4
1 7
2 -5
3 3
dtype: int64
# Array Representation
obj.arraytype(obj.array)
# Index
obj.indextype(obj.index)
pandas.core.indexes.range.RangeIndex
Series with index
= pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])
obj2
obj2
# Index
obj2.index
Index(['d', 'b', 'a', 'c'], dtype='object')
From Dict
= {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
sdata = pd.Series(sdata)
obj3 obj3
Ohio 35000
Texas 71000
Oregon 16000
Utah 5000
dtype: int64
Back to Dict
, List
, etc.
obj3.to_dict() obj3.to_list()
[35000, 71000, 16000, 5000]
42.1.2 Subset Series
# Position
0]
obj2[
# Index
"d"]
obj2[
# list of indices
"d", "a"]]
obj2[[
# Logical
> 0]
obj2[obj2
# Assign
"c"] = 10
obj2[ obj2
/var/folders/70/7wmmf6t55cb84bfx9g1c1k1m0000gn/T/ipykernel_19261/748481342.py:2: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
obj2[0]
d 4
b 7
a -5
c 10
dtype: int64
42.1.3 Vectorized Operation
* 2 obj2
d 8
b 14
a -10
c 20
dtype: int64
Series is as a fixed-length, ordered dictionary
"b" in obj2
True
42.1.4 Missing Value
= pd.Series([1, 2, None])
x
x# Check NA
x.isna() x.notna()
0 True
1 True
2 False
dtype: bool
42.1.5 Auto-align arithmatic
= ["California", "Ohio", "Oregon", "Texas"]
states = pd.Series(sdata, index=states) obj4
obj3 obj4
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
Data alignment features
+ obj4 obj3
California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
42.1.6 name
Attribute
= "population"
obj4.name = "state"
obj4.index.name obj4
state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
42.2 DataFrame
See Comparison with R data.frame
it can be thought of as a dictionary of Series all sharing the same index.
42.2.1 Creation
= {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
data "year": [2000, 2001, 2002, 2001, 2002, 2003],
"pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
= pd.DataFrame(data)
frame type(frame)
frame
state | year | pop | |
---|---|---|---|
0 | Ohio | 2000 | 1.5 |
1 | Ohio | 2001 | 1.7 |
2 | Ohio | 2002 | 3.6 |
3 | Nevada | 2001 | 2.4 |
4 | Nevada | 2002 | 2.9 |
5 | Nevada | 2003 | 3.2 |
With specified column arrangement
= pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2
frame2
# Column names
frame2.columns.to_list()
['year', 'state', 'pop', 'debt']
# Shape
frame.shape# Head
frame.head()# Tail
3) frame.tail(
state | year | pop | |
---|---|---|---|
3 | Nevada | 2001 | 2.4 |
4 | Nevada | 2002 | 2.9 |
5 | Nevada | 2003 | 3.2 |
Nested dictionary
= {"Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
populations "Nevada": {2001: 2.4, 2002: 2.9}}
pd.DataFrame(populations)
Ohio | Nevada | |
---|---|---|
2000 | 1.5 | NaN |
2001 | 1.7 | 2.4 |
2002 | 3.6 | 2.9 |
42.2.2 Rows & Colums
frame.shape
(6, 3)
frame.columns
Index(['state', 'year', 'pop'], dtype='object')
42.2.3 Subset
frame2.head()
year | state | pop | debt | |
---|---|---|---|---|
0 | 2000 | Ohio | 1.5 | NaN |
1 | 2001 | Ohio | 1.7 | NaN |
2 | 2002 | Ohio | 3.6 | NaN |
3 | 2001 | Nevada | 2.4 | NaN |
4 | 2002 | Nevada | 2.9 | NaN |
42.2.4 Select -> [[
R
select(df, col1, col2)
Py
'col1', 'col2']] df[[
Select 1 column
# Quote or unquote
"state"]
frame2[
"state"] is frame2.state
frame2[
type(frame2.state) # Series
pandas.core.series.Series
Select multiple column
= frame2[["state", "pop"]]
frame2sub type(frame2sub) # DF
frame2sub
state | pop | |
---|---|---|
0 | Ohio | 1.5 |
1 | Ohio | 1.7 |
2 | Ohio | 3.6 |
3 | Nevada | 2.4 |
4 | Nevada | 2.9 |
5 | Nevada | 3.2 |
42.2.5 Slice
R
slice(df, 1:10)
Py
9] df.iloc[:
.iloc
: integer location
3] # First 3 rows frame.iloc[:
state | year | pop | |
---|---|---|---|
0 | Ohio | 2000 | 1.5 |
1 | Ohio | 2001 | 1.7 |
2 | Ohio | 2002 | 3.6 |
# Rows, Colums
0,1], [0, 2]] frame.iloc[[
state | pop | |
---|---|---|
0 | Ohio | 1.5 |
1 | Ohio | 1.7 |
.loc
: Location by Labels
0, 1], ["state", "year"]] frame.loc[[
state | year | |
---|---|---|
0 | Ohio | 2000 |
1 | Ohio | 2001 |
Slicing also works
0:2, "state":"pop"] frame.loc[
state | year | pop | |
---|---|---|---|
0 | Ohio | 2000 | 1.5 |
1 | Ohio | 2001 | 1.7 |
2 | Ohio | 2002 | 3.6 |
42.2.6 Filter -> query
R
filter(df, col1 == 1, col2 == 1)
Py
'col1 == 1 & col2 == 1') df.query(
frame2
== "Ohio"]
frame2[frame2.state # Or
'state == "Ohio"')
frame2.query(
'pop > 1.5') frame2.query(
year | state | pop | debt | |
---|---|---|---|---|
1 | 2001 | Ohio | 1.7 | NaN |
2 | 2002 | Ohio | 3.6 | NaN |
3 | 2001 | Nevada | 2.4 | NaN |
4 | 2002 | Nevada | 2.9 | NaN |
5 | 2003 | Nevada | 3.2 | NaN |
42.2.7 Mutate -> assign
R
: mutate(df, c=a-b)
Py
: df.assign(c=df['a']-df['b'])
= frame2["pop"] * 2) frame2.assign(pop2
year | state | pop | debt | pop2 | |
---|---|---|---|---|---|
0 | 2000 | Ohio | 1.5 | NaN | 3.0 |
1 | 2001 | Ohio | 1.7 | NaN | 3.4 |
2 | 2002 | Ohio | 3.6 | NaN | 7.2 |
3 | 2001 | Nevada | 2.4 | NaN | 4.8 |
4 | 2002 | Nevada | 2.9 | NaN | 5.8 |
5 | 2003 | Nevada | 3.2 | NaN | 6.4 |
"debt"] = np.arange(6.)
frame2[ frame2
year | state | pop | debt | |
---|---|---|---|---|
0 | 2000 | Ohio | 1.5 | 0.0 |
1 | 2001 | Ohio | 1.7 | 1.0 |
2 | 2002 | Ohio | 3.6 | 2.0 |
3 | 2001 | Nevada | 2.4 | 3.0 |
4 | 2002 | Nevada | 2.9 | 4.0 |
5 | 2003 | Nevada | 3.2 | 5.0 |
Assign by index of series
= pd.Series([-1.2, -1.5, -1.7], index=[2, 4, 5])
val val
2 -1.2
4 -1.5
5 -1.7
dtype: float64
= val
frame2.debt frame2
year | state | pop | debt | |
---|---|---|---|---|
0 | 2000 | Ohio | 1.5 | NaN |
1 | 2001 | Ohio | 1.7 | NaN |
2 | 2002 | Ohio | 3.6 | -1.2 |
3 | 2001 | Nevada | 2.4 | NaN |
4 | 2002 | Nevada | 2.9 | -1.5 |
5 | 2003 | Nevada | 3.2 | -1.7 |
"pop2"] = frame2["pop"] * 2
frame2[ frame2
year | state | pop | debt | pop2 | |
---|---|---|---|---|---|
0 | 2000 | Ohio | 1.5 | NaN | 3.0 |
1 | 2001 | Ohio | 1.7 | NaN | 3.4 |
2 | 2002 | Ohio | 3.6 | -1.2 | 7.2 |
3 | 2001 | Nevada | 2.4 | NaN | 4.8 |
4 | 2002 | Nevada | 2.9 | -1.5 | 5.8 |
5 | 2003 | Nevada | 3.2 | -1.7 | 6.4 |
42.2.8 Rename
rename(df, col_one = col1)
={'col1': 'col_one'}) df.rename(columns
= {'year': 'yr'}) frame2.rename(columns
yr | state | pop | debt | pop2 | |
---|---|---|---|---|---|
0 | 2000 | Ohio | 1.5 | NaN | 3.0 |
1 | 2001 | Ohio | 1.7 | NaN | 3.4 |
2 | 2002 | Ohio | 3.6 | -1.2 | 7.2 |
3 | 2001 | Nevada | 2.4 | NaN | 4.8 |
4 | 2002 | Nevada | 2.9 | -1.5 | 5.8 |
5 | 2003 | Nevada | 3.2 | -1.7 | 6.4 |
42.2.9 Summarize
frame2.describe()
year | pop | debt | pop2 | |
---|---|---|---|---|
count | 6.000000 | 6.000000 | 3.000000 | 6.000000 |
mean | 2001.500000 | 2.550000 | -1.466667 | 5.100000 |
std | 1.048809 | 0.836062 | 0.251661 | 1.672124 |
min | 2000.000000 | 1.500000 | -1.700000 | 3.000000 |
25% | 2001.000000 | 1.875000 | -1.600000 | 3.750000 |
50% | 2001.500000 | 2.650000 | -1.500000 | 5.300000 |
75% | 2002.000000 | 3.125000 | -1.350000 | 6.250000 |
max | 2003.000000 | 3.600000 | -1.200000 | 7.200000 |
42.2.10 Group by, Summarize
frame2.head()
year | state | pop | debt | pop2 | |
---|---|---|---|---|---|
0 | 2000 | Ohio | 1.5 | NaN | 3.0 |
1 | 2001 | Ohio | 1.7 | NaN | 3.4 |
2 | 2002 | Ohio | 3.6 | -1.2 | 7.2 |
3 | 2001 | Nevada | 2.4 | NaN | 4.8 |
4 | 2002 | Nevada | 2.9 | -1.5 | 5.8 |
'state').agg({'pop': 'mean'}) frame2.groupby(
pop | |
---|---|
state | |
Nevada | 2.833333 |
Ohio | 2.266667 |
42.2.11 Count
"year") frame2.value_counts(
year
2001 2
2002 2
2000 1
2003 1
Name: count, dtype: int64
"year", "state"], sort=True) frame2.value_counts([
year state
2000 Ohio 1
2001 Nevada 1
Ohio 1
2002 Nevada 1
Ohio 1
2003 Nevada 1
Name: count, dtype: int64