import pandas as pd

Pandas Basics¶

Pandas is a high-level data manipulation tool (think of it as an enhanced NumPy) that makes it easy to work with tabular data.

Two tutorials:

DataFrames ¶

The DataFrame is a key data structure in Pandas (think of it as an enhanced 2-dimensional NumPy ndarray).

# Column-oriented data for the five BRICS countries: each key becomes a
# DataFrame column and each list holds that column's values, one per row.
# The variable is named `brics_data` rather than `dict` so the builtin `dict`
# type is not shadowed for the rest of the notebook.
brics_data = {
    "country": ["Brazil", "Russia", "India", "China", "South Africa"],
    "capital": ["Brasilia", "Moscow", "New Dehli", "Beijing", "Pretoria"],
    "area": [8.516, 17.10, 3.286, 9.597, 1.221],
    "population": [200.4, 143.5, 1252, 1357, 52.98],
}
brics = pd.DataFrame(brics_data)
brics

# row labels of the DataFrame (its index); by default a 0-based integer range
brics.index

RangeIndex(start=0, stop=5, step=1)

# replace the default integer row labels with short country codes;
# the list must contain exactly one label per row
brics.index = ["BR", "RU", "IN", "CH", "SA"]
brics

# column labels of the DataFrame
brics.columns

Index(['area', 'capital', 'country', 'population'], dtype='object')

# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
brics.describe()

Indexing DataFrames¶

# select columns by name; a list of names returns a DataFrame
brics[['area', 'capital']]

# select rows with a slice (here the rows at positions 1, 2 and 3)
brics[1:4]

# select by label with .loc: the first argument picks rows, the second picks columns
brics.loc[['RU','IN'], ['area', 'capital']]

# select by integer position with .iloc: the first two rows and the first two columns
brics.iloc[:2, :2]

# boolean indexing: keep only the rows whose area is greater than 4
brics[brics.area > 4]

# Iterating a DataFrame walks its column names, much like iterating a dict's keys.
for column_name in brics:
    column = brics[column_name]  # a single column comes back as a Series
    print(column_name)
    print(type(column))
    print(column)

area
<class 'pandas.core.series.Series'>
BR     8.516
RU    17.100
IN     3.286
CH     9.597
SA     1.221
Name: area, dtype: float64
capital
<class 'pandas.core.series.Series'>
BR     Brasilia
RU       Moscow
IN    New Dehli
CH      Beijing
SA     Pretoria
Name: capital, dtype: object
country
<class 'pandas.core.series.Series'>
BR          Brazil
RU          Russia
IN           India
CH           China
SA    South Africa
Name: country, dtype: object
population
<class 'pandas.core.series.Series'>
BR     200.40
RU     143.50
IN    1252.00
CH    1357.00
SA      52.98
Name: population, dtype: float64

Sorting DataFrames¶

# sort by labels rather than by values
# axis=0: order the rows by their row labels (descending here)
brics.sort_index(axis=0, ascending=False)

# axis=1: order the columns by their column names (descending here)
brics.sort_index(axis=1, ascending=False)

# sort the rows by the values in one column
brics.sort_values('population')

# sort by several columns: ordered by 'area' first, ties broken by 'population'
brics.sort_values(['area', 'population'])

Series ¶

# selecting a single column by name returns a Series
area = brics['area']
print(type(area))

<class 'pandas.core.series.Series'>

area # looks like a one-column DataFrame, but it is a Series

BR     8.516
RU    17.100
IN     3.286
CH     9.597
SA     1.221
Name: area, dtype: float64

# a one-column DataFrame (note the double brackets);
# compare a matrix of shape (n, 1) with a vector of shape (n,)
brics[['area']]

# convert a Series to a DataFrame
brics_area = pd.DataFrame(area)
brics_area

# build a Series from a plain array (area.values); the row labels are dropped
# and replaced by a default integer index
area2 = pd.Series(area.values)

area2

0     8.516
1    17.100
2     3.286
3     9.597
4     1.221
dtype: float64

# iterating over a Series yields its values, like iterating over an array
for v in area:
    print(v)

8.516
17.1
3.286
9.597
1.221

# iterate over the index to get the row labels instead of the values
for i in area.index:
    print(i)

BR
RU
IN
CH
SA

# look up a Series element by its index label (here the first label, 'BR')
area[area.index[0]]

8.516

# look up a Series element by integer position; .iloc is explicit about this,
# whereas plain area[0] on a label-indexed Series relies on a deprecated
# positional fallback
area.iloc[0]

8.516

# sorting works the same way as for a DataFrame: sort_index orders by label
area.sort_index()

BR     8.516
CH     9.597
IN     3.286
RU    17.100
SA     1.221
Name: area, dtype: float64

# sort by value, largest first
area.sort_values(ascending=False)

RU    17.100
CH     9.597
BR     8.516
IN     3.286
SA     1.221
Name: area, dtype: float64

Modify value¶

# assign values in place, much like with a NumPy array
brics[:1] = 0  # the scalar is broadcast across every column of the first row
brics

# Restore the first row. The values are paired with explicit column labels so
# the assignment does not depend on the DataFrame's column order.
# NOTE(review): 'Brazi' looks like a typo for 'Brazil'; it is kept as-is so the
# recorded outputs in this notebook still match. Confirm before changing.
brics.loc[brics.index[0], ['area', 'capital', 'country', 'population']] = [8.516, 'Brasilia', 'Brazi', 200.40]
brics

Change to numpy¶

# .values returns the data as a NumPy array, without the row labels or column names
np_brics = brics.values
np_brics

array([[8.516, 'Brasilia', 'Brazi', 200.4],
       [17.1, 'Moscow', 'Russia', 143.5],
       [3.286, 'New Dehli', 'India', 1252.0],
       [9.597, 'Beijing', 'China', 1357.0],
       [1.221, 'Pretoria', 'South Africa', 52.98]], dtype=object)

# to_numpy() gives the same array contents as .values
brics.to_numpy()

array([[8.516, 'Brasilia', 'Brazi', 200.4],
       [17.1, 'Moscow', 'Russia', 143.5],
       [3.286, 'New Dehli', 'India', 1252.0],
       [9.597, 'Beijing', 'China', 1357.0],
       [1.221, 'Pretoria', 'South Africa', 52.98]], dtype=object)

# for a Series, .values returns a 1-D NumPy array without the row labels
np_area = area.values
np_area

array([ 8.516, 17.1  ,  3.286,  9.597,  1.221])

# to_numpy() gives the same array contents as .values
area.to_numpy()

array([ 8.516, 17.1  ,  3.286,  9.597,  1.221])

# the array does not share storage with the DataFrame here, so writing to it
# leaves brics untouched
np_brics[0,0] = 0

np_brics # the array element is now 0

array([[0, 'Brasilia', 'Brazi', 200.4],
       [17.1, 'Moscow', 'Russia', 143.5],
       [3.286, 'New Dehli', 'India', 1252.0],
       [9.597, 'Beijing', 'China', 1357.0],
       [1.221, 'Pretoria', 'South Africa', 52.98]], dtype=object)

brics # still holds its original values

Pandas Advance¶

The operations above are also available in NumPy, so we do not actually need Pandas for them. Here are some features that are specific to Pandas.

Join¶

# SQL-style join: match rows of brics against rows of brics (a self-join)
# wherever their 'area' values are equal
pd.merge(left=brics, right=brics, on='area')

Append¶

# stack rows, like concatenating NumPy arrays along axis 0; pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0
pd.concat([brics, brics])

# append columns: join places the columns of another frame next to this one,
# matching rows by their index labels
brics_new = brics.copy()
# rename the copy's columns first, since join rejects overlapping column names
# unless suffixes are supplied
brics_new.columns = ['a', 'b', 'c', 'd']
brics.join(brics_new)

Group¶

By “group by” we are referring to a process involving one or more of the following steps:

Splitting the data into groups based on some criteria
Applying a function to each group independently
Combining the results into a data structure

# build a frame in which every row appears twice, so each group has two members
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
brics2 = pd.concat([brics, brics])

# step 1: split the rows into groups that share the same 'area' value
area_group = brics2.groupby('area')

# steps 2 and 3: apply an aggregation to each group and combine the results;
# every aggregation below follows the same pattern
area_group.count()

# restrict sum/mean to the numeric columns explicitly; otherwise recent pandas
# versions concatenate the strings in sum() and raise a TypeError in mean()
area_group.sum(numeric_only=True)

area_group.mean(numeric_only=True)

	area	population
count	5.000000	5.000000
mean	7.944000	601.176000
std	6.200557	645.261454
min	1.221000	52.980000
25%	3.286000	143.500000
50%	8.516000	200.400000
75%	9.597000	1252.000000
max	17.100000	1357.000000

	area	capital	country	population
0	8.516	Brasilia	Brazil	200.40
1	17.100	Moscow	Russia	143.50
2	3.286	New Dehli	India	1252.00
3	9.597	Beijing	China	1357.00
4	1.221	Pretoria	South Africa	52.98

	area	capital	country	population
BR	8.516	Brasilia	Brazi	200.4
RU	17.100	Moscow	Russia	143.5
CH	9.597	Beijing	China	1357.0

	area	capital	country	population
BR	0.000	0	0	0.00
RU	17.100	Moscow	Russia	143.50
IN	3.286	New Dehli	India	1252.00
CH	9.597	Beijing	China	1357.00
SA	1.221	Pretoria	South Africa	52.98

	population
area
1.221	105.96
3.286	2504.00
8.516	400.80
9.597	2714.00
17.100	287.00