In [1]:
import pandas as pd

Pandas Basics

Pandas is a high-level data manipulation tool(advanced numpy) that allows you to manipulate tabular data easily.

Two tutorials:

  1. 10 minutes to pandas
  2. Pandas cookbook

DataFrames

DataFrames is a key data structure in Pandas(advanced 2-dimension ndarray in numpy).

In [2]:
dict = {"country": ["Brazil", "Russia", "India", "China", "South Africa"],
       "capital": ["Brasilia", "Moscow", "New Dehli", "Beijing", "Pretoria"],
       "area": [8.516, 17.10, 3.286, 9.597, 1.221],
       "population": [200.4, 143.5, 1252, 1357, 52.98] }
brics = pd.DataFrame(dict)
brics
Out[2]:
area capital country population
0 8.516 Brasilia Brazil 200.40
1 17.100 Moscow Russia 143.50
2 3.286 New Dehli India 1252.00
3 9.597 Beijing China 1357.00
4 1.221 Pretoria South Africa 52.98
In [3]:
# data index, row index
brics.index
Out[3]:
RangeIndex(start=0, stop=5, step=1)
In [6]:
brics.index = ["BR", "RU", "IN", "CH", "SA"] # length must match
brics
Out[6]:
area capital country population
BR 8.516 Brasilia Brazil 200.40
RU 17.100 Moscow Russia 143.50
IN 3.286 New Dehli India 1252.00
CH 9.597 Beijing China 1357.00
SA 1.221 Pretoria South Africa 52.98
In [54]:
# column index
brics.columns
Out[54]:
Index(['area', 'capital', 'country', 'population'], dtype='object')
In [58]:
brics.describe() # describe the statistics of data of number type
Out[58]:
area population
count 5.000000 5.000000
mean 7.944000 601.176000
std 6.200557 645.261454
min 1.221000 52.980000
25% 3.286000 143.500000
50% 8.516000 200.400000
75% 9.597000 1252.000000
max 17.100000 1357.000000

Indexing DataFrames

In [33]:
# column index, by name
brics[['area', 'capital']]
Out[33]:
area capital
BR 8.516 Brasilia
RU 17.100 Moscow
IN 3.286 New Dehli
CH 9.597 Beijing
SA 1.221 Pretoria
In [11]:
# row index
brics[1:4]
Out[11]:
area capital country population
RU 17.100 Moscow Russia 143.5
IN 3.286 New Dehli India 1252.0
CH 9.597 Beijing China 1357.0
In [37]:
# table index by name, the fist dim is row and the second dim is column
brics.loc[['RU','IN'], ['area', 'capital']]
Out[37]:
area capital
RU 17.100 Moscow
IN 3.286 New Dehli
In [39]:
# table index by index
brics.iloc[:2, :2]
Out[39]:
area capital
BR 8.516 Brasilia
RU 17.100 Moscow
In [78]:
# boolean indexing
brics[brics.area > 4]
Out[78]:
area capital country population
BR 8.516 Brasilia Brazi 200.4
RU 17.100 Moscow Russia 143.5
CH 9.597 Beijing China 1357.0
In [21]:
for t in brics: # feel like a dict by column for 'for loop'
    print(t) # key
    print(type(brics[t])) #  feel like one column dataframe, actually a Series
    print(brics[t]) # value,
area
<class 'pandas.core.series.Series'>
BR     8.516
RU    17.100
IN     3.286
CH     9.597
SA     1.221
Name: area, dtype: float64
capital
<class 'pandas.core.series.Series'>
BR     Brasilia
RU       Moscow
IN    New Dehli
CH      Beijing
SA     Pretoria
Name: capital, dtype: object
country
<class 'pandas.core.series.Series'>
BR          Brazil
RU          Russia
IN           India
CH           China
SA    South Africa
Name: country, dtype: object
population
<class 'pandas.core.series.Series'>
BR     200.40
RU     143.50
IN    1252.00
CH    1357.00
SA      52.98
Name: population, dtype: float64

Sorting DataFrames

In [72]:
# sort by index, by row or column
# by row
brics.sort_index(axis=0, ascending=False)
Out[72]:
area capital country population
SA 1.221 Pretoria South Africa 52.98
RU 17.100 Moscow Russia 143.50
IN 3.286 New Dehli India 1252.00
CH 9.597 Beijing China 1357.00
BR 8.516 Brasilia Brazi 200.40
In [74]:
# by column
brics.sort_index(axis=1, ascending=False)
Out[74]:
population country capital area
BR 200.40 Brazi Brasilia 8.516
RU 143.50 Russia Moscow 17.100
IN 1252.00 India New Dehli 3.286
CH 1357.00 China Beijing 9.597
SA 52.98 South Africa Pretoria 1.221
In [76]:
# by value, same to by row index, but sort by the values in some columns
brics.sort_values('population')
Out[76]:
area capital country population
SA 1.221 Pretoria South Africa 52.98
RU 17.100 Moscow Russia 143.50
BR 8.516 Brasilia Brazi 200.40
IN 3.286 New Dehli India 1252.00
CH 9.597 Beijing China 1357.00
In [77]:
brics.sort_values(['area', 'population']) # dict order for multi columns
Out[77]:
area capital country population
SA 1.221 Pretoria South Africa 52.98
IN 3.286 New Dehli India 1252.00
BR 8.516 Brasilia Brazi 200.40
CH 9.597 Beijing China 1357.00
RU 17.100 Moscow Russia 143.50
In [22]:
area = brics['area']
print(type(area))
<class 'pandas.core.series.Series'>
In [23]:
area # fill like a one column DataFrame, but actually not.
Out[23]:
BR     8.516
RU    17.100
IN     3.286
CH     9.597
SA     1.221
Name: area, dtype: float64
In [40]:
# one column datafarme, just like matrix (n, 1) vs vector (n,)
brics[['area']]
Out[40]:
area
BR 8.516
RU 17.100
IN 3.286
CH 9.597
SA 1.221
In [59]:
# series to dataframe
brics_area = pd.DataFrame(area)
brics_area
Out[59]:
area
BR 8.516
RU 17.100
IN 3.286
CH 9.597
SA 1.221
In [26]:
# fit with an array: area.values
area2 = pd.Series(area.values)
In [27]:
area2
Out[27]:
0     8.516
1    17.100
2     3.286
3     9.597
4     1.221
dtype: float64
In [28]:
# 'for loop' test
for v in area: # like a array
    print(v)
8.516
17.1
3.286
9.597
1.221
In [29]:
for i in area.index: # but with index value
    print(i)
BR
RU
IN
CH
SA
In [30]:
# index Series by index name
area[area.index[0]]
Out[30]:
8.516
In [31]:
# by index
area[0]
Out[31]:
8.516
In [70]:
# sort is the same to dataframe
area.sort_index()
Out[70]:
BR     8.516
CH     9.597
IN     3.286
RU    17.100
SA     1.221
Name: area, dtype: float64
In [71]:
area.sort_values(ascending=False)
Out[71]:
RU    17.100
CH     9.597
BR     8.516
IN     3.286
SA     1.221
Name: area, dtype: float64

Modify value

In [45]:
# jsut assign value like numpy
brics[:1] = 0 # auto broadcast
brics
Out[45]:
area capital country population
BR 0.000 0 0 0.00
RU 17.100 Moscow Russia 143.50
IN 3.286 New Dehli India 1252.00
CH 9.597 Beijing China 1357.00
SA 1.221 Pretoria South Africa 52.98
In [46]:
brics[:1] = [8.516, 'Brasilia', 'Brazi', 200.40]
brics
Out[46]:
area capital country population
BR 8.516 Brasilia Brazi 200.40
RU 17.100 Moscow Russia 143.50
IN 3.286 New Dehli India 1252.00
CH 9.597 Beijing China 1357.00
SA 1.221 Pretoria South Africa 52.98

Change to numpy

In [50]:
np_brics = brics.values # no row indices and column names
np_brics
Out[50]:
array([[8.516, 'Brasilia', 'Brazi', 200.4],
       [17.1, 'Moscow', 'Russia', 143.5],
       [3.286, 'New Dehli', 'India', 1252.0],
       [9.597, 'Beijing', 'China', 1357.0],
       [1.221, 'Pretoria', 'South Africa', 52.98]], dtype=object)
In [56]:
brics.to_numpy()
Out[56]:
array([[8.516, 'Brasilia', 'Brazi', 200.4],
       [17.1, 'Moscow', 'Russia', 143.5],
       [3.286, 'New Dehli', 'India', 1252.0],
       [9.597, 'Beijing', 'China', 1357.0],
       [1.221, 'Pretoria', 'South Africa', 52.98]], dtype=object)
In [43]:
np_area = area.values # no row indices
np_area
Out[43]:
array([ 8.516, 17.1  ,  3.286,  9.597,  1.221])
In [57]:
area.to_numpy()
Out[57]:
array([ 8.516, 17.1  ,  3.286,  9.597,  1.221])
In [51]:
# do not share the same reference
np_brics[0,0] = 0
In [52]:
np_brics # change to 0
Out[52]:
array([[0, 'Brasilia', 'Brazi', 200.4],
       [17.1, 'Moscow', 'Russia', 143.5],
       [3.286, 'New Dehli', 'India', 1252.0],
       [9.597, 'Beijing', 'China', 1357.0],
       [1.221, 'Pretoria', 'South Africa', 52.98]], dtype=object)
In [53]:
brics # keep origin value
Out[53]:
area capital country population
BR 8.516 Brasilia Brazi 200.40
RU 17.100 Moscow Russia 143.50
IN 3.286 New Dehli India 1252.00
CH 9.597 Beijing China 1357.00
SA 1.221 Pretoria South Africa 52.98

Pandas Advance

The operations above are also available in numpy, we acctually do not have to use Pandas. Here are some Pandas features.

Join

In [81]:
# sql style join
pd.merge(left=brics, right=brics, on='area')
Out[81]:
area capital_x country_x population_x capital_y country_y population_y
0 8.516 Brasilia Brazi 200.40 Brasilia Brazi 200.40
1 17.100 Moscow Russia 143.50 Moscow Russia 143.50
2 3.286 New Dehli India 1252.00 New Dehli India 1252.00
3 9.597 Beijing China 1357.00 Beijing China 1357.00
4 1.221 Pretoria South Africa 52.98 Pretoria South Africa 52.98

Append

In [82]:
# numpy style
brics.append(brics)
Out[82]:
area capital country population
BR 8.516 Brasilia Brazi 200.40
RU 17.100 Moscow Russia 143.50
IN 3.286 New Dehli India 1252.00
CH 9.597 Beijing China 1357.00
SA 1.221 Pretoria South Africa 52.98
BR 8.516 Brasilia Brazi 200.40
RU 17.100 Moscow Russia 143.50
IN 3.286 New Dehli India 1252.00
CH 9.597 Beijing China 1357.00
SA 1.221 Pretoria South Africa 52.98
In [95]:
# column append
brics_new = brics.copy()
brics_new.columns = ['a', 'b', 'c', 'd']
brics.join(brics_new)
Out[95]:
area capital country population a b c d
BR 8.516 Brasilia Brazi 200.40 8.516 Brasilia Brazi 200.40
RU 17.100 Moscow Russia 143.50 17.100 Moscow Russia 143.50
IN 3.286 New Dehli India 1252.00 3.286 New Dehli India 1252.00
CH 9.597 Beijing China 1357.00 9.597 Beijing China 1357.00
SA 1.221 Pretoria South Africa 52.98 1.221 Pretoria South Africa 52.98

Group

By “group by” we are referring to a process involving one or more of the following steps:

  • Splitting the data into groups based on some criteria
  • Applying a function to each group independently
  • Combining the results into a data structure
In [96]:
brics2 = brics.append(brics)
In [105]:
# step 1
area_group = brics2.groupby('area')
In [108]:
# step 2 and 3, the followings are the same
area_group.count()
Out[108]:
capital country population
area
1.221 2 2 2
3.286 2 2 2
8.516 2 2 2
9.597 2 2 2
17.100 2 2 2
In [113]:
area_group.sum() # onlt support number type
Out[113]:
population
area
1.221 105.96
3.286 2504.00
8.516 400.80
9.597 2714.00
17.100 287.00
In [114]:
area_group.mean()
Out[114]:
population
area
1.221 52.98
3.286 1252.00
8.516 200.40
9.597 1357.00
17.100 143.50
In [ ]: