Numpy-Pandas-SciPy

Numpy

Basic

Import

import numpy as np

Get shape

# tuple of array dimensions, e.g. (rows, cols) for a 2-D array
x.shape

Meshgrid

# 1-D coordinate vectors covering [-5, 5) in steps of 0.1
x = np.arange(-5, 5, 0.1)
y = np.arange(-5, 5, 0.1)
# expand the two vectors into 2-D coordinate grids
xx, yy = np.meshgrid(x, y)
# evaluate sin(r^2) / r^2 on the grid, where r^2 = xx^2 + yy^2
z = np.sin(xx**2 + yy**2) / (xx**2 + yy**2)
# filled contour plot of z
# NOTE(review): plt is not imported anywhere in this sheet — presumably matplotlib.pyplot
h = plt.contourf(x,y,z)

Sort

# index of the largest element (into the flattened array when no axis is given)
np.argmax(x)
# indices that would sort x in ascending order (along the last axis by default)
np.argsort(x)

Matrix

# change axes: swap the first two axes and keep the third (so xx must be 3-D here);
# the result is returned, xx itself is not modified
xx.transpose(1,0,2)

Random

Generate random samples

# random integers drawn uniformly from [low, high), returned with shape (d0, d1)
x = np.random.randint(low, high, (d0, d1))
# samples of shape (d0, d1, d2) from a uniform distribution over [0, 1)
x = np.random.rand(d0, d1, d2)
# samples of shape (d0, d1, d2) from the standard normal distribution (mean 0, variance 1)
x = np.random.randn(d0, d1, d2)
# n samples from a uniform distribution over [0, 1);
# the four spellings below are aliases of the same legacy function
x = np.random.random(n)
x = np.random.sample(n)
x = np.random.random_sample(n)
x = np.random.ranf(n)

Make random choice

# 12 draws from range(4), using the given probability for each of the four values
np.random.choice(4, 12, p=[.4, .1, .1, .4])
reps = 10000
# draw from x with replacement (the default) into an (n, reps) array
xb = np.random.choice(x, (n, reps))

Normal distribution

# n draws from a normal distribution with mean μ and standard deviation σ
μ = 100
σ = 15
n = 10000
x = np.random.normal(μ, σ, n)

Sample from a given distribution

# 1000 draws from the Gumbel distribution with its default location and scale
gb = np.random.gumbel(size=1000)
# density=True normalizes the histogram to unit area; it replaces the
# `normed` keyword, which was removed from Matplotlib
plt.hist(gb, bins=20, histtype='step', density=True, linewidth=1)

Pandas

Import

import pandas as pd
from pandas import Series, DataFrame

IO

Read from file

# comma-separated file
df = pd.read_csv(filename, sep=',')
# same, but parse the timestamp column as datetimes and use it as the index
df = pd.read_table(filename, sep=',',
                   parse_dates=["Date & Time Stamp"],
                   index_col="Date & Time Stamp")
# fixed-width file: one width per column, with explicit column names
df = pd.read_fwf(filename,
                 widths=[width1, width2, width3],
                 names=['col1', 'col2', 'col3'])

Save to file

# write as CSV text (the index is written too unless index=False is passed)
df.to_csv('path/filename')
# serialize the DataFrame with pickle; read it back with pd.read_pickle
df.to_pickle('path/filename')

Load JSON

import json

with open("data.json", "r") as f:
    data = json.load(f)
# keep a handle on the raw bucket list: `data` is rebound to a DataFrame on the
# next line, so the nested load below must not index into `data` again
buckets = data["aggregations"]["stat"]["buckets"]
data = pd.DataFrame.from_dict(buckets)

# load nested json
nested_data = pd.json_normalize(
    buckets,
    # fields copied onto every output row, with their column-name prefix
    meta_prefix='account-',
    meta=['key', ['date', 'buckets', 'key_as_string']],
    # path to the innermost list whose items become the rows, with their prefix
    record_path=['date', 'buckets', 'apps', 'buckets'],
    record_prefix = 'app-'
)

Quick check

# first num_of_row rows
df.head(num_of_row)
# last num_of_row rows
df.tail(num_of_row)
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Indexing

Set index

# use the given column as the row index; inplace=True modifies df instead of returning a copy
df.set_index(column, inplace=True)

Indexing

# column access by attribute or by key
df.label
df['label']
# also a column lookup: the column whose name is i (not the i-th row)
df[i]
# select with label
df.loc['label']
# select with index
df.iloc[i]
# NOTE: the mixed label/position indexer .ix was removed in pandas 1.0;
# use .loc for labels and .iloc for positions, as shown above

Select column

# all rows of the column named 'label'
df.loc[:,'label']

Rename columns

# returns a renamed copy; assign the result (or pass inplace=True) to keep it
df.rename(columns={"old_name":"new_name"})

Shift index

# shift the values down by i rows (the index stays put; vacated rows become NaN)
df.shift(i)
# moving average over each row and the two rows after it
# (the last two rows come out NaN because they have no full window)
(df + df.shift(-1) + df.shift(-2)) / 3

Sort

# sort rows by index label (returns a sorted copy)
df.sort_index()
# sort rows by the values in the 'Country' column (returns a sorted copy)
df.sort_values(by='Country')

Binning

# assign each value of L to one of bin_num equal-width bins
# (when bin_num is an int; a sequence is taken as explicit bin edges)
pd.cut(L, bin_num)

Groupby

# mean of each column within every group of col
df.groupby(col).mean()
# group by the combination of two columns
df.groupby([col1, col2]).mean()
# several aggregations at once: one count column and one mean column per input column
df.groupby(col).aggregate(['count', 'mean'])

# Find the min row of the group: idxmin gives, for each Serial_Num, the index
# label of the row with the smallest Distance; .loc then pulls those rows.
# Selecting the column before idxmin avoids a per-group Python-level apply.
df.loc[df.groupby(['Serial_Num'])['Distance'].idxmin()]

Resample

# resample needs a datetime-like index
# NOTE(review): recent pandas releases spell the month-end alias 'ME' rather than 'M' — confirm against the installed version
df.resample('1D').max()     #max of one day
df.resample('2M').mean()    #mean of two months

Time series resampling

from datetime import datetime

# start from an empty float series; giving the dtype explicitly avoids the
# implicit-dtype warning that pandas raises for a bare pd.Series()
ts = pd.Series(dtype=float)
ts.loc[datetime.now()] = 100
# regular 5-minute (300-second) grid between starttime and endtime
idx = pd.date_range(start=starttime, end=endtime, freq='5min')
# align onto the grid, filling each slot from the nearest existing sample;
# method may also be bfill, ffill, etc.
ts = ts.reindex(idx, method="nearest")

Iteration

# Series
for idx, value in s.items():
    pass

# DataFrame
for idx, row in df.iterrows():
    pass

Scipy

Integrate

Import

from scipy import integrate

Integrate

def f(x, y):
    """Integrand for the double-integral example: 6 * x * y**2."""
    y_squared = y**2
    return 6*x*y_squared


# integrate f over the unit square; each [lower, upper] pair bounds one
# argument of f, in order (x first, then y)
integrate.nquad(
    f,
    [[0, 1], [0, 1]],
)

Optimize

Import

from scipy.optimize import brentq

Find root

# solve x / e**x = k for x in [0, 1]; brentq needs a sign change over the
# bracket, so k must lie between 0 and 1/e (the function's values at 0 and 1)
x = brentq(lambda x: x/np.exp(x)-k, 0, 1)

Statistics

Import

from scipy import stats

Example 1

n = 5   # number of samples to draw
xs = [0.1, 0.5, 0.9]    # evaluation points (used as probabilities by ppf)
rv = stats.beta(a=0.5, b=0.5)   # frozen Beta(0.5, 0.5) distribution

print(rv.pdf(xs)) # equivalent of dbeta
print(rv.cdf(xs)) # equivalent of pbeta
print(rv.ppf(xs)) # equivalent of qbeta
print(rv.rvs(n))  # equivalent of rbeta

Example 2

# frozen exponential distribution with the default location and scale
dist = stats.expon()
# 100 evenly spaced points on [0, 4] — the x-axis range of the plot below
x = np.linspace(0,4,100)
# 100 evenly spaced points on [0, 1]
# NOTE(review): y is not used in the visible part of this example — presumably for a second panel
y = np.linspace(0,1,100)

with plt.xkcd():
    plt.figure(figsize=(12,4))
    plt.subplot(121)
    plt.plot(x, expon_cdf(x))
    plt.axis([0, 4, 0, 1])