Numpy-Pandas-SciPy
Numpy
Basic
Import
import numpy as np
Get shape
# tuple holding the length of each array dimension
x.shape
Meshgrid
# 1-D coordinate axes from -5 up to (but excluding) 5 in steps of 0.1
x = np.arange(-5, 5, 0.1)
y = np.arange(-5, 5, 0.1)
# expand the two axes into 2-D coordinate grids covering every (x, y) pair
xx, yy = np.meshgrid(x, y)
# evaluate sin(r^2) / r^2 on the grid, where r^2 = x^2 + y^2
z = np.sin(xx**2 + yy**2) / (xx**2 + yy**2)
# filled contour plot of z over the grid.
# NOTE(review): plt is not imported in this file — presumably matplotlib.pyplot; confirm
h = plt.contourf(x,y,z)
Sort
# index of the largest element (into the flattened array when no axis is given)
np.argmax(x)
# indices that would put x in ascending order
np.argsort(x)
Matrix
# change axes: swap the first two axes and keep the third in place.
# NOTE(review): this call needs a 3-D array; the xx produced by the
# np.meshgrid snippet earlier in this file is 2-D
xx.transpose(1,0,2)
Random
Generate random samples
# random integers drawn uniformly from [low, high), returned with shape (d0, d1)
x = np.random.randint(low, high, (d0, d1))
# floats drawn uniformly from [0, 1), returned with shape (d0, d1, d2)
x = np.random.rand(d0, d1, d2)
# floats drawn from the standard normal distribution (mean 0, standard
# deviation 1, unbounded range), returned with shape (d0, d1, d2)
x = np.random.randn(d0, d1, d2)
# n floats drawn uniformly from [0, 1); the four calls below are equivalent
x = np.random.random(n)
x = np.random.sample(n)
x = np.random.random_sample(n)
x = np.random.ranf(n)  # the alias is "ranf"; numpy.random has no "randf"
Make random choice
# draw 12 values from range(4), picking each value with the given probability
np.random.choice(4, 12, p=[.4, .1, .1, .4])
reps = 10000
# resample x with replacement into an (n, reps) array
# (presumably reps bootstrap replicates of n draws each — confirm intent)
xb = np.random.choice(x, (n, reps))
Normal distribution
# draw n samples from a normal distribution with mean μ and standard deviation σ
μ, σ = 100, 15
n = 10000
x = np.random.normal(loc=μ, scale=σ, size=n)
Sample from a given distribution
# draw 1000 samples from the Gumbel distribution with default location and scale
gb = np.random.gumbel(size=1000)
# density=True normalizes the histogram to unit area; it replaces the old
# `normed` keyword, which current Matplotlib releases no longer accept
plt.hist(gb, bins=20, histtype='step', density=True, linewidth=1)
Pandas
Import
import pandas as pd
from pandas import Series, DataFrame
IO
Read from file
# comma-separated file
df = pd.read_csv(filename, sep=',')
# delimited file: parse the timestamp column as dates and use it as the index
df = pd.read_table(filename, sep=',',
                   parse_dates=["Date & Time Stamp"],
                   index_col="Date & Time Stamp")
# fixed-width file: one width and one name per column
# (the comma after the widths list was missing, which is a syntax error)
df = pd.read_fwf(filename,
                 widths=[width1, width2, width3],
                 names=['col1', 'col2', 'col3'])
Save to file
# write the frame as CSV text
df.to_csv('path/filename')
# serialize the frame with pickle
df.to_pickle('path/filename')
Load JSON
import json
# parse the JSON file into plain Python objects (JSON text is UTF-8)
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)
# flat load: one row per bucket
data = pd.DataFrame.from_dict(data["aggregations"]["stat"]["buckets"])
# load nested json
# NOTE: this is an alternative to the flat load above and must be given the
# raw dict returned by json.load, not the DataFrame that `data` was rebound to
nested_data = pd.json_normalize(
    data["aggregations"]["stat"]["buckets"],
    meta_prefix='account-',
    meta=['key', ['date', 'buckets', 'key_as_string']],
    record_path=['date', 'buckets', 'apps', 'buckets'],
    record_prefix='app-',
)
Quick check
# first num_of_row rows
df.head(num_of_row)
# last num_of_row rows
df.tail(num_of_row)
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()
Indexing
Set index
# use `column` as the row index; inplace=True modifies df instead of returning a new frame
df.set_index(column, inplace=True)
Indexing
# column access by attribute or by key
df.label
df['label']
df[i]
# select rows by index label
df.loc['label']
# select rows by integer position
df.iloc[i]
# DataFrame.ix (mixed label/position lookup) was removed in pandas 1.0;
# state the intent explicitly with .loc or .iloc instead
df.loc['label']
df.iloc[i]
Select column
# all rows of the 'label' column (the name was misspelled as 'lable')
df.loc[:,'label']
Rename columns
# returns a renamed copy; df itself is unchanged unless the result is assigned back
df.rename(columns={"old_name":"new_name"})
Shift index
# shift the values by i rows (down for positive i, up for negative i); the index stays in place
df.shift(i)
# 3-point moving average: each row averaged with the two rows that follow it
(df + df.shift(-1) + df.shift(-2)) / 3
Sort
# order rows by index label
df.sort_index()
# order rows by the values in the 'Country' column
df.sort_values(by='Country')
Binning
# assign each value of L to a bin; when bin_num is an integer, pandas
# builds that many equal-width bins over the range of L
pd.cut(L, bin_num)
Groupby
# mean of every group of rows sharing the same value in col
df.groupby(col).mean()
# group by the combination of two columns
df.groupby([col1, col2]).mean()
# several aggregations at once
df.groupby(col).aggregate(['count', 'mean'])
# Find the min row of the group: idxmin on the selected column yields, per
# Serial_Num, the index label of the smallest Distance, without a Python-level
# apply/lambda per group
df.loc[df.groupby('Serial_Num')['Distance'].idxmin()]
Resample
df.resample('1D').max() # max within each one-day bin
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of 'ME' — confirm the target pandas version
df.resample('2M').mean() # mean within each two-month bin
Time series resampling
# start from an empty series and add one observation stamped with the current time
# NOTE(review): datetime is not imported in this file — presumably `from datetime import datetime`; confirm
ts = pd.Series()
ts.loc[datetime.now()] = 100
# regular grid between starttime and endtime with 300-second spacing
idx = pd.date_range(start=starttime, end=endtime, freq='300S')
ts = ts.reindex(idx, method="nearest") # method may also be bfill, ffill, etc.
Iteration
# Series: iterate over (index label, value) pairs
for idx, value in s.items():
    pass
# DataFrame: iterate over (index label, row) pairs, each row given as a Series
for idx, row in df.iterrows():
    pass
Scipy
Integrate
Import
from scipy import integrate
Integrate
def f(x, y):
    """Return the integrand 6 * x * y**2."""
    return 6*x*y**2


# integrate f over the unit square: x in [0, 1] and y in [0, 1];
# nquad returns a (value, estimated absolute error) pair
integrate.nquad(f, [[0, 1],[0, 1]])
Optimize
Import
from scipy.optimize import brentq
Find root
# root of x * exp(-x) = k on the bracket [0, 1]; brentq needs the function to
# change sign across the bracket, which here means 0 <= k <= 1/e.
# np.exp is used because a bare `exp` is never imported in this file.
x = brentq(lambda x: x/np.exp(x)-k, 0, 1)
Statistics
Import
from scipy import stats
Example 1
n = 5 # number of random samples to draw
xs = [0.1, 0.5, 0.9] # evaluation points for pdf/cdf; read as probabilities by ppf
rv = stats.beta(a=0.5, b=0.5) # frozen Beta(0.5, 0.5) distribution
print(rv.pdf(xs)) # equivalent of dbeta
print(rv.cdf(xs)) # equivalent of pbeta
print(rv.ppf(xs)) # equivalent of qbeta
print(rv.rvs(n)) # equivalent of rbeta
Example 2
# frozen exponential distribution with default location and scale
dist = stats.expon()
x = np.linspace(0,4,100)
y = np.linspace(0,1,100)
with plt.xkcd():
    plt.figure(figsize=(12,4))
    plt.subplot(121)
    # expon_cdf was never defined; the CDF comes from the frozen distribution
    plt.plot(x, dist.cdf(x))
    plt.axis([0, 4, 0, 1])