1.导入库

 1 import os
 2 import h5py
 3 import shutil
 4 import sklearn
 5 import tempfile
 6 import numpy as np
 7 import pandas as pd
 8 import sklearn.datasets
 9 import sklearn.linear_model
10 import matplotlib.pyplot as plt
11 %matplotlib inline

2.产生数据

sklearn.datasets.make_classification产生测试数据。
10000组数据,特征向量维数为4。
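sklearn.cross_validation.train_test_split(新版 scikit-learn 中为 sklearn.model_selection.train_test_split)并不是交叉验证本身,而是把数据随机拆分为训练集(train set)和测试集(test set)。
未指定 test_size 时默认取 25% 的数据作为测试集,因此这里拆分为 7500:2500。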
# Generate a synthetic classification dataset: 10000 samples with 4 features,
# of which 2 are informative and none are redundant. random_state=0 makes the
# generated data reproducible.
X, y = sklearn.datasets.make_classification(
    n_samples=10000, n_features=4, n_redundant=0, n_informative=2,
    n_clusters_per_class=2, hypercube=False, random_state=0
)

# Split into train and test.
# sklearn.cross_validation was deprecated and later removed from scikit-learn;
# train_test_split now lives in sklearn.model_selection.
import sklearn.model_selection
X, Xt, y, yt = sklearn.model_selection.train_test_split(X, y)

 3.数据可视化

# Visualize a sample of the data.
# np.random.permutation(n) returns the integers 0..n-1 in random order.
# X.shape[0] is the number of training rows (7500 after the split), so this
# shuffles the indices 0-7499 and keeps the first 1000 as a random sample.
ind = np.random.permutation(X.shape[0])[:1000]
df = pd.DataFrame(X[ind])
# diagonal='kde' draws a kernel density estimate on the diagonal
# ('hist' would draw a histogram instead); points are coloured by label.
# The top-level pd.scatter_matrix alias was deprecated and later removed from
# pandas; the function is available as pd.plotting.scatter_matrix.
_ = pd.plotting.scatter_matrix(df, figsize=(9, 9), diagonal='kde', marker='o', s=40, alpha=.4, c=y[ind])
pd.scatter_matrix(新版 pandas 中为 pd.plotting.scatter_matrix)函数说明

 1 def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
 2                    diagonal='hist', marker='.', density_kwds=None,
 3                    hist_kwds=None, range_padding=0.05, **kwds):
 4     """
 5     Draw a matrix of scatter plots.
 6 
 7     Parameters
 8     ----------
 9     frame : DataFrame
10     alpha : float, optional
11         amount of transparency applied
12     figsize : (float,float), optional
13         a tuple (width, height) in inches
14     ax : Matplotlib axis object, optional
15     grid : bool, optional
16         setting this to True will show the grid
17     diagonal : {'hist', 'kde'}
18         pick between 'kde' and 'hist' for
19         either Kernel Density Estimation or Histogram
20         plot in the diagonal
21     marker : str, optional
22         Matplotlib marker type, default '.'    
23     hist_kwds : other plotting keyword arguments
24         To be passed to hist function
25     density_kwds : other plotting keyword arguments
26         To be passed to kernel density estimate plot
27     range_padding : float, optional
28         relative extension of axis range in x and y
29         with respect to (x_max - x_min) or (y_max - y_min),
30         default 0.05
31     kwds : other plotting keyword arguments
32         To be passed to scatter function
33 
34     Examples
35     --------
36     >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
37     >>> scatter_matrix(df, alpha=0.2)
38     """

View Code