# Extracting features from categorical variables
One-hot encoding of a categorical feature with DictVectorizer:

```python
from sklearn.feature_extraction import DictVectorizer

onehot_encoder = DictVectorizer()
instances = [{'city': 'New York'}, {'city': 'San Francisco'}, {'city': 'Chapel Hill'}]
print(onehot_encoder.fit_transform(instances).toarray())
```

Output:

```
[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]
```
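To see which column encodes which city, the fitted vectorizer's feature names can be inspected; a minimal sketch continuing the example above (feature_names_ is the list stored by a fitted DictVectorizer):

```python
# Each column of the one-hot matrix corresponds to one 'city=...' feature;
# DictVectorizer sorts the feature names alphabetically by default.
print(onehot_encoder.feature_names_)
# ['city=Chapel Hill', 'city=New York', 'city=San Francisco']
```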
# Extracting features from text
## The bag-of-words representation
The bag-of-words model represents a document by the counts of the words it contains:

```python
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game'
]
```

The CountVectorizer class splits each sentence with a regular expression and keeps token sequences of two or more alphanumeric characters. The scikit-learn code is as follows:

```python
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game'
]
vectorizer = CountVectorizer()
# todense converts the sparse matrix into a dense feature matrix
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)
```
Output:

```
[[1 1 0 1 0 1 0 1]
 [1 1 1 0 1 0 1 0]]
{'duke': 1, 'basketball': 0, 'lost': 4, 'played': 5, 'game': 2, 'unc': 7, 'in': 3, 'the': 6}
```
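The tokenization rule can be checked directly: build_tokenizer returns the callable that CountVectorizer applies to each document. A minimal sketch (lowercasing happens in a separate preprocessing step, so these raw tokens keep their case):

```python
# The default token pattern keeps sequences of two or more word characters,
# so one-letter tokens such as 'I' or 'a' are dropped.
tokenizer = vectorizer.build_tokenizer()
print(tokenizer('UNC played Duke in basketball'))
# ['UNC', 'played', 'Duke', 'in', 'basketball']
```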
Adding a third document introduces new tokens and enlarges the vocabulary:

```python
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
]
vectorizer = CountVectorizer()
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)
```
Output:

```
[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'duke': 2, 'basketball': 1, 'lost': 5, 'played': 6, 'in': 4, 'game': 3, 'sandwich': 7, 'unc': 9, 'ate': 0, 'the': 8}
```
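Once fitted, the same vectorizer can transform new documents against the learned vocabulary; a minimal sketch with an illustrative sentence, where unseen tokens such as 'pizza' are simply ignored:

```python
# Only 'ate' (column 0) and 'sandwich' (column 7) are in the fitted vocabulary;
# 'and' and 'pizza' were never seen, and one-letter tokens are dropped.
print(vectorizer.transform(['I ate a sandwich and a pizza']).todense())
# [[1 0 0 0 0 0 0 1 0 0]]
```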
The euclidean_distances function in scikit-learn computes the distances between vectors; the intuition is that the two most semantically similar documents are also the ones whose vectors lie closest together in this space.
```python
from sklearn.metrics.pairwise import euclidean_distances

counts = [[0, 1, 1, 0, 0, 1, 0, 1],
          [0, 1, 1, 1, 1, 0, 0, 0],
          [1, 0, 0, 0, 0, 0, 1, 0]]
# euclidean_distances expects 2D inputs, so each count vector is wrapped in a list
print('Distance between 1st and 2nd documents:', euclidean_distances([counts[0]], [counts[1]]))
```
Output:

```
Distance between 1st and 2nd documents: [[ 2.]]
```
The same comparison for every pair of documents, using the format method:

```python
counts = [[0, 1, 1, 0, 0, 1, 0, 1],
          [0, 1, 1, 1, 1, 0, 0, 0],
          [1, 0, 0, 0, 0, 0, 1, 0]]
for x, y in [[0, 1], [0, 2], [1, 2]]:
    dist = euclidean_distances([counts[x]], [counts[y]])
    print('Distance between document {} and document {}: {}'.format(x, y, dist))
```
Output:

```
Distance between document 0 and document 1: [[ 2.]]
Distance between document 0 and document 2: [[ 2.44948974]]
Distance between document 1 and document 2: [[ 2.44948974]]
```
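Since euclidean_distances also accepts a single matrix and computes every pairwise distance in one call, the loop above can be replaced; a minimal sketch reusing counts from the example:

```python
from sklearn.metrics.pairwise import euclidean_distances

counts = [[0, 1, 1, 0, 0, 1, 0, 1],
          [0, 1, 1, 1, 1, 0, 0, 0],
          [1, 0, 0, 0, 0, 0, 1, 0]]
# With a single argument, euclidean_distances returns the symmetric matrix of
# all pairwise distances; the diagonal is zero and the off-diagonal entries
# match the looped results above.
print(euclidean_distances(counts))
```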
# Extracting features from images

The following example detects Harris corners in an image with scikit-image:

```python
from skimage import io
from skimage.color import rgb2gray
from skimage.exposure import equalize_hist
from skimage.feature import corner_harris, corner_peaks

mandrill = io.imread('1.jpg')
# Convert to grayscale and equalize the histogram before corner detection
mandrill = equalize_hist(rgb2gray(mandrill))
corners = corner_peaks(corner_harris(mandrill), min_distance=2)
show_corners(corners, mandrill)  # plotting helper, sketched below
```
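show_corners is not a scikit-image function; it is a small plotting helper. A minimal sketch of such a helper with matplotlib, assuming corners is the (row, column) array returned by corner_peaks; the styling choices are illustrative:

```python
import matplotlib.pyplot as plt

def show_corners(corners, image):
    """Display the grayscale image with the detected corners marked."""
    plt.gray()
    plt.imshow(image)
    y_corner, x_corner = zip(*corners)  # corner_peaks returns (row, col) pairs
    plt.plot(x_corner, y_corner, 'or')  # 'or' = red circle markers
    plt.xlim(0, image.shape[1])
    plt.ylim(image.shape[0], 0)  # keep the origin at the top-left of the image
    plt.axis('off')
    plt.show()
```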
### SIFT and SURF
```python
import mahotas as mh
from mahotas.features import surf

image = mh.imread('2.jpg', as_grey=True)
descriptors = surf.surf(image)  # compute the SURF descriptors once
print('The first SURF descriptor:\n{}\n'.format(descriptors[0]))
print('Extracted %s SURF descriptors' % len(descriptors))
```
Output: the first SURF descriptor, followed by the number of descriptors extracted (the values depend on the input image).