Commit 6e7adb7

Pushing the docs to dev/ for branch: master, commit 0b6308c2708fe03071cdbf24997eb967403f5965

1 parent: c63e022

File tree: 1,081 files changed (+4,644 -3,641 lines)

-1.5 KB: Binary file not shown.
-1.44 KB: Binary file not shown.
dev/_downloads/column_transformer.ipynb (new file, +54)

@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Column Transformer with Heterogeneous Data Sources\n\n\nDatasets can often contain components that require different feature\nextraction and processing pipelines. This scenario might occur when:\n\n1. Your dataset consists of heterogeneous data types (e.g. raster images and\n text captions)\n2. Your dataset is stored in a Pandas DataFrame and different columns\n require different processing pipelines.\n\nThis example demonstrates how to use\n:class:`sklearn.compose.ColumnTransformer` on a dataset containing\ndifferent types of features. We use the 20-newsgroups dataset and compute\nstandard bag-of-words features for the subject line and body in separate\npipelines as well as ad hoc features on the body. We combine them (with\nweights) using a ColumnTransformer and finally train a classifier on the\ncombined set of features.\n\nThe choice of features is not particularly helpful, but serves to illustrate\nthe technique.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Author: Matt Terry <[email protected]>\n#\n# License: BSD 3 clause\nfrom __future__ import print_function\n\nimport numpy as np\n\nfrom sklearn.base import BaseEstimator, TransformerMixin\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer\nfrom sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction import DictVectorizer\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics import classification_report\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.svm import SVC\n\n\nclass TextStats(BaseEstimator, TransformerMixin):\n \"\"\"Extract features from each document for DictVectorizer\"\"\"\n\n def fit(self, x, y=None):\n return self\n\n def transform(self, posts):\n return [{'length': len(text),\n 'num_sentences': text.count('.')}\n for text in posts]\n\n\nclass SubjectBodyExtractor(BaseEstimator, TransformerMixin):\n \"\"\"Extract the subject & body from a usenet post in a single pass.\n\n Takes a sequence of strings and produces a dict of sequences. Keys are\n `subject` and `body`.\n \"\"\"\n def fit(self, x, y=None):\n return self\n\n def transform(self, posts):\n # construct object dtype array with two columns\n # first column = 'subject' and second column = 'body'\n features = np.empty(shape=(len(posts), 2), dtype=object)\n for i, text in enumerate(posts):\n headers, _, bod = text.partition('\\n\\n')\n bod = strip_newsgroup_footer(bod)\n bod = strip_newsgroup_quoting(bod)\n features[i, 1] = bod\n\n prefix = 'Subject:'\n sub = ''\n for line in headers.split('\\n'):\n if line.startswith(prefix):\n sub = line[len(prefix):]\n break\n features[i, 0] = sub\n\n return features\n\n\npipeline = Pipeline([\n # Extract the subject & body\n ('subjectbody', SubjectBodyExtractor()),\n\n # Use ColumnTransformer to combine the features from subject and body\n ('union', ColumnTransformer(\n [\n # Pulling features from the post's subject line (first column)\n ('subject', TfidfVectorizer(min_df=50), 0),\n\n # Pipeline for standard bag-of-words model for body (second column)\n ('body_bow', Pipeline([\n ('tfidf', TfidfVectorizer()),\n ('best', TruncatedSVD(n_components=50)),\n ]), 1),\n\n # Pipeline for pulling ad hoc features from post's body\n ('body_stats', Pipeline([\n ('stats', TextStats()), # returns a list of dicts\n ('vect', DictVectorizer()), # list of dicts -> feature matrix\n ]), 1),\n ],\n\n # weight components in ColumnTransformer\n transformer_weights={\n 'subject': 0.8,\n 'body_bow': 0.5,\n 'body_stats': 1.0,\n }\n )),\n\n # Use a SVC classifier on the combined features\n ('svc', SVC(kernel='linear')),\n])\n\n# limit the list of categories to make running this example faster.\ncategories = ['alt.atheism', 'talk.religion.misc']\ntrain = fetch_20newsgroups(random_state=1,\n subset='train',\n categories=categories,\n )\ntest = fetch_20newsgroups(random_state=1,\n subset='test',\n categories=categories,\n )\n\npipeline.fit(train.data, train.target)\ny = pipeline.predict(test.data)\nprint(classification_report(test.target, y))"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.5"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
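The heart of the new example is ColumnTransformer's (name, transformer, column) tuples: a scalar column index hands each transformer a single 1D column of the 2D object array produced by SubjectBodyExtractor. Below is a minimal sketch of that pattern on toy data; the array contents and transformer names are illustrative only, not part of this commit.

# Minimal sketch (illustrative data and names, not from the commit) of the
# ColumnTransformer pattern used in the example above.
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# Column 0 plays the role of 'subject', column 1 the role of 'body'.
X = np.array([
    ['re: atheism', 'a longer body of text discussing atheism'],
    ['re: religion', 'a longer body of text discussing religion'],
    ['re: atheism', 'yet another post body, rather different'],
], dtype=object)

ct = ColumnTransformer(
    [
        # (name, transformer, column): a scalar column index passes a 1D
        # array of strings, which is what TfidfVectorizer expects.
        ('subject', TfidfVectorizer(), 0),
        ('body', TfidfVectorizer(), 1),
    ],
    # optional per-transformer weights, as in the example above
    transformer_weights={'subject': 0.8, 'body': 0.5},
)

features = ct.fit_transform(X)
print(features.shape)  # (3, n_subject_terms + n_body_terms)

fit_transform horizontally stacks each transformer's output, which is exactly what the 'union' step feeds the SVC in the example.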

dev/_downloads/hetero_feature_union.py renamed to dev/_downloads/column_transformer.py (+22 -67)
@@ -1,7 +1,7 @@
 """
-=============================================
-Feature Union with Heterogeneous Data Sources
-=============================================
+==================================================
+Column Transformer with Heterogeneous Data Sources
+==================================================
 
 Datasets can often contain components that require different feature
 extraction and processing pipelines. This scenario might occur when:
@@ -12,12 +12,12 @@
 require different processing pipelines.
 
 This example demonstrates how to use
-:class:`sklearn.feature_extraction.FeatureUnion` on a dataset containing
+:class:`sklearn.compose.ColumnTransformer` on a dataset containing
 different types of features. We use the 20-newsgroups dataset and compute
 standard bag-of-words features for the subject line and body in separate
 pipelines as well as ad hoc features on the body. We combine them (with
-weights) using a FeatureUnion and finally train a classifier on the combined
-set of features.
+weights) using a ColumnTransformer and finally train a classifier on the
+combined set of features.
 
 The choice of features is not particularly helpful, but serves to illustrate
 the technique.
@@ -38,50 +38,11 @@
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import classification_report
-from sklearn.pipeline import FeatureUnion
 from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
 from sklearn.svm import SVC
 
 
-class ItemSelector(BaseEstimator, TransformerMixin):
-    """For data grouped by feature, select subset of data at a provided key.
-
-    The data is expected to be stored in a 2D data structure, where the first
-    index is over features and the second is over samples. i.e.
-
-    >> len(data[key]) == n_samples
-
-    Please note that this is the opposite convention to scikit-learn feature
-    matrixes (where the first index corresponds to sample).
-
-    ItemSelector only requires that the collection implement getitem
-    (data[key]). Examples include: a dict of lists, 2D numpy array, Pandas
-    DataFrame, numpy record array, etc.
-
-    >> data = {'a': [1, 5, 2, 5, 2, 8],
-               'b': [9, 4, 1, 4, 1, 3]}
-    >> ds = ItemSelector(key='a')
-    >> data['a'] == ds.transform(data)
-
-    ItemSelector is not designed to handle data grouped by sample. (e.g. a
-    list of dicts). If your data is structured this way, consider a
-    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.
-
-    Parameters
-    ----------
-    key : hashable, required
-        The key corresponding to the desired value in a mappable.
-    """
-    def __init__(self, key):
-        self.key = key
-
-    def fit(self, x, y=None):
-        return self
-
-    def transform(self, data_dict):
-        return data_dict[self.key]
-
-
 class TextStats(BaseEstimator, TransformerMixin):
     """Extract features from each document for DictVectorizer"""
 
@@ -104,21 +65,22 @@ def fit(self, x, y=None):
         return self
 
     def transform(self, posts):
-        features = np.recarray(shape=(len(posts),),
-                               dtype=[('subject', object), ('body', object)])
+        # construct object dtype array with two columns
+        # first column = 'subject' and second column = 'body'
+        features = np.empty(shape=(len(posts), 2), dtype=object)
         for i, text in enumerate(posts):
             headers, _, bod = text.partition('\n\n')
             bod = strip_newsgroup_footer(bod)
             bod = strip_newsgroup_quoting(bod)
-            features['body'][i] = bod
+            features[i, 1] = bod
 
             prefix = 'Subject:'
             sub = ''
             for line in headers.split('\n'):
                 if line.startswith(prefix):
                     sub = line[len(prefix):]
                     break
-            features['subject'][i] = sub
+            features[i, 0] = sub
 
         return features
 
@@ -127,38 +89,31 @@ def transform(self, posts):
     # Extract the subject & body
     ('subjectbody', SubjectBodyExtractor()),
 
-    # Use FeatureUnion to combine the features from subject and body
-    ('union', FeatureUnion(
-        transformer_list=[
+    # Use ColumnTransformer to combine the features from subject and body
+    ('union', ColumnTransformer(
+        [
+            # Pulling features from the post's subject line (first column)
+            ('subject', TfidfVectorizer(min_df=50), 0),
 
-            # Pipeline for pulling features from the post's subject line
-            ('subject', Pipeline([
-                ('selector', ItemSelector(key='subject')),
-                ('tfidf', TfidfVectorizer(min_df=50)),
-            ])),
-
-            # Pipeline for standard bag-of-words model for body
+            # Pipeline for standard bag-of-words model for body (second column)
             ('body_bow', Pipeline([
-                ('selector', ItemSelector(key='body')),
                 ('tfidf', TfidfVectorizer()),
                 ('best', TruncatedSVD(n_components=50)),
-            ])),
+            ]), 1),
 
             # Pipeline for pulling ad hoc features from post's body
            ('body_stats', Pipeline([
-                ('selector', ItemSelector(key='body')),
                 ('stats', TextStats()),  # returns a list of dicts
                 ('vect', DictVectorizer()),  # list of dicts -> feature matrix
-            ])),
-
+            ]), 1),
         ],
 
-        # weight components in FeatureUnion
+        # weight components in ColumnTransformer
        transformer_weights={
             'subject': 0.8,
             'body_bow': 0.5,
             'body_stats': 1.0,
-        },
+        }
     )),
 
     # Use a SVC classifier on the combined features
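For readers migrating similar code, the change in the diff above is mechanical: under FeatureUnion every branch had to start with a selector transformer, whereas ColumnTransformer folds the selection into the (name, transformer, column) triple. The sketch below contrasts the two patterns; ColumnSelector is a hypothetical stand-in for the deleted ItemSelector, not code from the commit.

# Sketch of the migration performed in this diff. ColumnSelector is an
# illustrative stand-in for the removed ItemSelector; it is not in the commit.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline


class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select one column of a 2D array (hypothetical ItemSelector stand-in)."""

    def __init__(self, col):
        self.col = col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[:, self.col]


# Before: every FeatureUnion branch needs its own selector step.
union_before = FeatureUnion([
    ('subject', Pipeline([('select', ColumnSelector(0)),
                          ('tfidf', TfidfVectorizer())])),
    ('body', Pipeline([('select', ColumnSelector(1)),
                       ('tfidf', TfidfVectorizer())])),
])

# After: the column index is the third element of each tuple, so the
# selector classes disappear entirely.
union_after = ColumnTransformer([
    ('subject', TfidfVectorizer(), 0),
    ('body', TfidfVectorizer(), 1),
])

On a two-column object array such as SubjectBodyExtractor returns, both produce the same horizontally stacked feature matrix; the ColumnTransformer version simply has fewer moving parts.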

dev/_downloads/hetero_feature_union.ipynb (-54)
This file was deleted.

dev/_downloads/scikit-learn-docs.pdf (-133 KB)
Binary file not shown.

dev/_images/iris.png and other images under dev/_images/ changed (37 small binary size deltas; remaining file names not shown).

0 commit comments