# Bag of words with scikit-learn

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words (token count) table for a small toy corpus and attach
# a label column to it.

# Toy corpus: one string per document.
docs = ['Tea is an aromatic beverage..',
        'After water, it is the most widely consumed drink in the world',
        'There are many different types of tea.',
        'Tea has a stimulating effect in humans.',
        'Tea originated in Southwest China during the Shang dynasty']

# One row per document; the labels are drawn at random from {0, 1} (unseeded,
# so they differ from run to run) and serve only as placeholder targets.
df = pd.DataFrame({
    'sms_message': docs,
    'label': np.random.choice([0, 1], size=len(docs)),
})

# Learn the vocabulary and produce the sparse document-term count matrix.
cv = CountVectorizer()
counts = cv.fit_transform(df['sms_message'])

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement and fall back only on older releases.
try:
    vocabulary = cv.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0
    vocabulary = cv.get_feature_names()

# `toarray()` is the portable way to densify the sparse result; the `.A`
# shorthand is no longer available on recent SciPy sparse matrices.
df_counts = pd.DataFrame(counts.toarray(), columns=vocabulary)

# NOTE: if the vocabulary ever contains the token "label", this assignment
# overwrites that count column. Both frames use the default integer index, so
# rows line up one-to-one.
df_counts['label'] = df['label']
# Fierce Flatworm