# Bag of words with scikit-learn (CountVectorizer)
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus: five short sentences, one document per list element.
docs = [
    'Tea is an aromatic beverage..',
    'After water, it is the most widely consumed drink in the world',
    'There are many different types of tea.',
    'Tea has a stimulating effect in humans.',
    'Tea originated in Southwest China during the Shang dynasty',
]

# Pair each document with a randomly drawn 0/1 label. The labels are not
# seeded, so they differ between runs.
df = pd.DataFrame({
    'sms_message': docs,
    'label': np.random.choice([0, 1], size=len(docs)),
})

# Learn the vocabulary from the documents and build the document-term count
# matrix (one row per document, one column per vocabulary term).
cv = CountVectorizer()
counts = cv.fit_transform(df['sms_message'])

# Densify the count matrix and name its columns after the vocabulary terms.
# `.toarray()` replaces the `.A` shortcut (removed from SciPy sparse matrices)
# and `get_feature_names_out()` replaces `get_feature_names()` (removed in
# scikit-learn 1.2); the old spellings raise AttributeError on current releases.
df_counts = pd.DataFrame(counts.toarray(), columns=cv.get_feature_names_out())

# Carry the labels over; both frames share the default 0..n-1 index, so the
# rows line up one-to-one.
df_counts['label'] = df['label']
# Fierce Flatworm