Data Loading

In [0]:
import nltk
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[0]:
True
In [0]:
from sklearn.metrics import accuracy_score
In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
In [0]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.multiclass import OneVsRestClassifier
In [0]:
stop_words = set(stopwords.words('english'))
In [0]:
from collections import OrderedDict 
In [0]:
df = pd.read_csv('Training_Data.csv',encoding = "ISO-8859-1")
In [0]:
test = df[df['Growth']== 0][df['Inflation']==0][df['Cad']==0][df['Money']==0][df['Fiscal']==0][df['Other']==0]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
In [0]:
test.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 189494.000000 189494.0 189494.0 189494.0 189494.0 189494.0 189494.0
mean 109299.531917 0.0 0.0 0.0 0.0 0.0 0.0
std 63907.299044 0.0 0.0 0.0 0.0 0.0 0.0
min 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
25% 53950.250000 0.0 0.0 0.0 0.0 0.0 0.0
50% 107822.500000 0.0 0.0 0.0 0.0 0.0 0.0
75% 164733.750000 0.0 0.0 0.0 0.0 0.0 0.0
max 220852.000000 0.0 0.0 0.0 0.0 0.0 0.0
In [0]:
df1 = pd.concat([test,df])
In [0]:
train = df1.drop_duplicates(keep=False)
In [0]:
train.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 31359.000000 31359.000000 31359.000000 31359.000000 31359.000000 31359.000000 31359.000000
mean 117232.943557 0.534073 0.411333 0.142957 0.096017 0.053222 0.040754
std 62395.971409 0.498846 0.492083 0.350035 0.294619 0.224480 0.197723
min 7.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 63979.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 124876.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 170273.500000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000
max 220848.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [0]:
train['Sentence']
Out[0]:
7         In addition to weak demand, the sharp fall in ...
9         “There could be improvement in the days to com...
45        The chart shows that average inflation expecta...
47        add_main_imageThe years of low inflation from ...
48        Conversely, the years of high inflation betwee...
                                ...                        
220840    The future of cryptocurrency startups is uncer...
220841    A draft bill, Banning of Crypto Currency and R...
220843    “Also, billions of people worldwide are introd...
220847    A person aware of the developments had told ET...
220848    Sathvik Vishwanath, CEO and cofounder, Unocoin...
Name: Sentence, Length: 31359, dtype: object

NLP MultiClass

In [0]:
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])
In [0]:
 categories = ['Growth', 'Inflation', 'Cad', 'Money','Fiscal', 'Other']
In [0]:
x_train,x_test = tt(train,test_size = 0.5,random_state = 1)
In [0]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    NB_pipeline.fit(x_train['Sentence'], x_train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(x_test['Sentence'])
    print('Test accuracy is {}'.format(accuracy_score(x_test[category], prediction)))
... Processing Growth
Test accuracy is 0.9246173469387755
... Processing Inflation
Test accuracy is 0.9117984693877551
... Processing Cad
Test accuracy is 0.8779336734693878
... Processing Money
Test accuracy is 0.9185586734693878
... Processing Fiscal
Test accuracy is 0.9452168367346939
... Processing Other
Test accuracy is 0.9584183673469387
In [0]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])
In [0]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(x_test['Sentence'])
    print('Test accuracy is {}'.format(accuracy_score(x_test[category], prediction)))
... Processing Growth
Test accuracy is 0.9815051020408163
... Processing Inflation
Test accuracy is 0.9830357142857142
... Processing Cad
Test accuracy is 0.9827168367346939
... Processing Money
Test accuracy is 0.9900510204081633
... Processing Fiscal
Test accuracy is 0.99375
... Processing Other
Test accuracy is 0.9926020408163265
In [0]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])
In [0]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(x_train['Sentence'], x_train[category])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(x_test['Sentence'])
    print('Test accuracy is {}'.format(accuracy_score(x_test[category], prediction)))
... Processing Growth
Test accuracy is 0.973405612244898
... Processing Inflation
Test accuracy is 0.9677295918367347
... Processing Cad
Test accuracy is 0.963265306122449
... Processing Money
Test accuracy is 0.9741709183673469
... Processing Fiscal
Test accuracy is 0.9841198979591836
... Processing Other
Test accuracy is 0.9791454081632653

Test Dataset NLP Multiclass model

In [0]:
x_train,x_test = tt(train,test_size = 0.5,random_state = 9)
In [0]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(test['Sentence'])
    test[category] = prediction
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
In [0]:
anomoly = test[test['Growth']== 0][test['Inflation']==0][test['Cad']==0][test['Money']==0][test['Fiscal']==0][test['Other']==0]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
In [0]:
anomoly.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 126921.000000 126921.0 126921.0 126921.0 126921.0 126921.0 126921.0
mean 110844.316488 0.0 0.0 0.0 0.0 0.0 0.0
std 64008.795475 0.0 0.0 0.0 0.0 0.0 0.0
min 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
25% 55265.000000 0.0 0.0 0.0 0.0 0.0 0.0
50% 110605.000000 0.0 0.0 0.0 0.0 0.0 0.0
75% 166511.000000 0.0 0.0 0.0 0.0 0.0 0.0
max 220852.000000 0.0 0.0 0.0 0.0 0.0 0.0
In [0]:
x_train,x_test = tt(train,test_size = 0.5,random_state = 8)
In [0]:
train1 = pd.concat([test,train,anomoly])
In [0]:
train = train1.drop_duplicates(keep=False)
In [0]:
train.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 93932.000000 93932.000000 93932.000000 93932.000000 93932.000000 93932.000000 93932.000000
mean 109860.770398 0.611698 0.277541 0.100413 0.075012 0.029500 0.033769
std 63406.210596 0.487367 0.447788 0.300552 0.263412 0.169204 0.180635
min 3.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 55141.750000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 110184.500000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 164479.250000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000
max 220848.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [0]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(x_train['Sentence'], x_train[category])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(anomoly['Sentence'])
    anomoly[category] = prediction
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
In [0]:
Check = anomoly[anomoly['Growth']== 0][anomoly['Inflation']==0][anomoly['Cad']==0][anomoly['Money']==0][anomoly['Fiscal']==0][anomoly['Other']==0]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
In [0]:
Check.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 91234.000000 91234.0 91234.0 91234.0 91234.0 91234.0 91234.0
mean 112575.311989 0.0 0.0 0.0 0.0 0.0 0.0
std 63780.718361 0.0 0.0 0.0 0.0 0.0 0.0
min 0.000000 0.0 0.0 0.0 0.0 0.0 0.0
25% 57420.500000 0.0 0.0 0.0 0.0 0.0 0.0
50% 114303.000000 0.0 0.0 0.0 0.0 0.0 0.0
75% 167940.000000 0.0 0.0 0.0 0.0 0.0 0.0
max 220852.000000 0.0 0.0 0.0 0.0 0.0 0.0
In [0]:
train1 = pd.concat([Check,train,anomoly])
In [0]:
train = train1.drop_duplicates(keep=False)
In [0]:
x_train,x_test = tt(train,test_size = 0.5,random_state = 5)
In [0]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(Check['Sentence'])
    Check[category] = prediction
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
In [0]:
Check1 = Check[Check['Growth']== 0][Check['Inflation']==0][Check['Cad']==0][Check['Money']==0][Check['Fiscal']==0][Check['Other']==0]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
In [0]:
Check1.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 25095.000000 25095.0 25095.0 25095.0 25095.0 25095.0 25095.0
mean 113323.783702 0.0 0.0 0.0 0.0 0.0 0.0
std 62525.957493 0.0 0.0 0.0 0.0 0.0 0.0
min 1.000000 0.0 0.0 0.0 0.0 0.0 0.0
25% 59747.000000 0.0 0.0 0.0 0.0 0.0 0.0
50% 116830.000000 0.0 0.0 0.0 0.0 0.0 0.0
75% 167029.500000 0.0 0.0 0.0 0.0 0.0 0.0
max 220851.000000 0.0 0.0 0.0 0.0 0.0 0.0
In [0]:
train1 = pd.concat([Check,train,Check1])
train = train1.drop_duplicates(keep=False)
In [0]:
train.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 195758.000000 195758.000000 195758.000000 195758.000000 195758.000000 195758.000000 195758.000000
mean 110054.521532 0.750595 0.163104 0.068922 0.049158 0.017123 0.023197
std 63901.404852 0.432670 0.369462 0.253322 0.216198 0.129731 0.150529
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 54702.250000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 109516.500000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 165445.500000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 220852.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [0]:
train.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 195758.000000 195758.000000 195758.000000 195758.000000 195758.000000 195758.000000 195758.000000
mean 110054.521532 0.750595 0.163104 0.068922 0.049158 0.017123 0.023197
std 63901.404852 0.432670 0.369462 0.253322 0.216198 0.129731 0.150529
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 54702.250000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 109516.500000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 165445.500000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 220852.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [0]:
x_train,x_test = tt(train,test_size = 0.5,random_state = 4)
In [0]:
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(Check1['Sentence'])
    Check1[category] = prediction
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
In [0]:
Check11 = Check1[Check1['Growth']== 0][Check1['Inflation']==0][Check1['Cad']==0][Check1['Money']==0][Check1['Fiscal']==0][Check1['Other']==0]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
In [0]:
Check11.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 10104.000000 10104.0 10104.0 10104.0 10104.0 10104.0 10104.0
mean 115568.118864 0.0 0.0 0.0 0.0 0.0 0.0
std 62281.130722 0.0 0.0 0.0 0.0 0.0 0.0
min 24.000000 0.0 0.0 0.0 0.0 0.0 0.0
25% 62664.000000 0.0 0.0 0.0 0.0 0.0 0.0
50% 121571.500000 0.0 0.0 0.0 0.0 0.0 0.0
75% 168562.500000 0.0 0.0 0.0 0.0 0.0 0.0
max 220849.000000 0.0 0.0 0.0 0.0 0.0 0.0
In [0]:
train1 = pd.concat([Check11,train,Check1])
train = train1.drop_duplicates(keep=False)
In [0]:
x_train,x_test = tt(train,test_size = 0.5,random_state = 4)
In [0]:
x_train,x_test = tt(train,test_size = 0.5,random_state = 4)
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(Check11['Sentence'])
    Check11[category] = prediction
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
In [0]:
Check111 = Check11[Check11['Growth']== 0][Check11['Inflation']==0][Check11['Cad']==0][Check11['Money']==0][Check11['Fiscal']==0][Check11['Other']==0]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
In [0]:
Check111.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 6771.000000 6771.0 6771.0 6771.0 6771.0 6771.0 6771.0
mean 114612.731354 0.0 0.0 0.0 0.0 0.0 0.0
std 62597.833021 0.0 0.0 0.0 0.0 0.0 0.0
min 24.000000 0.0 0.0 0.0 0.0 0.0 0.0
25% 61470.500000 0.0 0.0 0.0 0.0 0.0 0.0
50% 120668.000000 0.0 0.0 0.0 0.0 0.0 0.0
75% 168218.500000 0.0 0.0 0.0 0.0 0.0 0.0
max 220849.000000 0.0 0.0 0.0 0.0 0.0 0.0
In [0]:
train1 = pd.concat([Check111,train,Check11])
train = train1.drop_duplicates(keep=False)
In [0]:
x_train,x_test = tt(train,test_size = 0.5,random_state = 4)
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(Check111['Sentence'])
    Check111[category] = prediction
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
In [0]:
Check1111 = Check111[Check111['Growth']== 0][Check111['Inflation']==0][Check111['Cad']==0][Check111['Money']==0][Check111['Fiscal']==0][Check111['Other']==0]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
In [0]:
Check1111.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 5475.000000 5475.0 5475.0 5475.0 5475.0 5475.0 5475.0
mean 113767.193973 0.0 0.0 0.0 0.0 0.0 0.0
std 62271.146454 0.0 0.0 0.0 0.0 0.0 0.0
min 24.000000 0.0 0.0 0.0 0.0 0.0 0.0
25% 61459.500000 0.0 0.0 0.0 0.0 0.0 0.0
50% 119135.000000 0.0 0.0 0.0 0.0 0.0 0.0
75% 166385.000000 0.0 0.0 0.0 0.0 0.0 0.0
max 220849.000000 0.0 0.0 0.0 0.0 0.0 0.0
In [0]:
train1 = pd.concat([Check1111,train,Check111])
train = train1.drop_duplicates(keep=False)
In [0]:
x_train,x_test = tt(train,test_size = 0.4,random_state = 5)
for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(Check1111['Sentence'])
    Check1111[category] = prediction
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
In [0]:
Check11111 = Check1111[Check1111['Growth']== 0][Check1111['Inflation']==0][Check1111['Cad']==0][Check1111['Money']==0][Check1111['Fiscal']==0][Check1111['Other']==0]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
In [0]:
Check11111.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 4225.000000 4225.0 4225.0 4225.0 4225.0 4225.0 4225.0
mean 113050.889941 0.0 0.0 0.0 0.0 0.0 0.0
std 61702.225308 0.0 0.0 0.0 0.0 0.0 0.0
min 24.000000 0.0 0.0 0.0 0.0 0.0 0.0
25% 61077.000000 0.0 0.0 0.0 0.0 0.0 0.0
50% 118479.000000 0.0 0.0 0.0 0.0 0.0 0.0
75% 163889.000000 0.0 0.0 0.0 0.0 0.0 0.0
max 220765.000000 0.0 0.0 0.0 0.0 0.0 0.0
In [0]:
train1 = pd.concat([Check11111,train,Check1111])
train = train1.drop_duplicates(keep=False)
In [0]:
train.to_csv("Out.csv")

checking

In [0]:
test = Check11111
In [0]:
for i in range(10):
    print(i)
    x_train,x_test = tt(train,test_size = 0.5,random_state = i)
    for category in categories:
        print('... Processing {}'.format(category))
        # train the model using X_dtm & y
        SVC_pipeline.fit(x_train['Sentence'], x_train['Growth'])
        # compute the testing accuracy
        prediction = SVC_pipeline.predict(test['Sentence'])
        test[category] = prediction
    anomoly = test[test['Growth']== 0][test['Inflation']==0][test['Cad']==0][test['Money']==0][test['Fiscal']==0][test['Other']==0]
    test = anomoly
    train1 = pd.concat([test,train,anomoly])
    train = train1.drop_duplicates(keep=False)
0
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
1
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
2
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
3
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
4
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
5
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
6
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
7
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
8
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
9
... Processing Growth
... Processing Inflation
... Processing Cad
... Processing Money
... Processing Fiscal
... Processing Other
In [0]:
train.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 219233.000000 219233.000000 219233.000000 219233.000000 219233.000000 219233.000000 219233.000000
mean 110384.700013 0.743907 0.150073 0.069337 0.050230 0.016667 0.023664
std 63759.941782 0.436475 0.357144 0.254027 0.218419 0.128021 0.152001
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 55170.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 110307.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 165615.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 220852.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [0]:
test.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 2796.000000 2796.0 2796.0 2796.0 2796.0 2796.0 2796.0
mean 113590.826538 0.0 0.0 0.0 0.0 0.0 0.0
std 62177.845272 0.0 0.0 0.0 0.0 0.0 0.0
min 84.000000 0.0 0.0 0.0 0.0 0.0 0.0
25% 61212.750000 0.0 0.0 0.0 0.0 0.0 0.0
50% 117528.500000 0.0 0.0 0.0 0.0 0.0 0.0
75% 166869.500000 0.0 0.0 0.0 0.0 0.0 0.0
max 220775.000000 0.0 0.0 0.0 0.0 0.0 0.0
In [0]:
 
In [0]:
train2 = pd.concat([df,train])
In [0]:
train2 = train2.drop_duplicates(keep=False)
In [0]:
train2.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 373513.000000 373513.000000 373513.000000 373513.000000 373513.000000 373513.000000 373513.000000
mean 109234.044371 0.389732 0.053401 0.028318 0.021341 0.005207 0.010383
std 63928.644515 0.487690 0.224832 0.165879 0.144517 0.071974 0.101364
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 53869.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 107649.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 164715.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 220852.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [0]:
train = pd.concat([train,anomoly])
In [0]:
acc = {}
for i in range(2):
  print(i)
  x_train,x_test = tt(train,test_size = 0.5,random_state = i)
  for category in categories:
      print('... Processing {}'.format(category))
      # train the model using X_dtm & y
      SVC_pipeline.fit(x_train['Sentence'], x_train['category'])
      # compute the testing accuracy

      # test[category] = prediction
  # anomoly = test[test['Growth']== 0][test['Inflation']==0][test['Cad']==0][test['Money']==0][test['Fiscal']==0][test['Other']==0]
  # test = anomoly
  # train1 = pd.concat([test,train,anomoly])
  # train = train1.drop_duplicates(keep=False)
0
1
In [0]:
acc
Out[0]:
{1: 'epoch', 'Score': 0.9539698868418591}
In [0]:
 

NLP Model

In [0]:
vect = CountVectorizer()
In [0]:
vect.fit(train['Sentence'])
counts= vect.transform(train['Sentence'])
In [0]:
tf = TfidfTransformer()
train1 = tf.fit_transform(counts)
In [0]:
train1.shape[1]
Out[0]:
43322
In [0]:
x_train,x_test,y_train,y_test=tt(train1,np.asarray(train['Growth']),test_size = 0.3,stratify = np.asarray(train['Growth']),random_state = 1)
In [0]:
modl=BernoulliNB().fit(x_train,y_train)
In [0]:
from sklearn.metrics import accuracy_score
In [0]:
acc = modl.predict(x_test)
In [0]:
accuracy_score(acc,y_test)
Out[0]:
0.8827938027459378
In [0]:
x_train,x_test,y_train,y_test=tt(train1,np.asarray(train['Inflation']),test_size = 0.3,stratify = np.asarray(train['Growth']),random_state = 1)
modl=BernoulliNB().fit(x_train,y_train)
acc = modl.predict(x_test)
accuracy_score(acc,y_test)
Out[0]:
0.9286906411386825
In [0]:
x_train,x_test,y_train,y_test=tt(train1,np.asarray(train['Cad']),test_size = 0.3,stratify = np.asarray(train['Growth']),random_state = 1)
modl=BernoulliNB().fit(x_train,y_train)
acc = modl.predict(x_test)
accuracy_score(acc,y_test)
Out[0]:
0.944435697191082
In [0]:
x_train,x_test,y_train,y_test=tt(train1,np.asarray(train['Money']),test_size = 0.3,stratify = np.asarray(train['Growth']),random_state = 1)
modl=BernoulliNB().fit(x_train,y_train)
acc = modl.predict(x_test)
accuracy_score(acc,y_test)
Out[0]:
0.9635816853507998
In [0]:
x_train,x_test,y_train,y_test=tt(train1,np.asarray(train['Fiscal']),test_size = 0.3,stratify = np.asarray(train['Growth']),random_state = 1)
modl=BernoulliNB().fit(x_train,y_train)
acc = modl.predict(x_test)
accuracy_score(acc,y_test)
Out[0]:
0.9852311374228492

Test dataset

In [0]:
vect1 = CountVectorizer()
vect1.fit(test['Sentence'])
counts= vect.transform(test['Sentence'])
tf1 = TfidfTransformer()
test1 = tf1.fit_transform(counts)

Growth Classification

In [0]:
modl=BernoulliNB().fit(train1,train['Growth'])
y = modl.predict(test1)
test['Growth'] = y

Inflation Classification

In [0]:
modl=BernoulliNB().fit(train1,train['Inflation'])
y = modl.predict(test1)
test['Inflation'] = y

Cad

In [0]:
modl=BernoulliNB().fit(train1,train['Cad'])
y = modl.predict(test1)
test['Cad'] = y

Money

In [0]:
modl=BernoulliNB().fit(train1,train['Money'])
y = modl.predict(test1)
test['Money'] = y

Fiscal

In [0]:
modl=BernoulliNB().fit(train1,train['Fiscal'])
y = modl.predict(test1)
test['Fiscal'] = y

Other

In [0]:
modl=BernoulliNB().fit(train1,train['Other'])
y = modl.predict(test1)
test['Other'] = y
In [0]:
anomoly = test[test['Growth']== 0][test['Inflation']==0][test['Cad']==0][test['Money']==0][test['Fiscal']==0][test['Other']==0]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  """Entry point for launching an IPython kernel.
In [0]:
t = pd.concat([anomoly,test])
In [0]:
test = t.drop_duplicates(keep=False)
In [0]:
test.describe()
Out[0]:
Unnamed: 0 Growth Inflation Cad Money Fiscal Other
count 179788.000000 179788.000000 179788.000000 179788.000000 179788.000000 179788.000000 179788.000000
mean 109290.296388 0.836074 0.100212 0.086813 0.049469 0.004555 0.017409
std 64015.937259 0.370210 0.300284 0.281562 0.216847 0.067340 0.130791
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 53842.750000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 107597.500000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 164891.250000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 220852.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
In [0]:
vect1 = CountVectorizer()
vect1.fit(anomoly['Sentence'])
counts= vect.transform(anomoly['Sentence'])
tf1 = TfidfTransformer()
anomoly1 = tf1.fit_transform(counts)
In [0]:
modl=BernoulliNB().fit(train1,train['Growth'])Other = anomoly[anomoly['Growth']== 0][anomoly['Inflation']==0][anomoly['Cad']==0][anomoly['Money']==0][anomoly['Fiscal']==0][anomoly['Other']==0]
anomoly['Growth'] = modl.predict(anomoly1)
  File "<ipython-input-353-25eaa21c8070>", line 1
    modl=BernoulliNB().fit(train1,train['Growth'])Other = anomoly[anomoly['Growth']== 0][anomoly['Inflation']==0][anomoly['Cad']==0][anomoly['Money']==0][anomoly['Fiscal']==0][anomoly['Other']==0]
                                                      ^
SyntaxError: invalid syntax
In [0]:
 
In [0]:
 
In [0]:
modl=BernoulliNB().fit(train1,train['Inflation'])
anomoly['Inflation'] = modl.predict(anomoly1)
In [0]:
modl=BernoulliNB().fit(train1,train['Cad'])
anomoly['Cad'] = modl.predict(anomoly1)
In [0]:
modl=BernoulliNB().fit(train1,train['Money'])
anomoly['Money'] = modl.predict(anomoly1)
In [0]:
modl=BernoulliNB().fit(train1,train['Fiscal'])
anomoly['Fiscal'] = modl.predict(anomoly1)
In [0]:
modl=BernoulliNB().fit(train1,train['Other'])
anomoly['Other'] = modl.predict(anomoly1)
In [0]:
Other = anomoly[anomoly['Growth']== 0][anomoly['Inflation']==0][anomoly['Cad']==0][anomoly['Money']==0][anomoly['Fiscal']==0][anomoly['Other']==0]
In [0]:
Other

References

In [0]:
class NeuralNet():
	def __init__(self,input):
		np.random.seed(1)
		self.weights = 2 * np.random.random(input) - 1
		print("The weights are ",self.weights)
	def sigmoid(self,a):
		return 1/(1 + np.exp(-a))	

	def der_sigmoid(self,a):
		return a * (1 - a)	

	def think(self,ip):
		ip = ip.astype(float)
		output = self.sigmoid(np.dot(ip,self.weights))
		return output

	def training(self,input_layer,T_outputs,iterations):
		for i in range(iterations):	
			output = self.think(input_layer)
			error = T_outputs - output
			adjust = error*self.der_sigmoid(output)
			self.weights += np.dot(input_layer.T,adjust)
In [0]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']
news_train = fetch_20newsgroups(subset = 'train', categories= categories)
news_test = fetch_20newsgroups(subset = 'test',categories= categories)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([('vect', TfidfVectorizer()), 
                      ('clf', MultinomialNB()) ])

# train the model
text_clf.fit(news_train.data, news_train.target)
# Predict the test cases
predicted = text_clf.predict(news_test.data)

from sklearn import metrics
from sklearn.metrics import accuracy_score
import numpy as np

print('Accuracy achieved is ' + str(np.mean(predicted == news_test.target)))
print(metrics.classification_report(news_test.target, predicted, target_names=news_test.target_names)),
metrics.confusion_matrix(news_test.target, predicted)
In [0]:
news_train.target