import nltk
nltk.download('stopwords')
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.multiclass import OneVsRestClassifier
# English stop words for the TF-IDF vectorisers used throughout this script.
stop_words = set(stopwords.words('english'))
from collections import OrderedDict  # NOTE(review): unused in the visible file; kept for safety

# Load the labelled training data; latin-1 tolerates non-UTF-8 bytes in the CSV.
df = pd.read_csv('Training_Data.csv', encoding="ISO-8859-1")

# Rows with all six topic labels equal to zero are treated as "unlabelled".
# BUG FIX: the original used chained boolean indexing (df[m1][m2]...), which
# is fragile and warning-prone in pandas; a single combined mask is equivalent.
unlabelled_mask = (
    (df['Growth'] == 0) & (df['Inflation'] == 0) & (df['Cad'] == 0)
    & (df['Money'] == 0) & (df['Fiscal'] == 0) & (df['Other'] == 0)
)
test = df[unlabelled_mask]
test.describe()

# Labelled rows = everything except the unlabelled ones: concatenating the
# unlabelled subset with the full frame and dropping ALL duplicates
# (keep=False) keeps only the rows that appear exactly once, i.e. labelled.
df1 = pd.concat([test, df])
train = df1.drop_duplicates(keep=False)
train.describe()
train['Sentence']
# Multi-label Naive Bayes baseline: one independent binary MultinomialNB per
# topic (one-vs-rest) on top of TF-IDF features with English stop words removed.
NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
])

# The six binary topic labels fitted independently below.
categories = ['Growth', 'Inflation', 'Cad', 'Money', 'Fiscal', 'Other']

# Hold out half of the labelled pool for accuracy estimates (fixed seed).
x_train, x_test = tt(train, test_size=0.5, random_state=1)

for category in categories:
    print('... Processing {}'.format(category))
    # Fit the whole pipeline on raw sentences against this one label.
    NB_pipeline.fit(x_train['Sentence'], x_train[category])
    # Report held-out accuracy for this label.
    prediction = NB_pipeline.predict(x_test['Sentence'])
    print('Test accuracy is {}'.format(accuracy_score(x_test[category], prediction)))
# Same one-vs-rest setup, swapping the estimator for a linear SVM.
SVC_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])

for category in categories:
    print('... Processing {}'.format(category))
    # One binary LinearSVC per topic, trained on raw sentences.
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    # Held-out accuracy for this label.
    prediction = SVC_pipeline.predict(x_test['Sentence'])
    print('Test accuracy is {}'.format(accuracy_score(x_test[category], prediction)))
# Third baseline: logistic regression (SAG solver) in the same one-vs-rest frame.
LogReg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
])

for category in categories:
    print('... Processing {}'.format(category))
    # One binary logistic model per topic.
    LogReg_pipeline.fit(x_train['Sentence'], x_train[category])
    # Held-out accuracy for this label.
    prediction = LogReg_pipeline.predict(x_test['Sentence'])
    print('Test accuracy is {}'.format(accuracy_score(x_test[category], prediction)))
# Fresh split (seed 9); pseudo-label the unlabelled pool `test` with LinearSVC.
x_train, x_test = tt(train, test_size=0.5, random_state=9)
for category in categories:
    print('... Processing {}'.format(category))
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    # NOTE(review): `test` is a slice of `df`, so these assignments trigger
    # SettingWithCopyWarning; behaviour is kept identical to the original.
    prediction = SVC_pipeline.predict(test['Sentence'])
    test[category] = prediction

# Rows left with no predicted topic at all are "anomalies".
# BUG FIX: fragile chained indexing replaced by one combined boolean mask.
anomoly_mask = (
    (test['Growth'] == 0) & (test['Inflation'] == 0) & (test['Cad'] == 0)
    & (test['Money'] == 0) & (test['Fiscal'] == 0) & (test['Other'] == 0)
)
anomoly = test[anomoly_mask]
anomoly.describe()
# NOTE(review): this split intentionally uses the *old* `train` — the pool is
# only updated by the concat/dedupe below, matching the original's order.
x_train, x_test = tt(train, test_size=0.5, random_state=8)

# Fold pseudo-labelled rows into the pool; rows present in both `test`/`anomoly`
# and `train` are removed entirely by keep=False.
train1 = pd.concat([test, train, anomoly])
train = train1.drop_duplicates(keep=False)
train.describe()

# Second pass: logistic regression re-labels the still-unlabelled anomalies.
for category in categories:
    print('... Processing {}'.format(category))
    LogReg_pipeline.fit(x_train['Sentence'], x_train[category])
    prediction = LogReg_pipeline.predict(anomoly['Sentence'])
    anomoly[category] = prediction

# Rows that remain all-zero after this pass.
# BUG FIX: chained boolean indexing replaced by a single combined mask.
check_mask = (
    (anomoly['Growth'] == 0) & (anomoly['Inflation'] == 0) & (anomoly['Cad'] == 0)
    & (anomoly['Money'] == 0) & (anomoly['Fiscal'] == 0) & (anomoly['Other'] == 0)
)
Check = anomoly[check_mask]
Check.describe()
# Remove the still-unlabelled Check rows from the pool (keep=False drops both
# copies of any row appearing in Check/anomoly and train).
train1 = pd.concat([Check, train, anomoly])
train = train1.drop_duplicates(keep=False)

# Third pass: LinearSVC re-labels the Check rows.
x_train, x_test = tt(train, test_size=0.5, random_state=5)
for category in categories:
    print('... Processing {}'.format(category))
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    prediction = SVC_pipeline.predict(Check['Sentence'])
    Check[category] = prediction

# BUG FIX: chained boolean indexing replaced by a single combined mask.
check1_mask = (
    (Check['Growth'] == 0) & (Check['Inflation'] == 0) & (Check['Cad'] == 0)
    & (Check['Money'] == 0) & (Check['Fiscal'] == 0) & (Check['Other'] == 0)
)
Check1 = Check[check1_mask]
Check1.describe()

train1 = pd.concat([Check, train, Check1])
train = train1.drop_duplicates(keep=False)
train.describe()  # (the original called describe() twice; one no-op removed)
# Fourth pass: re-label Check1 with LinearSVC trained on the refreshed pool.
x_train, x_test = tt(train, test_size=0.5, random_state=4)
for category in categories:
    print('... Processing {}'.format(category))
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    prediction = SVC_pipeline.predict(Check1['Sentence'])
    Check1[category] = prediction

# BUG FIX: chained boolean indexing replaced by a single combined mask.
check11_mask = (
    (Check1['Growth'] == 0) & (Check1['Inflation'] == 0) & (Check1['Cad'] == 0)
    & (Check1['Money'] == 0) & (Check1['Fiscal'] == 0) & (Check1['Other'] == 0)
)
Check11 = Check1[check11_mask]
Check11.describe()

train1 = pd.concat([Check11, train, Check1])
train = train1.drop_duplicates(keep=False)
# (the original performed this identical split twice; one call removed)
x_train, x_test = tt(train, test_size=0.5, random_state=4)
# Fifth pass: re-label Check11 with LinearSVC.
for category in categories:
    print('... Processing {}'.format(category))
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    prediction = SVC_pipeline.predict(Check11['Sentence'])
    Check11[category] = prediction

# BUG FIX: chained boolean indexing replaced by a single combined mask.
check111_mask = (
    (Check11['Growth'] == 0) & (Check11['Inflation'] == 0) & (Check11['Cad'] == 0)
    & (Check11['Money'] == 0) & (Check11['Fiscal'] == 0) & (Check11['Other'] == 0)
)
Check111 = Check11[check111_mask]
Check111.describe()

train1 = pd.concat([Check111, train, Check11])
train = train1.drop_duplicates(keep=False)
x_train, x_test = tt(train, test_size=0.5, random_state=4)
# Sixth pass: re-label Check111 with LinearSVC.
for category in categories:
    print('... Processing {}'.format(category))
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    prediction = SVC_pipeline.predict(Check111['Sentence'])
    Check111[category] = prediction

# BUG FIX: chained boolean indexing replaced by a single combined mask.
check1111_mask = (
    (Check111['Growth'] == 0) & (Check111['Inflation'] == 0) & (Check111['Cad'] == 0)
    & (Check111['Money'] == 0) & (Check111['Fiscal'] == 0) & (Check111['Other'] == 0)
)
Check1111 = Check111[check1111_mask]
Check1111.describe()

train1 = pd.concat([Check1111, train, Check111])
train = train1.drop_duplicates(keep=False)
# Seventh pass (60/40 split this time): re-label Check1111 with LinearSVC.
x_train, x_test = tt(train, test_size=0.4, random_state=5)
for category in categories:
    print('... Processing {}'.format(category))
    SVC_pipeline.fit(x_train['Sentence'], x_train[category])
    prediction = SVC_pipeline.predict(Check1111['Sentence'])
    Check1111[category] = prediction

# BUG FIX: chained boolean indexing replaced by a single combined mask.
check11111_mask = (
    (Check1111['Growth'] == 0) & (Check1111['Inflation'] == 0) & (Check1111['Cad'] == 0)
    & (Check1111['Money'] == 0) & (Check1111['Fiscal'] == 0) & (Check1111['Other'] == 0)
)
Check11111 = Check1111[check11111_mask]
Check11111.describe()

train1 = pd.concat([Check11111, train, Check1111])
train = train1.drop_duplicates(keep=False)

# Persist the pseudo-labelled pool and carry the residual rows forward.
train.to_csv("Out.csv")
test = Check11111
# Ten more self-training rounds, each with a different split seed.
# NOTE(review): the original file lost its indentation; the loop extent below
# (through the dedupe) is inferred from the per-round self-training pattern.
for i in range(10):
    print(i)
    x_train, x_test = tt(train, test_size=0.5, random_state=i)
    for category in categories:
        print('... Processing {}'.format(category))
        # BUG FIX: the original fitted on x_train['Growth'] for *every*
        # category, so all six labels were predicted by a Growth classifier.
        SVC_pipeline.fit(x_train['Sentence'], x_train[category])
        prediction = SVC_pipeline.predict(test['Sentence'])
        test[category] = prediction
    # BUG FIX: chained boolean indexing replaced by a single combined mask.
    anomoly_mask = (
        (test['Growth'] == 0) & (test['Inflation'] == 0) & (test['Cad'] == 0)
        & (test['Money'] == 0) & (test['Fiscal'] == 0) & (test['Other'] == 0)
    )
    anomoly = test[anomoly_mask]
    test = anomoly
    # `test` and `anomoly` are the same frame here, so each anomalous row
    # appears at least twice and keep=False drops it from the pool.
    train1 = pd.concat([test, train, anomoly])
    train = train1.drop_duplicates(keep=False)

train.describe()
test.describe()

# Rows of the original data set not currently in the training pool.
train2 = pd.concat([df, train])
train2 = train2.drop_duplicates(keep=False)
train2.describe()

# Put the remaining anomalies back into the pool.
train = pd.concat([train, anomoly])
# Placeholder for per-seed accuracies (never actually filled in the original).
acc = {}
for i in range(2):
    print(i)
    x_train, x_test = tt(train, test_size=0.5, random_state=i)
    for category in categories:
        print('... Processing {}'.format(category))
        # BUG FIX: the original indexed the literal column name 'category'
        # (x_train['category']), which raises KeyError; use the loop variable.
        SVC_pipeline.fit(x_train['Sentence'], x_train[category])
        # The pseudo-labelling steps that followed here were already
        # commented out in the original and are omitted.
acc
# Bag-of-words counts -> TF-IDF matrix for the (pseudo-)labelled pool.
# `vect` and `tf` are reused later to project new sentences into this space.
vect = CountVectorizer()
vect.fit(train['Sentence'])
counts = vect.transform(train['Sentence'])
tf = TfidfTransformer()
train1 = tf.fit_transform(counts)
train1.shape[1]

# Held-out accuracy of one BernoulliNB per label (70/30 stratified split).
# BUG FIX: the original stratified every split on train['Growth'] even when
# the target was Inflation/Cad/Money/Fiscal (copy-paste error); stratify on
# the actual target instead. ('Other' was not evaluated in the original.)
for label in ['Growth', 'Inflation', 'Cad', 'Money', 'Fiscal']:
    y = np.asarray(train[label])
    x_train, x_test, y_train, y_test = tt(
        train1, y, test_size=0.3, stratify=y, random_state=1)
    modl = BernoulliNB().fit(x_train, y_train)
    acc = modl.predict(x_test)
    accuracy_score(acc, y_test)
# Project the unlabelled sentences into the *training* feature space.
# BUG FIX: the original fitted a fresh CountVectorizer on the test sentences
# (vect1) but then transformed with `vect` anyway (dead code), and fitted the
# IDF weights on the test counts (tf1.fit_transform) — a train/test feature
# mismatch. Use the already-fitted `vect` and `tf` so vocabulary and IDF
# weights line up with the models trained on `train1`.
counts = vect.transform(test['Sentence'])
test1 = tf.transform(counts)

# One BernoulliNB per topic, trained on the full pool, labels the test rows.
# (The bare "Growth Classification", "Inflation Classification", ... lines in
# the original were markdown-cell remnants — SyntaxErrors in plain Python —
# replaced here by a loop over the same six labels in the same order.)
for label in categories:
    modl = BernoulliNB().fit(train1, train[label])
    y = modl.predict(test1)
    test[label] = y

# Rows the models still leave with no topic at all.
# BUG FIX: chained boolean indexing replaced by a single combined mask.
anomoly_mask = (
    (test['Growth'] == 0) & (test['Inflation'] == 0) & (test['Cad'] == 0)
    & (test['Money'] == 0) & (test['Fiscal'] == 0) & (test['Other'] == 0)
)
anomoly = test[anomoly_mask]
t = pd.concat([anomoly, test])
test = t.drop_duplicates(keep=False)
test.describe()
# Project the remaining anomalies into the training feature space.
# BUG FIX (as above): use the fitted `vect`/`tf` rather than fitting a fresh
# vectoriser/IDF on the anomaly sentences.
counts = vect.transform(anomoly['Sentence'])
anomoly1 = tf.transform(counts)

# Re-label the anomalies with per-topic BernoulliNB models.
# (The original line "modl=BernoulliNB().fit(train1,train['Growth'])Other = ..."
# had two statements fused together — a SyntaxError — untangled here; the
# stray `Other = ...` fragment was a duplicate of the final selection below.)
for label in categories:
    modl = BernoulliNB().fit(train1, train[label])
    anomoly[label] = modl.predict(anomoly1)

# Anything still all-zero matches no topic at all.
# BUG FIX: chained boolean indexing replaced by a single combined mask.
other_mask = (
    (anomoly['Growth'] == 0) & (anomoly['Inflation'] == 0) & (anomoly['Cad'] == 0)
    & (anomoly['Money'] == 0) & (anomoly['Fiscal'] == 0) & (anomoly['Other'] == 0)
)
Other = anomoly[other_mask]
Other
class NeuralNet():
    """A single-layer perceptron with a sigmoid activation.

    Weights are initialised deterministically (NumPy seed 1) in [-1, 1).
    """

    def __init__(self, input):
        # Fixed seed makes the random initial weights reproducible.
        np.random.seed(1)
        self.weights = 2 * np.random.random(input) - 1
        print("The weights are ", self.weights)

    def sigmoid(self, a):
        """Logistic function: squash *a* into the open interval (0, 1)."""
        return 1 / (1 + np.exp(-a))

    def der_sigmoid(self, a):
        """Sigmoid derivative, expressed in terms of the sigmoid's output *a*."""
        return a * (1 - a)

    def think(self, ip):
        """Forward pass: weighted sum of the (float-cast) inputs through the sigmoid."""
        return self.sigmoid(np.dot(ip.astype(float), self.weights))

    def training(self, input_layer, T_outputs, iterations):
        """Run *iterations* full-batch weight updates toward *T_outputs*."""
        for _ in range(iterations):
            prediction = self.think(input_layer)
            # Error-weighted gradient of the sigmoid drives the update.
            delta = (T_outputs - prediction) * self.der_sigmoid(prediction)
            self.weights += np.dot(input_layer.T, delta)
# --- Separate sanity-check experiment: 20-newsgroups text classification ---
from sklearn.datasets import fetch_20newsgroups

# Restrict the benchmark to four topics (note: rebinds `categories`).
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
news_train = fetch_20newsgroups(subset='train', categories=categories)
news_test = fetch_20newsgroups(subset='test', categories=categories)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# TF-IDF features feeding a multinomial Naive Bayes classifier.
text_clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(news_train.data, news_train.target)

# Score the held-out split.
predicted = text_clf.predict(news_test.data)

from sklearn import metrics
from sklearn.metrics import accuracy_score
import numpy as np

print('Accuracy achieved is ' + str(np.mean(predicted == news_test.target)))
print(metrics.classification_report(news_test.target, predicted,
                                    target_names=news_test.target_names))
metrics.confusion_matrix(news_test.target, predicted)
news_train.target