# Required Packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Activation, Flatten, Dense
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import plot_model
import math
import time
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import re
import os
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.python.client import device_lib
# Model saving
import bz2
import pickle
import _pickle as cPickle
# Pickles "data" into a file named "title" plus the .pickle extension
def full_pickle(title, data):
    """Pickle *data* into the file ``title + '.pickle'``.

    Args:
        title: Path of the output file without the ``.pickle`` extension.
        data: Any picklable object.
    """
    # The context manager closes the handle even when pickle.dump raises
    # (e.g. on an unpicklable object), which the manual open/close did not.
    with open(title + '.pickle', 'wb') as handle:
        pickle.dump(data, handle)
def loosen(file):
    """Load and return the object stored in the pickle file at path *file*.

    Args:
        file: Path of a pickle file (the extension is not added here).

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle even when pickle.load raises.
    with open(file, 'rb') as handle:
        return pickle.load(handle)
# Pickle an object and compress it with bz2 into a file with the .pbz2 extension
def compressed_pickle(title, data):
    """Pickle *data* into the bz2-compressed file ``title + '.pbz2'``."""
    target = title + '.pbz2'
    with bz2.BZ2File(target, 'w') as archive:
        cPickle.dump(data, archive)
def decompress_pickle(file):
    """Load and return the object stored in the bz2-compressed pickle at *file*.

    Args:
        file: Path of the compressed pickle (the extension is not added here).

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The original never closed the BZ2File; the context manager releases it
    # even when loading fails.
    with bz2.BZ2File(file, 'rb') as archive:
        return cPickle.load(archive)
# Query TensorFlow for the devices available on this machine. The returned
# list is not stored; presumably this is a leftover notebook cell whose
# output was inspected interactively.
device_lib.list_local_devices()
# Fake News Code
class fakeNews:
    """Character-level encoding and 1-D CNN classifier for labelled news CSVs.

    Typical workflow: ``make_fakeNews`` encodes the CSV text as padded integer
    sequences, ``three_way_split`` cuts that encoding into train / validation /
    test parts, and ``makeClassifier`` builds the Keras model.
    """

    def __init__(self,directory='/mnt/c/Users/thecu/Documents/R Projects/4270/charLevelCNN/',lowercase=True):
        """Store the data location and choose the character alphabet.

        Args:
            directory: Folder that contains a ``data`` sub-folder with the CSVs.
            lowercase: When truthy the alphabet holds lowercase letters only;
                otherwise uppercase letters are included as well.
        """
        self.lowercase = lowercase
        self.directory = directory
        if lowercase:
            self.alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
        else:
            self.alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"

    def _load_articles(self):
        """Read every file under ``<directory>/data`` as CSV, adding a label column ``y``."""
        articles = []
        for dirname, _, filenames in os.walk(os.path.join(self.directory, 'data')):
            for filename in filenames:
                # Load directly instead of exec()-building a variable per file:
                # exec broke on file names that are not valid identifiers and
                # on paths containing a quote character.
                frame = pd.read_csv(os.path.join(dirname, filename))
                # The file named "Fake" (once ".csv" is stripped) is class 1;
                # every other file is class 0.
                label = 1 if re.sub(r'\.csv', '', filename) == 'Fake' else 0
                frame.insert(frame.shape[1], 'y', [label] * frame.shape[0])
                articles.append(frame)
        return articles

    def make_fakeNews(self, balance=True, content = False, pad = 1.25):
        """Encode the CSV text as padded sequences of character indices.

        Args:
            balance: When truthy, every file is truncated to the same number of
                leading rows: the smallest file size rounded down to a multiple
                of 100.
            content: When truthy, the second column (treated as article bodies)
                is encoded too and appended after the encoded first column
                (treated as titles).
            pad: Multiplier applied to the longest text length to obtain the
                padded sequence length.

        Returns:
            Tuple ``(X, y, word_index)``: the int32 matrix of character indices
            (0 is padding), the label array, and the character-to-index dict
            including the ``'UNK'`` entry.

        Raises:
            ValueError: If no files are found under ``<directory>/data``.
        """
        self.pad = pad
        self.balance = balance
        self.content = content

        articles = self._load_articles()
        if not articles:
            raise ValueError(
                "no data files found under " + os.path.join(self.directory, 'data'))

        if self.balance:
            class_sizes = np.array([frame.shape[0] for frame in articles])
            # Smallest class, rounded down to a multiple of 100.
            class_size = int(np.min(100 * np.floor(class_sizes / 100)))
            articles = [frame.iloc[:class_size, :] for frame in articles]
        articles_all = pd.concat(articles, axis=0, ignore_index=True)

        train_titles = articles_all.iloc[:, 0]
        train_bodies = articles_all.iloc[:, 1]
        all_y = articles_all['y'].values

        # Tokenizer lowercases text by default, which made the uppercase half
        # of the case-sensitive alphabet unreachable; tie it to self.lowercase.
        tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK',
                       lower=bool(self.lowercase))
        tk.fit_on_texts(train_titles)

        # Replace the fitted vocabulary with the fixed alphabet (indices start
        # at 1) and give 'UNK' the next free index.
        char_dict = {char: position + 1 for position, char in enumerate(self.alphabet)}
        tk.word_index = char_dict.copy()
        tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

        title_sequences = tk.texts_to_sequences(train_titles)
        max_ttl = int(math.ceil(self.pad * max(len(text) for text in train_titles.values)))
        coded_titles = pad_sequences(title_sequences, maxlen=max_ttl, padding='post')

        if self.content:
            content_sequences = tk.texts_to_sequences(train_bodies)
            max_cnt = int(math.ceil(self.pad * max(len(text) for text in train_bodies.values)))
            coded_bodies = pad_sequences(content_sequences, maxlen=max_cnt, padding='post')
            return (np.concatenate([coded_titles, coded_bodies], axis=1).astype('int32'),
                    all_y, tk.word_index)
        return (coded_titles.astype('int32'), all_y, tk.word_index)

    def three_way_split(self,make_Fake, split='default'):
        """Split encoded data into shuffled train, validation and test parts.

        The rows are treated as two equally sized halves and the same
        proportions are taken from each half. This presumably relies on the
        balanced two-class layout produced by ``make_fakeNews`` (one class per
        half); confirm before using it on other data.

        Args:
            make_Fake: Sequence whose first two items are the feature matrix
                and the label array, e.g. the result of ``make_fakeNews``.
            split: ``'default'`` for ``(7, 2, 1)``, or a ``(train, valid, test)``
                triple. Only the valid and test entries are used; they are
                interpreted as tenths of each half, so the triple should sum
                to 10.

        Returns:
            ``[train, valid, test]``, each a list ``[X_part, y_part]``.
        """
        self.X = make_Fake[0]
        self.y = make_Fake[1]
        self.split = split
        tot_len = self.y.shape[0]

        is_default = isinstance(self.split, str) and self.split == 'default'
        tts = (7, 2, 1) if is_default else self.split

        # Offsets inside each half: [0, tsp) test, [tsp, vsp) validation and
        # the remainder training.
        tsp = int(tts[2] * tot_len / 20)
        vsp = int((tts[1] + tts[2]) * tot_len / 20)
        hlf = int(tot_len / 2)

        def both_halves(first, second):
            # Stack the same relative slice taken from each half.
            return (np.concatenate([self.X[first, :], self.X[second, :]], axis=0),
                    np.concatenate([self.y[first], self.y[second]], axis=0))

        test_tp = both_halves(slice(None, tsp), slice(hlf, hlf + tsp))
        valid_tp = both_halves(slice(tsp, vsp), slice(hlf + tsp, hlf + vsp))
        train_tp = both_halves(slice(vsp, hlf), slice(hlf + vsp, None))

        shuffled = []
        for features, labels in (train_tp, valid_tp, test_tp):
            order = np.arange(labels.shape[0])
            # Fixed seed keeps the shuffle reproducible. Note that this reseeds
            # NumPy's global random state as a side effect.
            np.random.seed(5318008)
            np.random.shuffle(order)
            shuffled.append([features[order], labels[order]])
        return shuffled

    def makeClassifier(self,make_Fake, fully_connected_layers = None, conv_layers = None, dropout = 0.5, optimizer = 'adam', loss='binary_crossentropy'):
        """Build and compile the character-level convolutional classifier.

        Args:
            make_Fake: Sequence ``(X, y, word_index)`` as returned by
                ``make_fakeNews``.
            fully_connected_layers: Sizes of the dense layers; defaults to
                ``[1024, 1024]``.
            conv_layers: List of ``[filters, kernel_size, pool_size]`` triples,
                where a pool size of ``-1`` means no pooling after that layer.
                Defaults to six 256-filter layers.
            dropout: Dropout rate applied after every dense layer.
            optimizer: Optimizer passed to ``model.compile``.
            loss: Loss passed to ``model.compile``.

        Returns:
            The compiled Keras ``Model`` with a single sigmoid output.
        """
        if fully_connected_layers is None:
            fully_connected_layers = [1024, 1024]
        if conv_layers is None:
            conv_layers = [[256, 7, 3],
                           [256, 7, 3],
                           [256, 3, -1],
                           [256, 3, -1],
                           [256, 3, -1],
                           [256, 3, 3]]

        # Record the architecture actually built (defaults resolved).
        self.fully_connected_layers = fully_connected_layers
        self.conv_layers = conv_layers
        self.dropout_p = dropout
        self.optimizer = optimizer
        self.loss = loss
        self.X = make_Fake[0]
        self.y = make_Fake[1]
        self.wind = make_Fake[2]

        input_size = self.X.shape[1]
        vocab_size = len(self.alphabet) + 1  # alphabet plus the 'UNK' token
        embedding_size = len(self.alphabet) + 1

        # One-hot embedding table: row 0 (padding) stays all zeros and the row
        # of character index i has a single 1 in column i - 1. Rows are placed
        # by index, not by dict iteration order.
        embedding_weights = np.zeros((vocab_size + 1, vocab_size))
        for index in self.wind.values():
            embedding_weights[index, index - 1] = 1
        print('Load')

        # Embedding layer initialised with the one-hot table
        embedding_layer = Embedding(vocab_size + 1,
                                    embedding_size,
                                    input_length=input_size,
                                    weights=[embedding_weights])

        # Model construction
        inputs = Input(shape=(input_size,), name='input', dtype='int64')  # shape=(?, padded input size)
        x = embedding_layer(inputs)

        # Convolution stack, with optional max pooling after each layer
        for filter_num, filter_size, pooling_size in conv_layers:
            x = Conv1D(filter_num, filter_size)(x)
            x = Activation('relu')(x)
            if pooling_size != -1:
                x = MaxPooling1D(pool_size=pooling_size)(x)
        x = Flatten()(x)

        # Fully connected layers, each followed by dropout
        for dense_size in fully_connected_layers:
            x = Dense(dense_size, activation='relu')(x)
            x = Dropout(self.dropout_p)(x)

        # Output layer: a single sigmoid unit for the binary label
        predictions = Dense(1, activation='sigmoid')(x)

        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=optimizer, loss=loss,
                      metrics=['accuracy'])
        model.summary()
        return model
        #plot_model(model, to_file=self.directory + 'model_plot.png', show_shapes=True, show_layer_names=True)