Appendix A: Code

Required Packages

import os
import re
import math
import time
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Activation, Flatten, Dense
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import plot_model
from tensorflow.python.client import device_lib
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Model saving
import bz2
import pickle
import _pickle as cPickle
# Saves the "data" with the "title" and adds the .pickle
def full_pickle(title, data):
    pikd = open(title + '.pickle', 'wb')
    pickle.dump(data, pikd)
    pikd.close()
# Loads and returns the data stored in a .pickle file
def loosen(file):
    pikd = open(file, 'rb')
    data = pickle.load(pikd)
    pikd.close()
    return data
# Pickles the "data" and compresses it into a file with the .pbz2 extension
def compressed_pickle(title, data):
    with bz2.BZ2File(title + '.pbz2', 'w') as f:
        cPickle.dump(data, f)
# Loads and returns the data stored in a compressed .pbz2 file
def decompress_pickle(file):
    data = bz2.BZ2File(file, 'rb')
    data = cPickle.load(data)
    return data
device_lib.list_local_devices()
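
The pickling helpers above back the "Model saving" step. As a minimal round-trip sketch (the object and file names here are only placeholders, not part of the pipeline above):

# Hypothetical usage of the helpers above; 'history' stands in for any picklable object.
history = {'loss': [0.7, 0.4], 'accuracy': [0.55, 0.81]}
full_pickle('history_backup', history)                  # writes history_backup.pickle
restored = loosen('history_backup.pickle')
compressed_pickle('history_backup', history)            # writes history_backup.pbz2
restored_c = decompress_pickle('history_backup.pbz2')
assert restored == restored_c == history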

Fake News Code

class fakeNews:
    def __init__(self,directory='/mnt/c/Users/thecu/Documents/R Projects/4270/charLevelCNN/',lowercase=True):
        self.lowercase = lowercase
        self.directory = directory
        if lowercase == True:
            self.alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
        else:
            self.alphabet = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                             "abcdefghijklmnopqrstuvwxyz"
                             "0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}")
    def make_fakeNews(self, balance=True, content=False, pad=1.25):
        self.pad = pad
        self.balance = balance
        self.content = content
        articles = []
        for dirname, _, filenames in os.walk(self.directory + 'data'):
            for filename in filenames:
                mypath = os.path.join(dirname, filename)
                name = re.sub(r'\.csv', '', filename + '_all')
                # Read each CSV into a data frame named after its file (e.g. Fake_all)
                exec("%s = pd.read_csv('%s')" % (name, mypath))
                # Label fake articles 1 and real articles 0
                if name == 'Fake_all':
                    booler = 1
                else:
                    booler = 0
                exec("%s.insert(%s.shape[1],'y',[%d]*%s.shape[0])" % (name, name, booler, name))
                exec("articles.append(%s)" % (name,))
        if self.balance == True:
            clsLen = []
            for df in articles:
                clsLen.append(df.shape[0])
            classSize = int(np.min(100*np.floor(np.array(clsLen)/100)))
            articles_balanced = []
            for df in articles:
                articles_balanced.append(df.iloc[:classSize,:])
            Articles_bal = pd.concat(articles_balanced, axis=0, ignore_index=True)
            Articles_all = Articles_bal
        else:
            Articles_ubl = pd.concat(articles, axis=0, ignore_index=True)
            Articles_all = Articles_ubl
        tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
        train_titles = Articles_all.iloc[:,0]
        train_bodies = Articles_all.iloc[:,1]
        all_y = Articles_all['y'].values
        tk.fit_on_texts(train_titles)
        char_dict = {}
        for i, char in enumerate(self.alphabet):
            char_dict[char] = i + 1

        # Use char_dict to replace the tk.word_index
        tk.word_index = char_dict.copy()
        # Add 'UNK' to the vocabulary
        tk.word_index[tk.oov_token] = max(char_dict.values()) + 1
        # Convert string to index
        title_sequences = tk.texts_to_sequences(train_titles)
        max_ttl = int(math.ceil(self.pad*max([len(x) for x in Articles_all.iloc[:,0].values])))
        if self.content == True:
            content_sequences = tk.texts_to_sequences(train_bodies)
            max_cnt = int(math.ceil(self.pad*max([len(x) for x in Articles_all.iloc[:,1].values])))
            coded_titles = pad_sequences(title_sequences, maxlen=max_ttl, padding='post')
            coded_bodies = pad_sequences(content_sequences, maxlen=max_cnt, padding='post')
            return (np.concatenate([coded_titles, coded_bodies], axis=1).astype('int32'),
                    all_y, tk.word_index)
        else:
            return (pad_sequences(title_sequences, maxlen=max_ttl, padding='post').astype('int32'),
                    all_y, tk.word_index)
    def three_way_split(self, make_Fake, split='default'):
        self.X = make_Fake[0]
        self.y = make_Fake[1]
        tot_len = self.y.shape[0]
        self.split = split
        if self.split == 'default':
            tts = (7, 2, 1)
        else:
            tts = self.split
        tsp = int((tts[2]*tot_len/20))
        vsp = int(((tts[1]+tts[2])*tot_len/20))
        hlf = int((1*tot_len/2))
        test_tp = (np.concatenate([self.X[:tsp,:], self.X[hlf:(hlf+tsp),:]], axis=0),
                   np.concatenate([self.y[:tsp], self.y[hlf:(hlf+tsp)]], axis=0))
        valid_tp = (np.concatenate([self.X[tsp:vsp,:], self.X[(hlf+tsp):(hlf+vsp),:]], axis=0),
                    np.concatenate([self.y[tsp:vsp], self.y[(hlf+tsp):(hlf+vsp)]], axis=0))
        train_tp = (np.concatenate([self.X[vsp:hlf,:], self.X[(hlf+vsp):,:]], axis=0),
                    np.concatenate([self.y[vsp:hlf], self.y[(hlf+vsp):]], axis=0))
        app = []
        for tp in [train_tp, valid_tp, test_tp]:
            alength = np.arange(tp[1].shape[0])
            np.random.seed(5318008)
            np.random.shuffle(alength)
            wee = []
            for q in tp:
                if len(q.shape) == 2:
                    wee.append(q[alength,:])
                elif len(q.shape) == 1:
                    wee.append(q[alength])
            app.append(wee)
        return app
    def makeClassifier(self, make_Fake, fully_connected_layers=None, conv_layers=None,
                       dropout=0.5, optimizer='adam', loss='binary_crossentropy'):
        self.fully_connected_layers = fully_connected_layers
        self.conv_layers = conv_layers
        self.dropout_p = dropout
        self.optimizer = optimizer
        self.loss = loss
        self.X = make_Fake[0]
        self.y = make_Fake[1]
        self.wind = make_Fake[2]
        input_size = self.X.shape[1]
        vocab_size = len(self.alphabet) + 1
        embedding_size = len(self.alphabet) + 1
        if self.fully_connected_layers == None:
            fully_connected_layers = [1024, 1024]
        if self.conv_layers == None:
            conv_layers = [[256, 7, 3],
                           [256, 7, 3],
                           [256, 3, -1],
                           [256, 3, -1],
                           [256, 3, -1],
                           [256, 3, 3]]
        # Embedding weights: one one-hot row per character, row 0 reserved for padding
        embedding_weights = []
        embedding_weights.append(np.zeros(vocab_size))
        for char, i in self.wind.items():  # from index 1 to len(alphabet)+1 ('UNK')
            onehot = np.zeros(vocab_size)
            onehot[i - 1] = 1
            embedding_weights.append(onehot)
        embedding_weights = np.array(embedding_weights)
        print('Load')

        # Embedding layer initialization
        embedding_layer = Embedding(vocab_size + 1,
                                    embedding_size,
                                    input_length=input_size,
                                    weights=[embedding_weights])

        # Model construction
        # Input
        inputs = Input(shape=(input_size,), name='input', dtype='int64')  # shape=(?, padded input size)
        # Embedding
        x = embedding_layer(inputs)
        # Convolutional blocks (a pooling size of -1 means no pooling after that block)
        for filter_num, filter_size, pooling_size in conv_layers:
            x = Conv1D(filter_num, filter_size)(x)
            x = Activation('relu')(x)
            if pooling_size != -1:
                x = MaxPooling1D(pool_size=pooling_size)(x)
        x = Flatten()(x)
        # Fully connected layers
        for dense_size in fully_connected_layers:
            x = Dense(dense_size, activation='relu')(x)
            x = Dropout(self.dropout_p)(x)
        # Output layer
        predictions = Dense(1, activation='sigmoid')(x)
        # Build model
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
        model.summary()
        # plot_model(model, to_file=self.directory + 'model_plot.png', show_shapes=True, show_layer_names=True)
        return model
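
The model subsections below call a get_fit wrapper that is not reproduced in this appendix. As a rough sketch only, assuming the (train, validation, test) ordering returned by three_way_split above, the underlying training and evaluation calls amount to the standard Keras fit/evaluate API:

# Hedged sketch: get_fit itself is not shown above, so this only approximates it
# with plain Keras calls on the splits produced by three_way_split.
fn = fakeNews(lowercase=True)
data = fn.make_fakeNews(content=False)          # (X, y, word_index)
clf = fn.makeClassifier(data, fully_connected_layers=[256, 256])
train, valid, test = fn.three_way_split(data)   # each element is [X_split, y_split]

history = clf.fit(train[0], train[1],
                  validation_data=(valid[0], valid[1]),
                  batch_size=64, epochs=40, verbose=2)
test_loss, test_acc = clf.evaluate(test[0], test[1], verbose=0)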
    

Histograms

scrimblo = fakeNews(lowercase=True)
edastart = scrimblo.make_fakeNews(eda=True)
bseq = np.sqrt(np.arange(626))
for q in range(len(edastart)):
    if edastart[q]['y'].values[0]==1:
        lbl = 'Fake'
        col = '#007f7f7f'
    else:
        lbl = 'Real'
        col = '#7f007f7f'
    plt.hist(np.ravel(edastart[q]['title'].apply(lambda x: len(x)).values),bins=15*bseq,color=col, label=lbl,log=True)
plt.legend()
plt.title('Histogram of Real vs. Fake Title Lengths')
plt.savefig('/mnt/c/Users/thecu/Documents/R Projects/4270/charLevelCNN/eda/' + 'titles_hist.png')
plt.clf()
for q in range(len(edastart)):
    if edastart[q]['y'].values[0]==1:
        lbl = 'Fake'
        col = '#007f7f7f'
    else:
        lbl = 'Real'
        col = '#7f007f7f'
    plt.hist(np.ravel(edastart[q]['text'].apply(lambda x: len(x)).values),bins=2500*(25-bseq)[::-1],color=col, label=lbl,log=True)
plt.legend()
plt.title('Histogram of Real vs. Fake Article Body Lengths')
plt.savefig('/mnt/c/Users/thecu/Documents/R Projects/4270/charLevelCNN/eda/' + 'bodies_hist.png')

Models

Model 4

scrimblo_u = fakeNews(lowercase=False)
zabloing_uf = scrimblo_u.make_fakeNews(content=False,raw=False)
scrungus_ud = scrimblo_u.makeClassifier(zabloing_uf,fully_connected_layers=[256,256])
kablooey_uf = scrimblo_u.three_way_split(zabloing_uf)
myModel3 = scrimblo_u.get_fit(scrungus_ud,kablooey_uf, name='model_3',
          batch_size = 64,
          epochs = 40,
          verbose = 2,
          display = True,
          predict = True)
          

Model 6

scrimblo_u = fakeNews(lowercase=False)
zabloing_uf = scrimblo_u.make_fakeNews(content=False,raw=False,min_title_len=30,max_title_len=96,x_wid=123)
scrungus_ud = scrimblo_u.makeClassifier(zabloing_uf,fully_connected_layers=[256,256])
kablooey_uf = scrimblo_u.three_way_split(zabloing_uf)
myModel5 = scrimblo_u.get_fit(scrungus_ud,kablooey_uf, name='model_5',
          batch_size = 64,
          epochs = 40,
          verbose = 2,
          display = True,
          predict = True)
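
confusion_matrix and ConfusionMatrixDisplay are imported in the required packages, but the corresponding evaluation code is not reproduced here (it is presumably what predict=True triggers inside get_fit). A minimal sketch of scoring the compiled model on the held-out test split, under the same assumptions as the training sketch above, would be:

# Hedged sketch: threshold the sigmoid output at 0.5 and plot a confusion matrix
# for the test split; labels follow the coding in make_fakeNews (0 = real, 1 = fake).
X_test, y_test = kablooey_uf[2]
y_prob = scrungus_ud.predict(X_test)
y_pred = (y_prob.ravel() >= 0.5).astype(int)
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm, display_labels=['Real', 'Fake']).plot()
plt.show()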