import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, log_loss
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, learning_curve
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
import re
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense

df = pd.read_csv("phishing.csv")
df_sample = df.sample(n=5, random_state=42)
df_sample

df.isnull().sum()

Unnamed: 0     0
Email Text    16
Email Type     0
dtype: int64

df.drop(["Unnamed: 0"],axis=1,inplace=True)
df.dropna(inplace=True,axis=0)
df.drop_duplicates(inplace=True)

def clean_text(text):
    text = re.sub(r'[^\w\s]', ' ', text)
     # remove any hyperlinks [proceeded by http]
    text = re.sub(r'http\S+', ' ', text)
    text = text.lower()
    text = ' '.join(text.split())
    return text

df["Email Text"] = df["Email Text"].apply(clean_text)

df.isnull().sum()

Email Text    0
Email Type    0
dtype: int64

le = LabelEncoder()

df["Email Type"] = le.fit_transform(df["Email Type"])

# initialize a new tfid vectorizer called tf, it will remove common words like
# "and" and "the" in the english idiom, and limit max num of unique words to 10000
tf = TfidfVectorizer(stop_words="english",max_features=10000)

feature_x = tf.fit_transform(df["Email Text"]).toarray()

y_tf = np.array(df['Email Type'])

# we will opt for a standard 80-20 split when it comes to training and testing data sizes
x_train,x_test,y_train,y_test = train_test_split(feature_x,y_tf,train_size=0.8,random_state=0)

sgd = SGDClassifier(loss='log_loss')

n_epochs = 10       
batch_size = 64     # mini batch size
train_losses = []  
test_losses = []    
train_accuracies = []  
test_accuracies = []   

for epoch in range(n_epochs):
    indices = np.random.permutation(len(x_train))
    x_train_shuffled = x_train[indices]
    y_train_shuffled = np.array(y_train)[indices]
    
    for i in range(0, len(x_train), batch_size):
        batch_x = x_train_shuffled[i:i + batch_size]
        batch_y = y_train_shuffled[i:i + batch_size]
        sgd.partial_fit(batch_x, batch_y, classes=[0, 1])
    

    train_pred_prob = sgd.predict_proba(x_train)
    train_loss = log_loss(y_train, train_pred_prob)
    train_losses.append(train_loss)
    train_pred = sgd.predict(x_train)
    train_accuracy = accuracy_score(y_train, train_pred)
    train_accuracies.append(train_accuracy)
    

    test_pred_prob = sgd.predict_proba(x_test)
    test_loss = log_loss(y_test, test_pred_prob)
    test_losses.append(test_loss)
    test_pred = sgd.predict(x_test)
    test_accuracy = accuracy_score(y_test, test_pred)
    test_accuracies.append(test_accuracy)


print(f"SGD Classifier Accuracy: {accuracy_score(y_test, test_pred) * 100:.2f}%")
print(f"SGD Classifier F1 Score: {f1_score(y_test, test_pred) * 100:.2f}%")

SGD Classifier Accuracy: 97.92%
SGD Classifier F1 Score: 98.32%

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(range(n_epochs), train_losses, marker='o', label="Train Loss")
plt.plot(range(n_epochs), test_losses, marker='x', label="Test Loss")
plt.title("Log Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Log Loss")
plt.legend()
plt.grid()

plt.subplot(1, 2, 2)
plt.plot(range(n_epochs), train_accuracies, marker='o', label="Train Accuracy")
plt.plot(range(n_epochs), test_accuracies, marker='x', label="Test Accuracy")
plt.title("Accuracy Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.grid()

plt.tight_layout()
plt.show()

lgreg = LogisticRegression()
lgreg.fit(x_train,y_train)

pred_lgreg = lgreg.predict(x_test)
print("")
print(f"Logistic Regression Accuracy:{accuracy_score(y_test,pred_lgreg)*100:.2f} %")
print(f"Logistic Regression F1 Score: {f1_score(y_test,pred_lgreg)*100:.2f} %")

Logistic Regression Accuracy:98.06 %
Logistic Regression F1 Score: 98.43 %

train_sizes, train_scores, test_scores = learning_curve(lgreg, x_train, y_train, cv=5)

# learning curve
plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores.mean(axis=1), label='Training Score')
plt.plot(train_sizes, test_scores.mean(axis=1), label='Test Score')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.ylim(0.925, 1)
plt.title('Learning Curve for Logistic Regression')
plt.legend()
plt.show()

dtr = DecisionTreeClassifier()
dtr.fit(x_train,y_train)
pred_dtr = dtr.predict(x_test)

print(f"Decision Tree Accuracy:{accuracy_score(y_test,pred_dtr)*100:.2f} %")
print(f"Decision Tree F1 Score: {f1_score(y_test,pred_dtr)*100:.2f} %")

Decision Tree Accuracy:93.56 %
Decision Tree F1 Score: 94.75 %

plt.figure(figsize=(20, 10))
plot_tree(dtr, 
          max_depth=3,
          filled=True, 
          class_names=['Class 0', 'Class 1'], 
          rounded=True)
plt.title("Top Layers of Decision Tree")
plt.show()

rnf = RandomForestClassifier()
rnf.fit(x_train,y_train)
pred_rnf = rnf.predict(x_test)
print(f"Random Forest Accuracy::{accuracy_score(y_test,pred_rnf)*100:.2f} %")
print(f"Random Forest F1 Score: {f1_score(y_test,pred_rnf)*100:.2f} %")

Random Forest Accuracy::97.81 %
Random Forest F1 Score: 98.21 %

feature_importances = rnf.feature_importances_
feature_names = tf.get_feature_names_out()

importance_df = pd.DataFrame({'Feature': feature_names,'Importance': feature_importances})
top_n = 20
top_features = importance_df.sort_values(by='Importance', ascending=False).head(top_n)

plt.figure(figsize=(10, 6))
plt.barh(top_features['Feature'], top_features['Importance'], color='skyblue')
plt.xlabel('Importance Score')
plt.ylabel('Features (Words)')
plt.title(f'Top {top_n} Important Words in Random Forest Model')
plt.gca().invert_yaxis()
plt.show()

tk = Tokenizer()
tk.fit_on_texts(df['Email Text'])
sequences = tk.texts_to_sequences(df['Email Text'])
vector = pad_sequences(sequences,padding='post', maxlen=150)
x = np.array(vector)
y = np.array(df["Email Type"])
x_train, x_test, y_train, y_test = train_test_split(vector,df['Email Type'], test_size=0.2, random_state =0)

lst = Sequential()
lst.add(Embedding(input_dim=len(tk.word_index)+1, output_dim=75))
lst.add(LSTM(units=110))
lst.add(Dropout(0.5))
lst.add(Dense(1, activation='sigmoid'))
lst.compile(loss='binary_crossentropy' , optimizer='adam', metrics=['accuracy'])

track = lst.fit(x_train, y_train, epochs=5, batch_size=16, validation_data=(x_test, y_test))

Epoch 1/5
877/877 ━━━━━━━━━━━━━━━━━━━━ 103s 115ms/step - accuracy: 0.6845 - loss: 0.5721 - val_accuracy: 0.6804 - val_loss: 0.5717
Epoch 2/5
877/877 ━━━━━━━━━━━━━━━━━━━━ 101s 115ms/step - accuracy: 0.7254 - loss: 0.5030 - val_accuracy: 0.9510 - val_loss: 0.1762
Epoch 3/5
877/877 ━━━━━━━━━━━━━━━━━━━━ 101s 115ms/step - accuracy: 0.9657 - loss: 0.1121 - val_accuracy: 0.9752 - val_loss: 0.0924
Epoch 4/5
877/877 ━━━━━━━━━━━━━━━━━━━━ 101s 115ms/step - accuracy: 0.9933 - loss: 0.0273 - val_accuracy: 0.9803 - val_loss: 0.0763
Epoch 5/5
877/877 ━━━━━━━━━━━━━━━━━━━━ 101s 115ms/step - accuracy: 0.9974 - loss: 0.0142 - val_accuracy: 0.9820 - val_loss: 0.0790

results = lst.evaluate(x_test, y_test)
loss = results[0]
accuracy = results[1]

print(f"Model Loss: {loss}")
print(f"Model Accuracy: {accuracy*100}")

110/110 ━━━━━━━━━━━━━━━━━━━━ 2s 22ms/step - accuracy: 0.9825 - loss: 0.0761
Model Loss: 0.07902191579341888
Model Accuracy: 98.204106092453

pd.DataFrame(track.history)

pd.DataFrame(track.history)[['accuracy', 'val_accuracy']].plot()
plt.title('Training Accuracy')
plt.xlabel('Epochs')
plt.ylabel('accuracy')

pd.DataFrame(track.history)[['loss', 'val_loss']].plot()
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')

Text(0, 0.5, 'Loss')

	Unnamed: 0	Email Text	Email Type
14469	14470	Once upon a time, Harri wrote :> Title page ha...	Safe Email
3014	3014	new books : generative studies we would like t...	Safe Email
16941	16942	new book : japanese linguistics japanese lingu...	Safe Email
14344	14345	renewal notice for your domain name ( s ) dear...	Safe Email
8097	8098	hot stock tip your broker won ' t share now th...	Phishing Email

Can a deep learning model detect phishing emails?¶

Atishay Dikshit¶

Introduction¶

Phase 1: Data cleaning¶

From the initial observation of the email data, it is clear that effective data cleaning is critical in ensuring effective vectorization to ultimately get accurate classification from our machine learning model.¶

1.1 | Removing null values¶

1.2 | Removing punctuation, hyperlinks & leading/trailing whitespace¶

1.3 | Binary Encoding¶

Afterthought: this kind of encoding saves memory by making data storage in the case of integer labels more efficient compared to strings. By ensuring that labels are kept the same, integer encoding further helps in interpreting model outputs.¶

Phase 2: Text Vectorization¶

Why Text Vectorization?¶

The Machine Learning model requires numerical data; it does not process raw text. The text must be converted into numbers such that the model finds some type of pattern in the data for analysis and classification.¶

TF-IDF (Term Frequency-Inverse Document Frequency)¶

Vectorization in context¶

Why TF-IDF over other techniques?¶

Advantages:¶

Limitations:¶

Phase 3: Algorithms¶

3.1 | SGD Classifier¶

Methodology¶

Analysis¶

3.2 | Logistic Regression

Background¶

Logistic regression is based on the assumption of a linear relationship between the independent variables and the log-odds of the outcome, which makes it well-suited for cases where the two classes are linearly separable or close to it.¶

Methodology

Analysis¶

The narrowing gap in accuracy between training and validation at larger sample sizes suggests that the model continues to benefit from additional data, which enhances its ability to generalize.¶

3.3 | Decision Tree

Background¶

Training¶

Decision Tree Visualization¶

Gini Impurity¶

Decision Boundaries¶

3.4 | Random Forests

Background¶

Training¶

Analysis¶

Words like "click," "remove," "free," and "money" align well with known phishing strategies, where the intent is often to prompt immediate action or exploit financial interests.¶

The prominence of "Enron" suggests the model may be partially influenced by dataset-specific characteristics rather than general phishing behaviors, which could impact its performance on emails unrelated to the Enron context.¶

Phase 4 | Long Short-Term Memory (LSTM) Architecture¶

Background¶

Methodology¶

Deep Dive into our parameters¶

Analysis and Conclusion¶