# Import all modules/packets imports:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import chisquare
warnings.filterwarnings("ignore")

df_threats = pd.read_csv('./FullThreatActors.csv') 
display(df_threats)

unique_df = df_threats.drop_duplicates() # handle the duplicates in the dataset
display(unique_df)

df_threats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         1118 non-null   object 
 1   Country      1117 non-null   object 
 2   Actor Type   1118 non-null   object 
 3   Established  0 non-null      float64
 4   Last Seen    1118 non-null   object 
dtypes: float64(1), object(4)
memory usage: 47.0+ KB

# Clean the data for the following diagrams (remove Undetermined after we identified the amount of untracked threats):
filtered_df = df_threats[(df_threats['Actor Type'] != 'Undetermined') & (df_threats['Country'] != 'Undetermined') & (df_threats['Established'] != 'Undetermined') & (df_threats['Last Seen'] != 'Undetermined')]
filtered_df = filtered_df.drop(columns = ['Established']) # drop the Established column as all of them are NaN
filtered_df.dropna(inplace = True)
filtered_df

filtered_df['Country'].value_counts()

Country
Russian Federation            72
United States of America      69
Iran (Islamic Republic of)    35
China                         34
India                         32
                              ..
Cuba                           1
Jordan                         1
Ghana                          1
Armenia                        1
Rwanda                         1
Name: count, Length: 78, dtype: int64

filtered_df['Country'].value_counts().head().plot.bar(rot=80)

<Axes: xlabel='Country'>

us_attacks = filtered_df[filtered_df.Country == "United States of America"] # actor's country is USA
us_attacks

iran_attacks = filtered_df[filtered_df.Country == "Iran (Islamic Republic of)"] # actor's country is Iran
iran_attacks

filtered_df['Actor Type'].value_counts()

Actor Type
Hacktivist      291
Nation-State     95
Criminal         71
Hobbyist         19
Terrorist         6
Name: count, dtype: int64

filtered_df['Actor Type'].value_counts().plot.bar(rot=30)

<Axes: xlabel='Actor Type'>

# Create the pivot table to display the actor count using Country and Actor Type variables
pivot_threats = (filtered_df.
                   groupby('Country')['Actor Type'].
                   value_counts())
pivot_threats.to_frame()

# Group the data by Country and Actor type
piv_by_country_actor = filtered_df.groupby(['Country', 'Actor Type']).size().reset_index(name='count')

# Find the country with max count for each Actor Type using the count as indexing
max_count = piv_by_country_actor.loc[piv_by_country_actor .groupby('Actor Type')['count'].idxmax()]
print(max_count)

                      Country    Actor Type  count
87         Russian Federation      Criminal     30
119  United States of America    Hacktivist     45
120  United States of America      Hobbyist      5
25                      China  Nation-State     26
103      Syrian Arab Republic     Terrorist      3

plt.figure(figsize=(10, 6))
for actor in max_count['Actor Type'].unique():
    data = max_count[max_count['Actor Type'] == actor]
    plt.bar(data['Country'], data['count'], label=actor)


plt.ylabel('Count')
plt.title('Country with Max Count for Each Actor Type')
plt.xticks(rotation=70)
plt.legend()
plt.tight_layout()
plt.show()

# Convert the pivot table to the dataframe for the plotting
pivot_thr_df = pivot_threats.to_frame().reset_index()
pivot_thr_df.columns = ['Country', 'Actor Type', 'count']

# Group the pivot table by the actor type to find all the counts per actor type (further pass to the country)
max_counts = pivot_thr_df.groupby('Actor Type')['count'].transform('max') # identify the max after grouping by the actor type (those are the max counts per type)

# Now we can display the countries with the max count per each Actor Type
countries_with_max_count = pivot_thr_df[pivot_thr_df['count'] == max_counts]
countries_with_max_count

# Extract the "Year" metric for the tidier data and our convenience:
filtered_df['Year'] = pd.to_datetime(filtered_df['Last Seen']).dt.year 
filtered_df['Year']

44      2015
51      2015
53      2015
54      2015
56      2020
        ... 
1108    2014
1109    2022
1112    2014
1115    2014
1116    2016
Name: Year, Length: 482, dtype: int32

filtered_df['Year'].value_counts().sort_index().plot.bar(rot=70)

<Axes: xlabel='Year'>

# Count the attack occurrences of each year
year_counts = filtered_df['Year'].value_counts().sort_index()

plt.plot(year_counts.index, year_counts.values, marker='o')
plt.xlabel('Year')
plt.ylabel('Attack Count')
plt.title('Yearly Attacks Trend')
plt.xticks(year_counts.index, rotation=70) 
plt.grid(True)
plt.show()

threats_cube = filtered_df.pivot_table(
    index="Country", columns=['Actor Type'],
    values='Year', aggfunc='count')                         
threats_cube

df_attack = pd.read_csv('./FullAttackList.csv')
df_attack

df_attack.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4100 entries, 0 to 4099
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Date            4100 non-null   object
 1   Location        4100 non-null   object
 2   Victim          4100 non-null   object
 3   Industry        4080 non-null   object
 4   Actor Location  4100 non-null   object
 5   Actor           4100 non-null   object
 6   Motive          4100 non-null   object
 7   Type            4100 non-null   object
 8   Sub-Type        4078 non-null   object
dtypes: object(9)
memory usage: 288.4+ KB

# Clean up the data to remove the Undetermined values from every column:
filtered_att = df_attack[(df_attack['Location'] != 'Undetermined') & (df_attack['Victim'] != 'Undetermined') & (df_attack['Industry'] != 'Undetermined') & (df_attack['Actor Location'] != 'Undetermined') & (df_attack['Actor'] != 'Undetermined') & (df_attack['Motive'] != 'Undetermined') & (df_attack['Type'] != 'Undetermined') & (df_attack['Sub-Type'] != 'Undetermined')]
filtered_att

filtered_att['Date']

3       2023-10-30
7       2023-10-30
8       2023-10-30
16      2023-10-27
17      2023-10-27
           ...    
4073    2022-04-05
4074    2022-04-05
4084    2022-04-02
4090    2022-04-01
4094    2022-03-31
Name: Date, Length: 957, dtype: object

# Extract the "Year" metric for the plotting, so create the new column in the dataset:
filtered_att['Year'] = pd.to_datetime(filtered_att['Date']).apply(lambda x: x.year)

filtered_att['Location'].unique() # some of these locations are too lengthy to display on the plot, so we create a mapping to the shorter names

array(['Russian Federation',
       'United Kingdom of Great Britain and Northern Ireland', 'Sweden',
       'United States of America', 'Canada', 'Israel', 'Germany',
       'Ukraine', 'Italy', 'Philippines', 'India', 'Hong Kong',
       'Australia', 'Czechia', 'Japan', 'France', 'Netherlands',
       'Pakistan', 'Kenya', 'Saint Vincent and the Grenadines', 'Estonia',
       'South Africa', 'Belarus', 'Ireland', 'Mali', 'Switzerland',
       'Korea (the Republic of)', 'Chile',
       'Venezuela (Bolivarian Republic of)', 'Guatemala', 'Brazil',
       'Belgium', 'Mexico', 'Finland', 'Denmark', 'Spain', 'Poland',
       'Dominican Republic', 'Iran (Islamic Republic of)', 'Slovakia',
       'Lithuania', 'Latvia', 'Cuba', 'Viet Nam', 'United Arab Emirates',
       'Austria', 'Colombia', 'Argentina',
       'Bonaire, Sint Eustatius and Saba', 'Moldova (the Republic of)',
       'Bulgaria', 'Malaysia', 'Greece', 'Norway', 'China', 'Peru',
       'Ecuador', 'Croatia', 'Kazakhstan', 'Romania', 'Montenegro',
       'Luxembourg', 'Thailand', 'Saudi Arabia',
       'Taiwan (Province of China)', 'Costa Rica', 'Kuwait', 'Zambia',
       'Jordan', 'Portugal'], dtype=object)

location_mapping = {
    'Korea (the Republic of)': 'South Korea',
    'United States of America': 'USA',
    'United Kingdom of Great Britain and Northern Ireland': 'UK',
    'Iran (Islamic Republic of)' : 'Iran'
}
# Replace long location names with shorter versions for countries
filtered_att['Location'] = filtered_att['Location'].replace(location_mapping)

filtered_att['Industry'].unique()

array(['Finance and Insurance', 'Public Administration', 'Information',
       'Administrative and Support and Waste Management and Remediation Services',
       'Health Care and Social Assistance', 'Retail Trade',
       'Educational Services', 'Accommodation and Food Services',
       'Professional, Scientific, and Technical Services',
       'Wholesale Trade', 'Real Estate and Rental and Leasing',
       'Transportation and Warehousing', 'Manufacturing',
       'Other Services (except Public Administration)', 'Utilities',
       'Arts, Entertainment, and Recreation', nan,
       'Management of Companies and Enterprises', 'Construction',
       'Mining, Quarrying, and Oil and Gas Extraction',
       'Agriculture, Forestry, Fishing and Hunting'], dtype=object)

industry_mapping = {
    'Administrative and Support and Waste Management and Remediation Services': 'Waste and Remediation Mgmnt',
    'Professional, Scientific, and Technical Services': 'Prof, Science, and Tech',
    'Real Estate and Rental and Leasing': 'Real Estate',
    'Other Services (except Public Administration)' : 'Others',
    'Arts, Entertainment, and Recreation' : 'Arts and Entertainment',
    'Management of Companies and Enterprises': 'Enterprise Mgmnt',
    'Mining, Quarrying, and Oil and Gas Extraction' : 'Mining, Oil, and Gas',
    'Agriculture, Forestry, Fishing and Hunting' : 'Agriculture'

}

# Replace long location names with shorter versions for industries
filtered_att['Industry'] = filtered_att['Industry'].replace(industry_mapping)

filtered_att.describe()

filtered_att['Location'].value_counts().head(n=10).plot.bar(rot=80)

<Axes: xlabel='Location'>

filtered_att['Industry'].head(n=20).value_counts().plot.bar(rot=85)

<Axes: xlabel='Industry'>

filtered_att['Actor Location'].unique()

array(['Ukraine', 'Russian Federation', 'Sudan', 'China',
       'Iran (Islamic Republic of)', 'Palestine, State of', 'India',
       'Italy', 'Afghanistan', 'Belarus', 'Turkey',
       "Korea (the Democratic People's Republic of)", 'Pakistan', 'Cuba',
       'Bangladesh', 'United States of America', 'Viet Nam', 'Thailand',
       'Malaysia'], dtype=object)

actor_loc_mapping = {
    'Iran (Islamic Republic of)': 'Iran',
    'Palestine, State of': 'Palestine',
    "Korea (the Democratic People's Republic of)": 'North Korea',
    'United States of America' : 'USA',
}

# Replace long location names with shorter versions for actor locations
filtered_att['Actor Location'] = filtered_att['Actor Location'].replace(actor_loc_mapping )

filtered_att['Actor Location'].head(n=15).value_counts().plot.bar(rot=80)

<Axes: xlabel='Actor Location'>

filtered_att['Actor'].value_counts().head()

Actor
NoName057(16)         252
Killnet                89
ALPHVM                 89
People's CyberArmy     67
Clop                   48
Name: count, dtype: int64

filtered_att['Motive'].value_counts().plot.bar(rot=85)

<Axes: xlabel='Motive'>

filtered_att['Type'].value_counts().plot.bar(rot=85)

<Axes: xlabel='Type'>

# Extract the years for the tidier data and our convenience
filtered_att['Year'] = pd.to_datetime(filtered_att['Date']).dt.year
filtered_att['Year'] = filtered_att['Year'].astype(int) 
filtered_att['Year']

3       2023
7       2023
8       2023
16      2023
17      2023
        ... 
4073    2022
4074    2022
4084    2022
4090    2022
4094    2022
Name: Year, Length: 957, dtype: int64

attack_cube = filtered_att.pivot_table(
    index="Location", columns=['Industry'],
    values='Year', aggfunc='count')                       
attack_cube

industries_by_country = {}

# Iterate over the attack cube and find the industries per each country that are being attacked:
for country in attack_cube.index:
    industries = attack_cube.loc[country].dropna().index.tolist()
    industries_by_country[country] = industries

# Convert the dictionary to a DataFrame:
industries_table = pd.DataFrame(industries_by_country.items(), columns=['Country', 'Industries'])

display(industries_table)

pivot_att_locs = (filtered_att.
                   groupby('Location')['Actor Location'].
                   value_counts())
pivot_att_locs.to_frame()

# Group the data by victim Location and Actor Location (who attaks who)
vic_loc_attack_loc = filtered_att.groupby(['Location', 'Actor Location']).size().reset_index(name='count')

# Find the victim country with max count for each Actor Location using the count as indexing
max_count_att = vic_loc_attack_loc.loc[vic_loc_attack_loc.groupby('Actor Location')['count'].idxmax()]
print(max_count_att)

               Location      Actor Location  count
101                 USA         Afghanistan      4
37                India          Bangladesh      2
5               Belarus             Belarus      3
102                 USA               China     11
20                 Cuba                Cuba      1
39                India               India      2
49               Israel                Iran     12
53                Italy               Italy      3
40                India            Malaysia      2
103                 USA         North Korea      2
41                India            Pakistan      2
50               Israel           Palestine      1
104                 USA  Russian Federation    148
52               Israel               Sudan     14
96             Thailand            Thailand      1
24              Denmark              Turkey      1
16                China                 USA      1
83   Russian Federation             Ukraine     74
100                  UK            Viet Nam      1

who_attacks_usa = max_count_att[max_count_att['Location'] == 'USA']
print(who_attacks_usa)

    Location      Actor Location  count
101      USA         Afghanistan      4
102      USA               China     11
103      USA         North Korea      2
104      USA  Russian Federation    148

plt.figure(figsize=(6, 6))
plt.bar(who_attacks_usa['Actor Location'],who_attacks_usa['count'], color='skyblue')
plt.xlabel('Actor Location')
plt.ylabel('Count')
plt.title('Who Attacks USA The Most?')
plt.xticks(rotation=45, ha='right') 
plt.tight_layout()
plt.show()

who_attacks_india = max_count_att[max_count_att['Location'] == 'India']
print(who_attacks_india)

   Location Actor Location  count
37    India     Bangladesh      2
39    India          India      2
40    India       Malaysia      2
41    India       Pakistan      2

filtered_df['Actor Type'].value_counts()

Actor Type
Hacktivist      291
Nation-State     95
Criminal         71
Hobbyist         19
Terrorist         6
Name: count, dtype: int64

filtered_df # display the first dataset

filtered_att # display the second dataset

# The first function to pre-process all data and train KNN Classifier:
def train_knn_models(features, label_string, train_set, test_set):

    train_filtered = train_set[features + [label_string]].dropna() # make sure to drop the instances where either a feature or the label is NaN
    dropped_index = train_set.index.difference(train_filtered.index) # find the index of the dropped sample from the features

    X_train = train_filtered[features].to_dict(orient="records") # train set 
    y_train = train_set[label_string] 
    y_train_filtered = y_train.drop(dropped_index)

    test_filtered = test_set[features + [label_string]].dropna()
    dropped_index2 = test_set.index.difference(test_filtered.index)

    X_test = test_filtered[features].to_dict(orient="records") # test set 
    y_test = test_set[label_string] 
    y_test_filtered = y_test.drop(dropped_index2)

    # vectorize the training set:
    vec = DictVectorizer(sparse=False)
    vec.fit(X_train)
    X_train_tr3 = vec.transform(X_train)
    X_test_tr3= vec.transform(X_test)

    # standardize the data:
    scaler = StandardScaler()
    scaler.fit(X_train_tr3)
    X_train_sc3 = scaler.transform(X_train_tr3)
    X_test_sc3 = scaler.transform(X_test_tr3) # only transform the test data


    knn_models = {} # store the knn models

    for k in range(1, 100):
        model_each = KNeighborsClassifier(n_neighbors=k)
        model_each.fit(X_train_sc3, y_train_filtered)

        #y_pred_each2 = model_each2.predict(X_train_sc2) # for the train predicton
        y_test_pred_each = model_each.predict(X_test_sc3)

        #accuracy2_train = accuracy_score(y_train_rev, y_pred_each2) # for the train prediction
        accuracy_test = accuracy_score(y_test_filtered, y_test_pred_each)

        #f12_tr = f1_score(y_train_rev, y_pred_each2, average=None) # for the train prediction
        #f1_for_S2_tr = f12_tr[0] 

        f1_test = f1_score(y_test_filtered, y_test_pred_each, average=None)
        f1_for_S_test = f1_test[0] 

        knn_models [k] = {"k": k, "Accuracy Test": accuracy_test,  "f1 Test" : f1_for_S_test} 
    
    return knn_models

# The function to find the best KNN model based on the accuracy and F1 scores after training
def find_best_knn_model(models_list):
    # Search for the model with the highest accuracy:
    best_acc = 0
    best_k_for_acc = None # for additional printing

    for k, l in models_list.items():
        accuracy = l["Accuracy Test"] # each accuracy score
        if accuracy > best_acc:
            best_acc = accuracy
            best_k_for_acc = k

    best_f1_for_S = 0
    best_k_for_f1 = None

    for k, l in models_list.items():
        f1_each = l["f1 Test"] # each accuracy score
        if f1_each > best_f1_for_S:
            best_f1_for_S = f1_each
            best_k_for_f1 = k

    # Additional prints to display the best k value and its corresponding model:
    #print(f"Best k value: {best_k_for_acc}, with accuracy: {best_acc}.")
    #print(f"Best k value: {best_k_for_f1}, with F1 for S: {best_f1_for_S}.")
    
    return {"Model": "KNN", "Accuracy Test": best_acc, "f1 Test": best_f1_for_S}

# The code below uses the data pre-processing pipeline from the KNN Classifier training

# The second function to pre-process all data and train SVC:
def train_svm_model(features, label_string, train_set, test_set):
    train_filtered = train_set[features + [label_string]].dropna()
    dropped_index = train_set.index.difference(train_filtered.index)

    X_train = train_filtered[features].to_dict(orient="records")
    y_train = train_set[label_string]
    y_train_filtered = y_train.drop(dropped_index)

    test_filtered = test_set[features + [label_string]].dropna()
    dropped_index2 = test_set.index.difference(test_filtered.index)

    X_test = test_filtered[features].to_dict(orient="records")
    y_test = test_set[label_string]
    y_test_filtered = y_test.drop(dropped_index2)

    vec = DictVectorizer(sparse=False)
    vec.fit(X_train)
    X_train_tr = vec.transform(X_train)
    X_test_tr = vec.transform(X_test)

    scaler = StandardScaler()
    scaler.fit(X_train_tr)
    X_train_sc = scaler.transform(X_train_tr)
    X_test_sc = scaler.transform(X_test_tr)

    svm_model = SVC() # use Support Vector Machine Classifier 
    svm_model.fit(X_train_sc, y_train_filtered)

    y_test_pred = svm_model.predict(X_test_sc)
    accuracy_test = accuracy_score(y_test_filtered, y_test_pred)
    f1_test = f1_score(y_test_filtered, y_test_pred, average=None)[0]

    return {"Model": "SVM", "Accuracy Test": accuracy_test, "f1 Test": f1_test}

# The code below uses the data pre-processing pipeline from the KNN Classifier training

# The third function to pre-process all data and train Decision Tree Classifier:
def train_decision_tree_model(features, label_string, train_set, test_set):
    train_filtered = train_set[features + [label_string]].dropna()
    dropped_index = train_set.index.difference(train_filtered.index)

    X_train = train_filtered[features].to_dict(orient="records")
    y_train = train_set[label_string]
    y_train_filtered = y_train.drop(dropped_index)

    test_filtered = test_set[features + [label_string]].dropna()
    dropped_index2 = test_set.index.difference(test_filtered.index)

    X_test = test_filtered[features].to_dict(orient="records")
    y_test = test_set[label_string]
    y_test_filtered = y_test.drop(dropped_index2)

    vec = DictVectorizer(sparse=False)
    vec.fit(X_train)
    X_train_tr = vec.transform(X_train)
    X_test_tr = vec.transform(X_test)

    dt_model = DecisionTreeClassifier() # use Decision Tree Classifier
    dt_model.fit(X_train_tr, y_train_filtered)

    y_test_pred = dt_model.predict(X_test_tr)

    accuracy_test = accuracy_score(y_test_filtered, y_test_pred) # calculate scores on the test (validation) sets
    f1_test = f1_score(y_test_filtered, y_test_pred, average=None)[0]

    return {"Model": "DT", "Accuracy Test": accuracy_test, "f1 Test": f1_test}

# Split the dataset into the train and test set using fraction .8 (according to the regular train/test ratio 80:20)

# Threat Actors dataset:
train_set_thr = filtered_df.sample(frac=.8)
test_set_thr = filtered_df.drop(train_set_thr.index)

# Attack List dataset:
train_set_att = filtered_att.sample(frac=.8)
test_set_att = filtered_att.drop(train_set_att.index)

# Use the name, country of the attacker features to predict the actor type
# < Threat Actors > dataset
features2 = ["Name", "Country"] 
label_string2 = "Actor Type"

results_svm2 = train_svm_model(features2, label_string2, train_set_thr, test_set_thr) # SVC
results_knn2 = train_knn_models(features2, label_string2, train_set_thr, test_set_thr) # KNN
results_tree2 = train_decision_tree_model(features2, label_string2, train_set_thr, test_set_thr) # DT
best_knn2 = find_best_knn_model(results_knn2)

results2 = pd.DataFrame([best_knn2, results_svm2, results_tree2])
print(results2)

  Model  Accuracy Test   f1 Test
0   KNN       0.666667  0.344828
1   SVM       0.666667  0.000000
2    DT       0.614583  0.344828

# Use victim's location and industry to predict the actors' motives.
# < AttackList > dataset
features3 = ["Location", "Industry"]
label_string3 = "Motive"


results_knn3 = train_knn_models(features3, label_string3, train_set_att, test_set_att) # KNN
results_svm3 = train_svm_model(features3, label_string3, train_set_att, test_set_att) # SVC
results_tree3 = train_decision_tree_model(features3, label_string3, train_set_att, test_set_att) # DT
best_knn3 = find_best_knn_model(results_knn3)

results3 = pd.DataFrame([best_knn3, results_svm3, results_tree3])
print(results3)

  Model  Accuracy Test   f1 Test
0   KNN       0.806283  0.759259
1   SVM       0.785340  0.695652
2    DT       0.790576  0.762712

# Use victim's location and industry, as well as the type of the attack to predict the actors' motives.
# < AttackList > dataset

features4 = ["Location", "Industry", "Type"]
label_string4 = "Motive"

results_knn4 = train_knn_models(features4, label_string4, train_set_att, test_set_att) # KNN
results_svm4 = train_svm_model(features4, label_string4, train_set_att, test_set_att) # SVC
results_tree4 = train_decision_tree_model(features4, label_string4, train_set_att, test_set_att) # DT
best_knn4 = find_best_knn_model(results_knn4)

results4 = pd.DataFrame([best_knn4, results_svm4, results_tree4])
print("[Victim's Industry, Location, and Attack Type] -> [Actor's Motive]\n")
print(results4)

[Victim's Industry, Location, and Attack Type] -> [Actor's Motive]

  Model  Accuracy Test   f1 Test
0   KNN       0.858639  0.842975
1   SVM       0.890052  0.844828
2    DT       0.900524  0.912281

# Use victim's location and industry, as well as the attacker's motive to predict the actors' type
# < AttackList > dataset

features5 = ["Location", "Industry", "Motive"]
label_string5 = "Type"

results_knn5 = train_knn_models(features5, label_string5, train_set_att, test_set_att) # KNN
results_svm5 = train_svm_model(features5, label_string5, train_set_att, test_set_att) # SVC
results_tree5 = train_decision_tree_model(features5, label_string5, train_set_att, test_set_att) # DT
best_knn5 = find_best_knn_model(results_knn5)

results5 = pd.DataFrame([best_knn5, results_svm5, results_tree5])
print("[Victim's Industry, Location, and Actor's Motive] -> [Attack Type]\n")
print(results5)

[Victim's Industry, Location, and Actor's Motive] -> [Attack Type]

  Model  Accuracy Test   f1 Test
0   KNN       0.821990  0.920755
1   SVM       0.785340  0.923695
2    DT       0.832461  0.927419

# Create the counts table for the Actor's Country and Type:
counts_df = pd.crosstab(filtered_df["Country"], filtered_df["Actor Type"])
counts_norm1 = pd.crosstab(filtered_df["Country"], filtered_df["Actor Type"], normalize=True)
counts_df

counts_norm1

# Find the marginal probabilities for the Actor Type 
actor_type_marg = counts_norm1.sum(axis=0)
actor_type_marg

Actor Type
Criminal        0.147303
Hacktivist      0.603734
Hobbyist        0.039419
Nation-State    0.197095
Terrorist       0.012448
dtype: float64

# Given the actor type, the distribution of the actor's country
type_given_country = counts_norm1.divide(actor_type_marg, axis=1)
type_given_country

sns.heatmap(type_given_country) # Actor Type v Country

<Axes: xlabel='Actor Type', ylabel='Country'>

# Prepare the marginal distributions, joint probability, and the outer product for the independence
actors_all = counts_df.sum().sum() # total number of actors
types_marg = counts_df.sum(axis=0) / actors_all # The marginal distribution of actor type by summing over the country
country_marg = counts_df.sum(axis=1) / actors_all # The marginal distribution of actor country by summing over the actor type
expected1 = np.outer(country_marg, types_marg) # assuming independence

# Total Variation Distance:
tot_var_dist1 = (counts_norm1 - expected1).abs().sum().sum()

# Chi-Square Distance: 
chi_sq_1 = (((counts_norm1 - expected1) ** 2) / expected1).sum().sum()

# Mutual Information;
mut_info1 = (counts_norm1 * np.log(counts_norm1 / expected1)).sum().sum() # mutual information

# The correlation table:
results1_corr = pd.DataFrame()
results1_corr.insert(loc=0, column="Correlation", value=["Tot.Var.Dist", "Chi-Square", "Mut.Info"])
results1_corr.insert(loc=1, column="Value", value=[tot_var_dist1, chi_sq_1, mut_info1])
print("Actor's Country & Actor's Type")
print(results1_corr)

Actor's Country & Actor's Type
    Correlation     Value
0  Tot.Var.Dist  0.620194
1    Chi-Square  1.292809
2      Mut.Info  0.394576

# Create the counts table for the victim's Location and Industry
counts_att1 = pd.crosstab(filtered_att["Location"], filtered_att["Industry"])
counts_norm2 = pd.crosstab(filtered_att["Location"], filtered_att["Industry"], normalize=True)
counts_att1

# Find the marginal probabilities for the victim's industry 
industry_marg = counts_norm2.sum(axis=0)
industry_marg

Industry
Accommodation and Food Services      0.010471
Agriculture                          0.002094
Arts and Entertainment               0.012565
Construction                         0.006283
Educational Services                 0.065969
Enterprise Mgmnt                     0.002094
Finance and Insurance                0.113089
Health Care and Social Assistance    0.053403
Information                          0.075393
Manufacturing                        0.079581
Mining, Oil, and Gas                 0.003141
Others                               0.017801
Prof, Science, and Tech              0.048168
Public Administration                0.275393
Real Estate                          0.007330
Retail Trade                         0.010471
Transportation and Warehousing       0.126702
Utilities                            0.054450
Waste and Remediation Mgmnt          0.020942
Wholesale Trade                      0.014660
dtype: float64

# Given the victim's industry, the distribution of the victim's location
loc_given_industry = counts_norm2.divide(industry_marg, axis=1)
loc_given_industry

sns.heatmap(loc_given_industry.tail(n=20)) # Location v Industry (of the victims)

<Axes: xlabel='Industry', ylabel='Location'>

# Prepare the marginal distributions, joint probability, and the outer product for the independence
attacks_all = counts_att1.sum().sum() # total number of attacks
industry_marg = counts_att1.sum(axis=0) / attacks_all # The marginal distribution of victim's industry by summing over the country
victim_country_marg = counts_att1.sum(axis=1) / attacks_all # The marginal distribution of victim's country by summing over the industry
expected2 = np.outer(victim_country_marg, industry_marg) # assuming independence

# Total Variation Distance:
tot_var_dist2 = (counts_norm2 - expected2).abs().sum().sum()

# Chi-Square Distance: 
chi_sq_2 = (((counts_norm2 - expected2) ** 2) / expected2).sum().sum()

# Mutual Information;
mut_info2 = (counts_norm2 * np.log(counts_norm2 / expected2)).sum().sum() # mutual information

# The correlation table:
results2_corr = pd.DataFrame()
results2_corr.insert(loc=0, column="Correlation", value=["Tot.Var.Dist", "Chi-Square", "Mut.Info"])
results2_corr.insert(loc=1, column="Value", value=[tot_var_dist2, chi_sq_2, mut_info2])
print("Victim Location & Victim's Industry")
print(results2_corr)

Victim Location & Victim's Industry
    Correlation     Value
0  Tot.Var.Dist  0.790130
1    Chi-Square  2.132405
2      Mut.Info  0.592248

# Create the counts table for the victim's Location and actor's Location
counts_att2 = pd.crosstab(filtered_att["Location"], filtered_att["Actor Location"])
counts_norm3 = pd.crosstab(filtered_att["Location"], filtered_att["Actor Location"], normalize=True)
counts_att2

# Find the marginal probabilities for the actor's location
actor_loc = counts_norm3.sum(axis=0)
actor_loc

Actor Location
Afghanistan           0.013584
Bangladesh            0.002090
Belarus               0.004180
China                 0.027168
Cuba                  0.001045
India                 0.003135
Iran                  0.018809
Italy                 0.003135
Malaysia              0.002090
North Korea           0.005225
Pakistan              0.002090
Palestine             0.001045
Russian Federation    0.787879
Sudan                 0.025078
Thailand              0.001045
Turkey                0.002090
USA                   0.001045
Ukraine               0.098224
Viet Nam              0.001045
dtype: float64

# Given the actor's location, the distribution of the victim's location
vic_loc_given_act_loc = counts_norm3.divide(actor_loc, axis=1)
vic_loc_given_act_loc

sns.heatmap(vic_loc_given_act_loc.tail(n=20)) # Actor's Location v Victim's Location

<Axes: xlabel='Motive', ylabel='Location'>

# Prepare the marginal distributions, joint probability, and the outer product for the independence
attacks_all2 = counts_att2.sum().sum() # total number of attacks
actor_marg = counts_att2.sum(axis=0) / attacks_all2 # The marginal distribution of actor's location by summing over the victim's country
victim_loc_marg = counts_att2.sum(axis=1) / attacks_all2 # The marginal distribution of victim's country by summing over the actor's location
expected3 = np.outer(victim_loc_marg, actor_marg) # assuming independence

# Total Variation Distance:
tot_var_dist3 = (counts_norm3 - expected3).abs().sum().sum()

# Chi-Square Distance: 
chi_sq_3 = (((counts_norm3 - expected3) ** 2) / expected3).sum().sum()

# Mutual Information;
mut_info3 = (counts_norm3 * np.log(counts_norm3 / expected3)).sum().sum() # mutual information

# The correlation table:
results3_corr = pd.DataFrame()
results3_corr.insert(loc=0, column="Correlation", value=["Tot.Var.Dist", "Chi-Square", "Mut.Info"])
results3_corr.insert(loc=1, column="Value", value=[tot_var_dist3, chi_sq_3, mut_info3])
print("Victim Location & Actor's Location")
print(results3_corr)

Victim Location & Actor's Location
    Correlation     Value
0  Tot.Var.Dist  0.571103
1    Chi-Square  6.368058
2      Mut.Info  0.569270

# Create the counts table for the victim's Location and actor's Motive
counts_att3 = pd.crosstab(filtered_att["Location"], filtered_att["Motive"])
counts_norm4 = pd.crosstab(filtered_att["Location"], filtered_att["Motive"], normalize=True)
counts_att3

# Find the marginal probabilities for the actor's motive
actor_motive = counts_norm4.sum(axis=0)
actor_motive

Motive
Financial              0.261233
Political-Espionage    0.045977
Protest                0.681296
Protest,Financial      0.001045
Sabotage               0.010449
dtype: float64

# Given the actor's motive, the distribution of the victim's location
vic_loc_given_act_loc = counts_norm4.divide(actor_motive, axis=1)
vic_loc_given_act_loc

sns.heatmap(vic_loc_given_act_loc.tail(n=20)) # Actor's Motive v Victim's Location

<Axes: xlabel='Motive', ylabel='Location'>

# Prepare the marginal distributions, joint probability, and the outer product for the independence
attacks_all3 = counts_att3.sum().sum() # total number of attacks
motive_marg = counts_att3.sum(axis=0) / attacks_all3 # The marginal distribution of actor's motive by summing over the victim country
victim_loc_marg = counts_att3.sum(axis=1) / attacks_all3 # The marginal distribution of victim's country by summing over the actor's motive
expected4 = np.outer(victim_loc_marg, motive_marg) # assuming independence

# Total Variation Distance:
tot_var_dist4 = (counts_norm4 - expected4).abs().sum().sum()

# Chi-Square Distance: 
chi_sq_4 = (((counts_norm4 - expected4) ** 2) / expected4).sum().sum()

# Mutual Information;
mut_info4 = (counts_norm4 * np.log(counts_norm4 / expected4)).sum().sum() # mutual information

# The correlation table:
results4_corr = pd.DataFrame()
results4_corr.insert(loc=0, column="Correlation", value=["Tot.Var.Dist", "Chi-Square", "Mut.Info"])
results4_corr.insert(loc=1, column="Value", value=[tot_var_dist4, chi_sq_4, mut_info4])
print("Victim Location & Actor's Motive")
print(results4_corr)

Victim Location & Actor's Motive
    Correlation     Value
0  Tot.Var.Dist  0.639957
1    Chi-Square  0.847925
2      Mut.Info  0.390077

# Create the counts table for the actor's motive and actor's type
counts_att4 = pd.crosstab(filtered_att["Motive"], filtered_att["Type"])
counts_norm5 = pd.crosstab(filtered_att["Motive"], filtered_att["Type"], normalize=True)
counts_att4

# Find the marginal probabilities for the actor's type
actor_type2 = counts_norm5.sum(axis=0)
actor_type2

Type
Disruptive    0.668757
Exploitive    0.142111
Mixed         0.189133
dtype: float64

# Given the actor's type, the distribution of the victim's motive
motive_given_type = counts_norm5.divide(actor_type2, axis=1)
motive_given_type

sns.heatmap(motive_given_type) # Actor's Motive v Actor's Type

<Axes: xlabel='Type', ylabel='Motive'>

# Prepare the marginal distributions, joint probability, and the outer product for the independence
attacks_all4 = counts_att4.sum().sum() # total number of attacks
actor_type_marg = counts_att4.sum(axis=0) / attacks_all2 # The marginal distribution of actor's type by summing over the actor's type
motive_marg = counts_att4.sum(axis=1) / attacks_all2 # The marginal distribution of actor's type by summing over the actor's motive
expected5 = np.outer(motive_marg, actor_type_marg) # assuming independence

# Total Variation Distance:
tot_var_dist5 = (counts_norm5 - expected5).abs().sum().sum()

# Chi-Square Distance: 
chi_sq_5 = (((counts_norm5 - expected5) ** 2) / expected5).sum().sum()

# Mutual Information;
mut_info5 = (counts_norm5 * np.log(counts_norm5 / expected5)).sum().sum() # mutual information

# The correlation table:
results5_corr = pd.DataFrame()
results5_corr.insert(loc=0, column="Correlation", value=["Tot.Var.Dist", "Chi-Square", "Mut.Info"])
results5_corr.insert(loc=1, column="Value", value=[tot_var_dist5, chi_sq_5, mut_info5])
print("Actor's Motive & Actor's Type")
print(results5_corr)

Actor's Motive & Actor's Type
    Correlation     Value
0  Tot.Var.Dist  0.737340
1    Chi-Square  0.824788
2      Mut.Info  0.429383

# Create the counts table for the actor's Type and Sub-Type
counts_att5 = pd.crosstab(filtered_att["Type"], filtered_att["Sub-Type"])
counts_norm6 = pd.crosstab(filtered_att["Type"], filtered_att["Sub-Type"], normalize=True)
counts_att5

# Find the marginal probabilities for the actor's type
actor_type3 = counts_norm6.sum(axis=1)
actor_type3

Type
Disruptive    0.670157
Exploitive    0.140314
Mixed         0.189529
dtype: float64

# Given the actor's type, the distribution of the actor's sub-type
sub_type_given_type = counts_norm6.divide(actor_type3, axis=0)
sub_type_given_type

#sns.heatmap(sub_type_given_type) # Actor's Type v Actor's Sub-Type

plt.figure(figsize=(10, 6)) 
sns.heatmap(sub_type_given_type, xticklabels=True, yticklabels=True, cmap='coolwarm', annot=True, fmt=".1f")

# Rotate the x-axis labels by 80 degrees
plt.xticks(rotation=80)

plt.xlabel("Actor's Sub-Type")
plt.ylabel("Actor's Type")
plt.title("Actor's Type v Actor's Sub-Type")

plt.show()

# Prepare the marginal distributions, joint probability, and the outer product for the independence
attacks_all4 = counts_att5.sum().sum() # total number of attacks
sub_type_n = counts_att5.sum(axis=0) / attacks_all4 # The marginal distribution of actor's sub-type by summing over the actor's type
actor_type_n= counts_att5.sum(axis=1) / attacks_all4 # The marginal distribution of actor's type by summing over the actor's sub-type
expected6 = np.outer(actor_type_n, sub_type_n) # assuming independence

# Total Variation Distance:
tot_var_dist6 = (counts_norm6 - expected6).abs().sum().sum()

# Chi-Square Distance: 
chi_sq_6 = (((counts_norm6 - expected6) ** 2) / expected6).sum().sum()

# Mutual Information;
mut_info6 = (counts_norm6 * np.log(counts_norm6 / expected6)).sum().sum() # mutual information

# The correlation table:
results6_corr = pd.DataFrame()
results6_corr.insert(loc=0, column="Correlation", value=["Tot.Var.Dist", "Chi-Square", "Mut.Info"])
results6_corr.insert(loc=1, column="Value", value=[tot_var_dist6, chi_sq_6, mut_info6])
print("Victim Location & Actor's Location")
print(results6_corr)

Victim Location & Actor's Location
    Correlation     Value
0  Tot.Var.Dist  0.927806
1    Chi-Square  1.698736
2      Mut.Info  0.716728

	Name	Country	Actor Type	Last Seen
58	Adalat Ali	Iran (Islamic Republic of)	Hacktivist	2022-02-01
65	Agrius APT	Iran (Islamic Republic of)	Nation-State	2022-02-28
73	al-Tahera	Iran (Islamic Republic of)	Hacktivist	2022-07-12
81	Ali's Justice (Edalat-e Ali)	Iran (Islamic Republic of)	Hacktivist	2023-02-11
88	Altahrea Team	Iran (Islamic Republic of)	Hacktivist	2022-07-18
118	Anonymous (Iran)	Iran (Islamic Republic of)	Hacktivist	2020-01-10
149	Anti WMD Team	Iran (Islamic Republic of)	Hacktivist	2014-02-10
159	APT39	Iran (Islamic Republic of)	Nation-State	2019-03-04
164	Arvin Club	Iran (Islamic Republic of)	Criminal	2021-11-09
208	BlackMagic	Iran (Islamic Republic of)	Hacktivist	2022-12-07
210	BlackShadow	Iran (Islamic Republic of)	Criminal	2021-10-29
297	CyberAv3ngers	Iran (Islamic Republic of)	Hacktivist	2023-10-06
337	DEV-0056	Iran (Islamic Republic of)	Nation-State	2021-09-01
338	DEV-0228	Iran (Islamic Republic of)	Nation-State	2021-07-01
442	Green Leakers	Iran (Islamic Republic of)	Hacktivist	2019-05-09
443	Greenbug	Iran (Islamic Republic of)	Nation-State	2020-05-19
512	Iranian Nasr Institute (APT33)	Iran (Islamic Republic of)	Nation-State	2020-01-09
513	Iranian Revolutionary Guard Corps (CyberAv3ngers)	Iran (Islamic Republic of)	Nation-State	2023-10-17
514	IRIDIUM	Iran (Islamic Republic of)	Nation-State	2022-03-10
517	Islamic Cyber Resistance	Iran (Islamic Republic of)	Hacktivist	2014-08-06
518	Islamic Revolutionary Guard Corps (APT 35 Char...	Iran (Islamic Republic of)	Nation-State	2023-08-10
519	Islamic Revolutionary Guard Corps (IRGC)	Iran (Islamic Republic of)	Nation-State	2023-05-01
574	Lab Dookhtegan	Iran (Islamic Republic of)	Hacktivist	2019-06-03
617	Mango Sandstorm	Iran (Islamic Republic of)	Nation-State	2023-05-06
635	Ministry of Intelligence and Security (MOIS) (...	Iran (Islamic Republic of)	Nation-State	2022-05-11
643	Mint Sandstorm	Iran (Islamic Republic of)	Nation-State	2023-05-06
650	Mormoroth	Iran (Islamic Republic of)	Hacktivist	2014-02-25
656	Moses Staff	Iran (Islamic Republic of)	Hacktivist	2022-12-19
663	Mr.Xhat	Iran (Islamic Republic of)	Hobbyist	2014-01-06
763	People's Mujahedin of Iran (MEK)	Iran (Islamic Republic of)	Hacktivist	2022-06-02
764	People's Mujahideen Organization of Iran (PMOI)	Iran (Islamic Republic of)	Hacktivist	2022-01-27
849	Rocket Kitten	Iran (Islamic Republic of)	Nation-State	2016-08-02
864	RxR HaCker	Iran (Islamic Republic of)	Hacktivist	2015-05-07
897	Sharpboys	Iran (Islamic Republic of)	Criminal	2023-04-24
1116	Zurael_sTz	Iran (Islamic Republic of)	Hacktivist	2016-08-02

	Year
count	957.000000
mean	2022.345873
std	0.475901
min	2022.000000
25%	2022.000000
50%	2022.000000
75%	2023.000000
max	2023.000000

Sub-Type	Data Attack	Data Attack,Exploitation of Application Server	Exploitation of Application Server	Exploitation of End Host	Exploitation of End Hosts	Exploitation of End User	Exploitation of Network Infrastructure	Exploitation of Sensor	Exploitation of Sensors	External Denial of Service	External Denial of Services	Internal Denial of Service	Message Manipulation	Physical Attack	Unknown
Type
Disruptive	22	0	0	0	0	0	0	0	0	121	463	1	27	6	0
Exploitive	0	0	96	20	2	1	4	3	4	1	2	0	0	0	1
Mixed	147	21	10	0	0	0	0	0	0	1	2	0	0	0	0

Sub-Type	Data Attack	Data Attack,Exploitation of Application Server	Exploitation of Application Server	Exploitation of End Host	Exploitation of End Hosts	Exploitation of End User	Exploitation of Network Infrastructure	Exploitation of Sensor	Exploitation of Sensors	External Denial of Service	External Denial of Services	Internal Denial of Service	Message Manipulation	Physical Attack	Unknown
Type
Disruptive	0.034375	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.189062	0.723437	0.001563	0.042188	0.009375	0.000000
Exploitive	0.000000	0.000000	0.716418	0.149254	0.014925	0.007463	0.029851	0.022388	0.029851	0.007463	0.014925	0.000000	0.000000	0.000000	0.007463
Mixed	0.812155	0.116022	0.055249	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.005525	0.011050	0.000000	0.000000	0.000000	0.000000

"An Analytical Investigation into Threat Actors' Objectives and Their Targets in Cyberspace"

CMPS 3160: Introduction to Data Science

Presented by Sofiia Druchyna

GitHub.io Page

Datasets and Project Description¶

Research Questions¶

Methods¶

Threat Actors Dataset¶

Cyber-Attacks Dataset¶

MODELING¶

Data Pre-Processing and Model Definitions¶

Experiment 1¶

Experiment 2¶

Experiment 3¶

Experiment 4¶

Correlation between Categorical Features¶

Experiment 1¶

Experiment 2¶

Experiment 3¶

Experiment 4¶

Experiment 5¶

Experiment 6¶

Conclusions¶

	Name	Country	Actor Type	Established	Last Seen
0	$2a$45	Undetermined	Criminal	NaN	2018-03-09
1	'desserped'	Undetermined	Criminal	NaN	2023-10-11
2	0mega	Undetermined	Criminal	NaN	2023-02-20
3	0x0D1337	Undetermined	Undetermined	NaN	2015-09-30
4	0x2Taylor	Undetermined	Hacktivist	NaN	2016-10-13
...	...	...	...	...	...
1195	NaN	NaN	NaN	NaN	NaN
1196	NaN	NaN	NaN	NaN	NaN
1197	NaN	NaN	NaN	NaN	NaN
1198	NaN	NaN	NaN	NaN	NaN
1199	NaN	NaN	NaN	NaN	NaN

	Name	Country	Actor Type	Last Seen
44	@THTHerakles	Turkey	Hacktivist	2015-04-13
51	A. S. A. L. A.	Armenia	Hacktivist	2015-11-11
53	Abdellah Elmaghribi	United States of America	Hacktivist	2015-10-15
54	Abdellah Elmaghribi; Moroccan Wolf	Turkmenistan	Hacktivist	2015-04-09
56	Absa employee	South Africa	Criminal	2020-10-27
...	...	...	...	...
1108	Z Company Hacking Crew	India	Hacktivist	2014-10-27
1109	Zarya	Russian Federation	Hacktivist	2022-08-19
1112	Zer0Pwn	Syrian Arab Republic	Hacktivist	2014-04-06
1115	Zukr@in	Pakistan	Hacktivist	2014-10-09
1116	Zurael_sTz	Iran (Islamic Republic of)	Hacktivist	2016-08-02

		count
Country	Actor Type
Afghanistan	Nation-State	3
Afghanistan	Hobbyist	1
Albania	Hacktivist	1
Algeria	Hacktivist	1
Armenia	Hacktivist	1
...	...	...
Uzbekistan	Nation-State	1
Venezuela (Bolivarian Republic of)	Hacktivist	1
Viet Nam	Criminal	1
	Hobbyist	1
	Nation-State	1

	Country	Actor Type	count
23	China	Nation-State	26
87	Russian Federation	Criminal	30
103	Syrian Arab Republic	Terrorist	3
118	United States of America	Hacktivist	45
120	United States of America	Hobbyist	5

	Date	Location	Victim	Industry	Actor Location	Actor	Motive	Type	Sub-Type
0	2023-10-31	United States of America	Smoothie King	Accommodation and Food Services	Undetermined	Undetermined	Financial	Exploitive	Undetermined
1	2023-10-31	Italy	Avangate	Other Services (except Public Administration)	Undetermined	Alpha Team	Financial	Mixed	Data Attack
2	2023-10-30	United States of America	Dallas County	Public Administration	Undetermined	PLAY	Financial	Mixed	Data Attack
3	2023-10-30	Russian Federation	National Payment Card System (NSPK)	Finance and Insurance	Ukraine	DumpForums and Ukrainian Cyber Alliance	Protest	Disruptive	Message Manipulation
4	2023-10-30	Germany	Südwestfalen IT	Professional, Scientific, and Technical Services	Undetermined	Undetermined	Financial	Mixed	Data Attack
...	...	...	...	...	...	...	...	...	...
4095	2022-03-31	Undetermined	Undisclosed organization	Undetermined	Russian Federation	ALPHVM	Financial	Disruptive	Data Attack
4096	2022-03-31	United Kingdom of Great Britain and Northern I...	Individuals in the UK	Retail Trade	Undetermined	Undetermined	Financial	Exploitive	Exploitation of End Hosts
4097	2022-03-31	Spain	Iberdrola	Utilities	Undetermined	Undetermined	Financial	Exploitive	Exploitation of Application Server
4098	2022-03-31	Undetermined	Ola Finance	Finance and Insurance	Undetermined	Undetermined	Financial	Exploitive	Exploitation of Application Server
4099	2022-03-31	Russian Federation	volozhin.gov.by	Public Administration	Undetermined	Anonymous	Protest	Disruptive	Message Manipulation

Industry	Accommodation and Food Services	Agriculture	Arts and Entertainment	Construction	Educational Services	Enterprise Mgmnt	Finance and Insurance	Health Care and Social Assistance	Information	Manufacturing	Mining, Oil, and Gas	Others	Prof, Science, and Tech	Public Administration	Real Estate	Retail Trade	Transportation and Warehousing	Utilities	Waste and Remediation Mgmnt	Wholesale Trade
Location
Argentina	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Australia	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1.0	NaN	NaN	4.0	1.0	2.0	NaN	1.0	NaN	NaN	NaN
Austria	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1.0	NaN	NaN	1.0	NaN	NaN	NaN
Belarus	NaN	NaN	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	NaN	NaN	NaN	NaN	NaN	NaN
Belgium	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	NaN	NaN	1.0	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Ukraine	2.0	1.0	3.0	3.0	2.0	NaN	26.0	2.0	28.0	12.0	NaN	3.0	NaN	40.0	NaN	1.0	10.0	11.0	1.0	8.0
United Arab Emirates	NaN	NaN	NaN	NaN	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Venezuela (Bolivarian Republic of)	NaN	NaN	NaN	NaN	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Viet Nam	NaN	NaN	NaN	NaN	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Zambia	NaN	NaN	NaN	NaN	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Country	Industries
0	Argentina	[Accommodation and Food Services]
1	Australia	[Manufacturing, Prof, Science, and Tech, Publi...
2	Austria	[Public Administration, Transportation and War...
3	Belarus	[Educational Services, Public Administration]
4	Belgium	[Public Administration, Transportation and War...
...	...	...
64	Ukraine	[Accommodation and Food Services, Agriculture,...
65	United Arab Emirates	[Finance and Insurance]
66	Venezuela (Bolivarian Republic of)	[Finance and Insurance]
67	Viet Nam	[Finance and Insurance]
68	Zambia	[Finance and Insurance]

		count
Location	Actor Location
Argentina	Russian Federation	1
Australia	Russian Federation	5
	China	2
	Iran	2
Austria	Russian Federation	2
...	...	...
Ukraine	Sudan	1
United Arab Emirates	North Korea	1
Venezuela (Bolivarian Republic of)	Russian Federation	1
Viet Nam	North Korea	1
Zambia	Russian Federation	1

Industry	Accommodation and Food Services	Agriculture	Arts and Entertainment	Construction	Educational Services	Enterprise Mgmnt	Finance and Insurance	Health Care and Social Assistance	Information	Manufacturing	Mining, Oil, and Gas	Others	Prof, Science, and Tech	Public Administration	Real Estate	Retail Trade	Transportation and Warehousing	Utilities	Waste and Remediation Mgmnt	Wholesale Trade
Location
Argentina	0.1	0.0	0.00	0.0	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.00	0.000000
Australia	0.0	0.0	0.00	0.0	0.000000	0.0	0.000000	0.000000	0.000000	0.013158	0.0	0.000000	0.086957	0.003802	0.285714	0.0	0.008264	0.000000	0.00	0.000000
Austria	0.0	0.0	0.00	0.0	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.003802	0.000000	0.0	0.008264	0.000000	0.00	0.000000
Belarus	0.0	0.0	0.00	0.0	0.015873	0.0	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.011407	0.000000	0.0	0.000000	0.000000	0.00	0.000000
Belgium	0.0	0.0	0.00	0.0	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.011407	0.000000	0.0	0.008264	0.000000	0.00	0.000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Ukraine	0.2	0.5	0.25	0.5	0.031746	0.0	0.240741	0.039216	0.388889	0.157895	0.0	0.176471	0.000000	0.152091	0.000000	0.1	0.082645	0.211538	0.05	0.571429
United Arab Emirates	0.0	0.0	0.00	0.0	0.000000	0.0	0.009259	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.00	0.000000
Venezuela (Bolivarian Republic of)	0.0	0.0	0.00	0.0	0.000000	0.0	0.009259	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.00	0.000000
Viet Nam	0.0	0.0	0.00	0.0	0.000000	0.0	0.009259	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.00	0.000000
Zambia	0.0	0.0	0.00	0.0	0.000000	0.0	0.009259	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.000000	0.000000	0.0	0.000000	0.000000	0.00	0.000000

Motive	Financial	Political-Espionage	Protest	Protest,Financial	Sabotage
Location
Argentina	0.004	0.000000	0.000000	0.0	0.0
Australia	0.020	0.068182	0.000000	0.0	0.1
Austria	0.004	0.000000	0.001534	0.0	0.0
Belarus	0.000	0.000000	0.006135	0.0	0.0
Belgium	0.000	0.000000	0.006135	0.0	0.0
...	...	...	...	...	...
Ukraine	0.004	0.454545	0.193252	0.0	0.6
United Arab Emirates	0.004	0.000000	0.000000	0.0	0.0
Venezuela (Bolivarian Republic of)	0.004	0.000000	0.000000	0.0	0.0
Viet Nam	0.004	0.000000	0.000000	0.0	0.0
Zambia	0.004	0.000000	0.000000	0.0	0.0

Type	Disruptive	Exploitive	Mixed
Motive
Financial	0.032813	0.507353	0.883978
Political-Espionage	0.000000	0.235294	0.066298
Protest	0.956250	0.250000	0.033149
Protest,Financial	0.000000	0.000000	0.005525
Sabotage	0.010938	0.007353	0.011050