import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score, cohen_kappa_score
def jaccard_similarity(x, y):
    """
    Return the Jaccard coefficient between binary vectors x and y:
    the size of their intersection divided by the size of their union.
    """
    return np.logical_and(x, y).sum() / np.logical_or(x, y).sum()
d = {'TS': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'Text': ['Regularly paying too much for free trials?', 
             'Exercise as a chance for your free vehicle.',
             'I have just as much fun as I need.',
             'Do you like donuts?',
             'Fresh donuts available for cheap',
             'They had fresh donuts available, so today was fun',
             'Register your free trial today',
             'What time is good for you?',
             'I didn\'t pay for the donuts',
             'Cheap viagra available',
             'Did you have a good time today?',
             'It was available so I registered'], 
     'Transformed': ['regular pay free trial', 
                     'exercise chance free vehicle',
                     'fun need',
                     'like donut',
                     'fresh donut available cheap',
                     'fresh donut available today fun',
                     'register free trial today',
                     'time good',
                     'pay donut',
                     'cheap viagra available',
                     'good time today',
                     'available register'],
     'Class': ['Spam', 'Spam', 'Not spam', 'Not spam', 'Spam', 'Not spam', 'Spam', 'Not spam', 'Not spam', 'Spam', 'Not spam', 'Not spam']}
data = pd.DataFrame(d)

Data stream of documents

display(data[['TS', 'Text', 'Class']])
TS Text Class
0 1 Regularly paying too much for free trials? Spam
1 2 Exercise as a chance for your free vehicle. Spam
2 3 I have just as much fun as I need. Not spam
3 4 Do you like donuts? Not spam
4 5 Fresh donuts available for cheap Spam
5 6 They had fresh donuts available, so today was fun Not spam
6 7 Register your free trial today Spam
7 8 What time is good for you? Not spam
8 9 I didn't pay for the donuts Not spam
9 10 Cheap viagra available Spam
10 11 Did you have a good time today? Not spam
11 12 It was available so I registered Not spam

Convert the texts into binary vectors where the presence of a term is 1 and the absence is 0. Use the following structure for the document vectors, which excludes stop words:

[regular, pay, free, trial, exercise, chance, vehicle, fun, need, like, donut, fresh, available, cheap, register, today, time, good, viagra, run]

Note: Assume there is a pre-processing function that stems the terms, so paying becomes pay, trials becomes trial, etc.
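
One way such a pre-processing function could look, combining sklearn's built-in English stop-word list with NLTK's PorterStemmer (both arbitrary choices for illustration, and Porter's stems will not always match the hand-made Transformed column: it reduces available to avail, for instance):

import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

stemmer = PorterStemmer()

def preprocess(text):
    # Lowercase, keep alphabetic tokens, drop stop words, stem the rest.
    tokens = re.findall(r'[a-z]+', text.lower())
    return ' '.join(stemmer.stem(tok) for tok in tokens if tok not in ENGLISH_STOP_WORDS)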

vocab = ['regular', 'pay', 'free', 'trial', 'exercise', 'chance', 'vehicle', 'fun', 'need', 'like', 'donut', 'fresh', 'available', 'cheap', 'register', 'today', 'time', 'good', 'viagra', 'run']
vec = CountVectorizer(binary=True, stop_words='english', lowercase=True, vocabulary=vocab)
X = vec.fit_transform(data.Transformed)
text_vectors = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
text_vectors['TS'] = data.TS
text_vectors.set_index('TS', inplace=True)
display(text_vectors)
regular pay free trial exercise chance vehicle fun need like donut fresh available cheap register today time good viagra run
TS
1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0
6 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0 0 0 0
7 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
9 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0
11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0
12 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0

To measure document similarity, calculate the Jaccard coefficient between two document vectors.

Example:

doc1=[1, 1, 0, 0]

doc2=[1, 0, 1, 0]

Jaccard(doc1, doc2) = intersection / union

intersection: the number of positions where a 1 appears in both vectors (1)

union: the number of positions where a 1 appears in at least one of the two vectors (3)

Jaccard(doc1, doc2) = 1 / 3 ≈ 0.33
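
The same result from the jaccard_similarity helper defined earlier:

print('Jaccard(doc1, doc2):', jaccard_similarity(np.array([1, 1, 0, 0]), np.array([1, 0, 1, 0])))
Jaccard(doc1, doc2): 0.3333333333333333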

TS 6

Window: [1, 2, 3, 4, 5]

print('Jaccard(6, 1):', jaccard_similarity(text_vectors.loc[6], text_vectors.loc[1]))
print('Jaccard(6, 2):', jaccard_similarity(text_vectors.loc[6], text_vectors.loc[2]))
print('Jaccard(6, 3):', jaccard_similarity(text_vectors.loc[6], text_vectors.loc[3]))
print('Jaccard(6, 4):', jaccard_similarity(text_vectors.loc[6], text_vectors.loc[4]))
print('Jaccard(6, 5):', jaccard_similarity(text_vectors.loc[6], text_vectors.loc[5]))
Jaccard(6, 1): 0.0
Jaccard(6, 2): 0.0
Jaccard(6, 3): 0.16666666666666666
Jaccard(6, 4): 0.16666666666666666
Jaccard(6, 5): 0.5

Nearest Neighbors: [5: Spam, 3: Not spam, 4: Not spam]

Classification: Not spam

TS 7

Window: [2, 3, 4, 5, 6]

print('Jaccard(7, 2):', jaccard_similarity(text_vectors.loc[7], text_vectors.loc[2]))
print('Jaccard(7, 3):', jaccard_similarity(text_vectors.loc[7], text_vectors.loc[3]))
print('Jaccard(7, 4):', jaccard_similarity(text_vectors.loc[7], text_vectors.loc[4]))
print('Jaccard(7, 5):', jaccard_similarity(text_vectors.loc[7], text_vectors.loc[5]))
print('Jaccard(7, 6):', jaccard_similarity(text_vectors.loc[7], text_vectors.loc[6]))
Jaccard(7, 2): 0.14285714285714285
Jaccard(7, 3): 0.0
Jaccard(7, 4): 0.0
Jaccard(7, 5): 0.0
Jaccard(7, 6): 0.125

Nearest Neighbors: [2: Spam, 6: Not spam, 3, 4, 5] (several instances are tied for third-nearest neighbor, so keep lowering K until the tie is broken; at K=2 the vote itself is tied, so K drops to 1)

Nearest Neighbor: [2: Spam]

Classification: Spam
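
The manual procedure used above (a sliding window of the five most recent labeled documents, K=3 nearest neighbors, K lowered whenever there is a tie, and a fallback to the window majority when every similarity is tied) can be collected into a single helper. This is only a sketch of that procedure; knn_window_classify is a name coined here, not a library function.

def knn_window_classify(t, vectors, labels, window=5, k=3):
    """Classify document t against the `window` previous documents with Jaccard kNN."""
    window_ts = list(range(t - window, t))
    sims = {i: jaccard_similarity(vectors.loc[t], vectors.loc[i]) for i in window_ts}
    ranked = sorted(window_ts, key=sims.get, reverse=True)
    # Every candidate is equally similar (e.g. all zeros): use the window majority.
    if len(set(sims.values())) == 1:
        return labels.loc[window_ts].mode()[0]
    while k >= 1:
        # The K-th neighbor is tied with the (K+1)-th: lower K.
        if k < window and sims[ranked[k - 1]] == sims[ranked[k]]:
            k -= 1
            continue
        votes = labels.loc[ranked[:k]].value_counts()
        # The vote itself is tied: lower K.
        if len(votes) > 1 and votes.iloc[0] == votes.iloc[1]:
            k -= 1
            continue
        return votes.index[0]
    return labels.loc[window_ts].mode()[0]  # last-resort fallback

labels = data.set_index('TS')['Class']
print({t: knn_window_classify(t, text_vectors, labels) for t in range(6, 13)})
{6: 'Not spam', 7: 'Spam', 8: 'Not spam', 9: 'Not spam', 10: 'Spam', 11: 'Not spam', 12: 'Spam'}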

TS 8

Window: [3, 4, 5, 6, 7]

print('Jaccard(8, 3):', jaccard_similarity(text_vectors.loc[8], text_vectors.loc[3]))
print('Jaccard(8, 4):', jaccard_similarity(text_vectors.loc[8], text_vectors.loc[4]))
print('Jaccard(8, 5):', jaccard_similarity(text_vectors.loc[8], text_vectors.loc[5]))
print('Jaccard(8, 6):', jaccard_similarity(text_vectors.loc[8], text_vectors.loc[6]))
print('Jaccard(8, 7):', jaccard_similarity(text_vectors.loc[8], text_vectors.loc[7]))
Jaccard(8, 3): 0.0
Jaccard(8, 4): 0.0
Jaccard(8, 5): 0.0
Jaccard(8, 6): 0.0
Jaccard(8, 7): 0.0

Since every similarity in the window is 0, all instances are tied; fall back to the majority class of the window (3 Not spam vs. 2 Spam)

Classification: Not spam
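
A quick sanity check of the window majority (using labels as defined in the sketch above, or inline as here):

print(data.set_index('TS')['Class'].loc[3:7].value_counts().to_dict())
{'Not spam': 3, 'Spam': 2}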

TS 9

Window: [4, 5, 6, 7, 8]

print('Jaccard(9, 4):', jaccard_similarity(text_vectors.loc[9], text_vectors.loc[4]))
print('Jaccard(9, 5):', jaccard_similarity(text_vectors.loc[9], text_vectors.loc[5]))
print('Jaccard(9, 6):', jaccard_similarity(text_vectors.loc[9], text_vectors.loc[6]))
print('Jaccard(9, 7):', jaccard_similarity(text_vectors.loc[9], text_vectors.loc[7]))
print('Jaccard(9, 8):', jaccard_similarity(text_vectors.loc[9], text_vectors.loc[8]))
Jaccard(9, 4): 0.3333333333333333
Jaccard(9, 5): 0.2
Jaccard(9, 6): 0.16666666666666666
Jaccard(9, 7): 0.0
Jaccard(9, 8): 0.0

Nearest Neighbors: [4: Not spam, 5: Spam, 6: Not spam] (no ties here: 0.33 > 0.2 > 0.17, so K stays at 3 and the vote is 2 Not spam to 1 Spam)

Classification: Not spam

TS 10

Window: [5, 6, 7, 8, 9]

print('Jaccard(10, 5):', jaccard_similarity(text_vectors.loc[10], text_vectors.loc[5]))
print('Jaccard(10, 6):', jaccard_similarity(text_vectors.loc[10], text_vectors.loc[6]))
print('Jaccard(10, 7):', jaccard_similarity(text_vectors.loc[10], text_vectors.loc[7]))
print('Jaccard(10, 8):', jaccard_similarity(text_vectors.loc[10], text_vectors.loc[8]))
print('Jaccard(10, 9):', jaccard_similarity(text_vectors.loc[10], text_vectors.loc[9]))
Jaccard(10, 5): 0.4
Jaccard(10, 6): 0.14285714285714285
Jaccard(10, 7): 0.0
Jaccard(10, 8): 0.0
Jaccard(10, 9): 0.0

Nearest Neighbors: [5: Spam, 6: Not spam, 7, 8, 9] (several instances are tied for third-nearest neighbor, so keep lowering K until the tie is broken; at K=2 the vote itself is tied, so K drops to 1)

Nearest Neighbor: [5: Spam]

Classification: Spam

TS 11

Window: [6, 7, 8, 9, 10]

print('Jaccard(11, 6):', jaccard_similarity(text_vectors.loc[11], text_vectors.loc[6]))
print('Jaccard(11, 7):', jaccard_similarity(text_vectors.loc[11], text_vectors.loc[7]))
print('Jaccard(11, 8):', jaccard_similarity(text_vectors.loc[11], text_vectors.loc[8]))
print('Jaccard(11, 9):', jaccard_similarity(text_vectors.loc[11], text_vectors.loc[9]))
print('Jaccard(11, 10):', jaccard_similarity(text_vectors.loc[11], text_vectors.loc[10]))
Jaccard(11, 6): 0.14285714285714285
Jaccard(11, 7): 0.16666666666666666
Jaccard(11, 8): 0.6666666666666666
Jaccard(11, 9): 0.0
Jaccard(11, 10): 0.0

Nearest Neighbors: [8: Not spam, 7: Spam, 6: Not spam]

Classification: Not spam

TS 12

Window: [7, 8, 9, 10, 11]

print('Jaccard(12, 7):', jaccard_similarity(text_vectors.loc[12], text_vectors.loc[7]))
print('Jaccard(12, 8):', jaccard_similarity(text_vectors.loc[12], text_vectors.loc[8]))
print('Jaccard(12, 9):', jaccard_similarity(text_vectors.loc[12], text_vectors.loc[9]))
print('Jaccard(12, 10):', jaccard_similarity(text_vectors.loc[12], text_vectors.loc[10]))
print('Jaccard(12, 11):', jaccard_similarity(text_vectors.loc[12], text_vectors.loc[11]))
Jaccard(12, 7): 0.2
Jaccard(12, 8): 0.0
Jaccard(12, 9): 0.0
Jaccard(12, 10): 0.25
Jaccard(12, 11): 0.0

Nearest Neighbors: [10: Spam, 7: Spam, 8, 9, 11] (several instances are tied for third-nearest neighbor, so keep lowering K until the tie is broken; at K=2 both neighbors vote Spam, so no further lowering is needed)

Nearest Neighbors: [10: Spam, 7: Spam]

Classification: Spam

Summary of predictions

r = {'ts': [6, 7, 8, 9, 10, 11, 12],
     'pred': ['Not spam', 'Spam', 'Not spam', 'Not spam', 'Spam', 'Not spam', 'Spam'],
     'actual': ['Not spam', 'Spam', 'Not spam', 'Not spam', 'Spam', 'Spam', 'Not spam']}
results = pd.DataFrame(r).set_index('ts')
results
pred actual
ts
6 Not spam Not spam
7 Spam Spam
8 Not spam Not spam
9 Not spam Not spam
10 Spam Spam
11 Not spam Spam
12 Spam Not spam

When the class distribution is imbalanced, computing Cohen's Kappa against the ground truth gives a more reliable performance estimate than raw accuracy, because Kappa corrects for the agreement expected by chance. Higher values are better.

print('Accuracy:', np.round(accuracy_score(results.actual, results.pred), 2))
print('Kappa   :', np.round(cohen_kappa_score(results.actual, results.pred), 2))
Accuracy: 0.71
Kappa   : 0.42
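
As a cross-check, Kappa can be computed by hand from the summary table: 5 of the 7 predictions agree with the ground truth, and since both columns contain 4 Not spam and 3 Spam labels, the agreement expected by chance is (4/7)² + (3/7)² = 25/49.

p_o = 5 / 7                    # observed agreement
p_e = (4/7)**2 + (3/7)**2      # chance agreement: both Not spam + both Spam
print('Kappa:', np.round((p_o - p_e) / (1 - p_e), 2))
Kappa: 0.42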