from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import difflib
import re
import string
import math
from tqdm import tqdm
import nltk
# tokenize = lambda string_of_text: string_of_text.lower().split(" ")
def cosine_similarity(vector1, vector2):
    """
    Calculates the cosine similarity between two equal-length numeric vectors.
    Returns 0 when either vector has zero magnitude.
    """
dot_product = sum(p*q for p,q in zip(vector1, vector2))
magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
if not magnitude:
return 0
return dot_product/magnitude
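# Quick sanity check (illustrative values only):
#   >>> cosine_similarity([1, 0, 1], [1, 1, 0])
#   0.5
#   >>> cosine_similarity([1, 2], [2, 4])  # Parallel vectors
#   1.0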
def diff_jaccard(str1, str2):
    """
    Calculates the Jaccard similarity between two tokenized strings.
    :param str1: First string, pre-tokenized as a non-empty list of words.
    :param str2: Second string, pre-tokenized as a non-empty list of words.
    :return: float in the [0, 1] interval
    """
    assert type(str1) == list and type(str2) == list
    assert type(str1[0]) == str and type(str2[0]) == str
a = set(str1)
b = set(str2)
c = a.intersection(b)
return float(len(c)) / (len(a) + len(b) - len(c))
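# Example with hypothetical token lists (2 shared tokens out of 4 distinct ones):
#   >>> diff_jaccard(["the", "cat", "sat"], ["the", "cat", "ran"])
#   0.5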
def diff_sk_cosine_tf(str1, str2, stop_words):
    """
    Calculates the cosine TF similarity between two strings, directly via sklearn.
    :param stop_words: Stop words for TfidfVectorizer (e.g. 'english' or a list of words).
    :return: float in the [0, 1] interval
    """
    # sklearn_tf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=tokenize)
    sklearn_tf = TfidfVectorizer(norm='l2', stop_words=stop_words, use_idf=False)
    sklearn_representation = sklearn_tf.fit_transform([str1, str2])
    x, y = sklearn_representation.toarray()
    similarity = linear_kernel(x.reshape(1, -1), y.reshape(1, -1))  # Equal to cosine similarity only because norm='l2'
    return similarity[0][0]
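# Illustrative usage (hypothetical sentences; the exact value depends on sklearn's
# tokenization and stop-word handling):
#   sim = diff_sk_cosine_tf("the quick brown fox jumps", "the quick brown fox leaps", "english")
#   # 3 of the 4 terms surviving stop-word removal are shared -> sim == 0.75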
def diff_sk_cosine_tf_idf(str1, str2, stop_words):
    """
    Calculates the cosine TF-IDF similarity between two strings, directly via sklearn.
    :return: float in the [0, 1] interval
    """
    # sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
    sklearn_tfidf = TfidfVectorizer(norm='l2', stop_words=stop_words, use_idf=True)
    sklearn_representation = sklearn_tfidf.fit_transform([str1, str2])
    x, y = sklearn_representation.toarray()
    similarity = linear_kernel(x.reshape(1, -1), y.reshape(1, -1))  # Equal to cosine similarity only because norm='l2'
    return similarity[0][0]
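# Illustrative usage (same hypothetical sentences; IDF down-weights the shared terms
# relative to the unique ones, so the score drops):
#   sim = diff_sk_cosine_tf_idf("the quick brown fox jumps", "the quick brown fox leaps", "english")
#   # sim is approximately 0.60 with sklearn's default smooth IDF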
def diff_cosine_tf(str1, str2):
"""
Calculates the Cosine TF similarity between two strings.
:param str1: First string.
:param str2: Second string.
:return: float in the [0, 1] interval
"""
vect = TfidfVectorizer(use_idf=False) # Per paper
tf = vect.fit_transform([str1, str2])
tf_similarity = tf * tf.T
return float(tf_similarity[0, 1])
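# Illustrative usage (hypothetical sentences; with use_idf=False this is plain
# term-frequency cosine, and no stop words are removed):
#   sim = diff_cosine_tf("the quick brown fox jumps", "the quick brown fox leaps")
#   # 4 of the 5 terms in each sentence are shared -> sim is approximately 0.8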
def diff_cosine_tf_idf(str1, str2):
"""
Calculates the Cosine TF-IDF similarity between two strings.
:param str1: First string.
:param str2: Second string.
:return: float in the [0, 1] interval
"""
vect = TfidfVectorizer(use_idf=True) # Activate TF-IDF
tfidf = vect.fit_transform([str1, str2])
tfidf_similarity = tfidf * tfidf.T
return float(tfidf_similarity[0, 1])
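# Illustrative usage (same hypothetical sentences; IDF gives the two unique words
# more weight, lowering the similarity):
#   sim = diff_cosine_tf_idf("the quick brown fox jumps", "the quick brown fox leaps")
#   # sim is approximately 0.67 with sklearn's default smooth IDF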
def diff_minEdit(str1, str2):
"""
Calculates the minEdit similarity between two strings. This is word based.
WARNING: VERY SLOW BEYOND ~10,000 CHAR TO COMPARE.
:param str1: First string.
:param str2: Second string.
:return: float in the [0, 1] interval
"""
f = difflib.SequenceMatcher(None, a=str1, b=str2)
count_words_str1 = len(re.compile(r'\w+').findall(str1))
count_words_str2 = len(re.compile(r'\w+').findall(str2))
    transformations = f.get_opcodes()  # Prohibitively slow to compute for larger texts
    transformations = [t for t in transformations if t[0] != 'equal']
    similarity = 1 - len(transformations)/(count_words_str1 + count_words_str2)
    similarity = min(1.0, abs(similarity))  # Clamp to [0, 1]
    # similarity = f.ratio()  # That could be another option
return similarity
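# Illustrative usage (hypothetical strings; SequenceMatcher works character-wise,
# but the opcode count is normalized by the word counts):
#   sim = diff_minEdit("the cat sat", "the cat ran")
#   # 2 non-'equal' opcodes over 3 + 3 words -> sim is approximately 0.67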
def diff_gfg_editDistDP(str1, str2):
    """
    Calculates an edit-distance-based similarity between two tokenized strings, using
    the classic dynamic-programming algorithm (adapted from GeeksforGeeks).
    WARNING: O(m x n) complexity in RAM & time (if enough RAM...).
    :param str1: First string, pre-tokenized as a non-empty list of words.
    :param str2: Second string, pre-tokenized as a non-empty list of words.
    :return: float in the [0, 1] interval
    """
    # str1 = tokenize(str1)
    # str2 = tokenize(str2)
    assert type(str1) == list and type(str2) == list
    assert type(str1[0]) == str and type(str2[0]) == str
m = len(str1)
n = len(str2)
# Create a table to store results of subproblems
    dp = [[0] * (n + 1) for _ in range(m + 1)]
# Fill d[][] in bottom up manner
for i in range(m+1):
for j in range(n+1):
# If first string is empty, only option is to
# insert all characters of second string
if i == 0:
dp[i][j] = j # Min. operations = j
            # If second string is empty, only option is to
            # remove all characters of first string
            elif j == 0:
                dp[i][j] = i  # Min. operations = i
# If last characters are same, ignore last char
# and recur for remaining string
elif str1[i-1] == str2[j-1]:
dp[i][j] = dp[i-1][j-1]
# If last character are different, consider all
# possibilities and find minimum
else:
dp[i][j] = 1 + min(dp[i][j-1], # Insert
dp[i-1][j], # Remove
dp[i-1][j-1]) # Replace
    # return dp[m][n]
    return 1 - dp[m][n]/(m+n)  # Convert the raw edit distance into a similarity
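# Illustrative usage (hypothetical token lists; one substitution out of 3 + 3 tokens):
#   sim = diff_gfg_editDistDP(["the", "cat", "sat"], ["the", "cat", "ran"])
#   # dp[m][n] == 1, so sim == 1 - 1/6, approximately 0.83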
def diff_simple(str1, str2):
"""
Calculates the simple difference similarity between two strings. This is character based.
WARNING: VERY SLOW BEYOND ~10,000 CHAR TO COMPARE.
:param str1: First string.
:param str2: Second string.
:return: float in the [0, 1] interval
"""
d = difflib.Differ()
comparison = list(d.compare(str1, str2))
comparison = [change for change in comparison if change[0] != ' ']
similarity = 1-len(comparison)/(len(str1) + len(str2))
return similarity
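# Illustrative usage (hypothetical strings; Differ emits one '-' and one '+' entry
# per substituted character):
#   sim = diff_simple("abc", "abd")
#   # 2 changed entries over 3 + 3 characters -> sim is approximately 0.67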
def diff_edit_distance(str1, str2):
    """
    Calculates the normalized edit distance between two strings or token lists.
    Note: unlike the other diff_* functions, this returns a distance (0 means identical),
    not a similarity.
    :return: float in the [0, 1] interval
    """
    # str1 = tokenize(str1)
    # str2 = tokenize(str2)
    return nltk.edit_distance(str1, str2)/(len(str1) + len(str2))
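# Illustrative usage (hypothetical strings; remember this is a distance, not a similarity):
#   dist = diff_edit_distance("abc", "abd")
#   # 1 substitution over 3 + 3 characters -> dist is approximately 0.17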
def composite_index(data):
    """
    Create a composite sentiment index from the Loughran and McDonald sentiment
    statistics produced by _get_data.
    :param data: List of statistics returned by _get_data; see the (otherwise unused)
        OUTPUT_FIELDS below for the layout.
    :return: float, positive when positive words dominate and negative otherwise.
    """
OUTPUT_FIELDS = ['file type', 'file size', 'number of words', '% positive', '% negative',
'% uncertainty', '% litigious', '% modal-weak', '% modal moderate',
'% modal strong', '% constraining', '# of alphabetic', '# of digits',
'# of numbers', 'avg # of syllables per word', 'average word length', 'vocabulary']
    # The index is the net percentage of positive minus negative words, normalized by the word count.
    if data[2] + data[3] + data[4]:
        result = (data[3] - data[4])/data[2]
    else:  # Avoid a division by zero when the text is too short to contain any dictionary words
        result = 0
return result
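# Illustrative usage with a hypothetical statistics list laid out as in _get_data
# (index 2: word count, index 3: % positive, index 4: % negative):
#   data = [str, 1000, 100, 4.0, 2.0] + [0] * 12
#   # composite_index(data) == (4.0 - 2.0) / 100 == 0.02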
def sing_sentiment(text, lm_dictionary):
    """
    Run the Loughran and McDonald sentiment analysis on a string.
    :param text: String to analyze.
    :param lm_dictionary: Sentiment dictionary mapping upper-case words to their categories.
    :return: float, the composite sentiment index (see composite_index).
    """
    text_len = len(text)
    text = re.sub('(May|MAY)', ' ', text)  # Drop all references to the month of May (a modal-verb false positive)
    text = text.upper()  # Case is not informative for this parse, so shift everything to upper-case
output_data = _get_data(text, lm_dictionary)
output_data[0] = type(text)
output_data[1] = text_len
result = composite_index(output_data)
return result
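# Illustrative usage (hypothetical; assumes lm_dictionary has already been loaded from
# Loughran and McDonald's master dictionary, mapping upper-case words to entries with
# .positive, .negative, etc. attributes, as _get_data expects):
#   score = sing_sentiment("The company reported strong and profitable growth.", lm_dictionary)
#   # score > 0 suggests a net positive tone, score < 0 a net negative one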
def _get_data(text, lm_dictionary):
    """
    Internal function to compute the sentiment statistics - comes from Loughran and McDonald's
    work with light modifications to incorporate it in my script.
    :param text: String to analyze.
    :param lm_dictionary: Sentiment dictionary
    :return: List of 17 statistics; see OUTPUT_FIELDS in composite_index for the layout.
    """
vdictionary = {}
_odata = [0] * 17
total_syllables = 0
word_length = 0
tokens = re.findall(r'\w+', text) # Note that \w+ splits hyphenated words
for token in tokens:
if not token.isdigit() and len(token) > 1 and token in lm_dictionary:
_odata[2] += 1 # word count
word_length += len(token)
if token not in vdictionary:
vdictionary[token] = 1
if lm_dictionary[token].positive: _odata[3] += 1
if lm_dictionary[token].negative: _odata[4] += 1
if lm_dictionary[token].uncertainty: _odata[5] += 1
if lm_dictionary[token].litigious: _odata[6] += 1
if lm_dictionary[token].weak_modal: _odata[7] += 1
if lm_dictionary[token].moderate_modal: _odata[8] += 1
if lm_dictionary[token].strong_modal: _odata[9] += 1
if lm_dictionary[token].constraining: _odata[10] += 1
total_syllables += lm_dictionary[token].syllables
_odata[11] = len(re.findall('[A-Z]', text))
_odata[12] = len(re.findall('[0-9]', text))
# drop punctuation within numbers for number count
    text = re.sub(r'(?<=[0-9])(\.|,)(?=[0-9])', '', text)
text = text.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))
_odata[13] = len(re.findall(r'\b[-+\(]?[$€£]?[-+(]?\d+\)?\b', text))
    word_count = _odata[2] or 1  # Guard against texts with no dictionary words (avoid division by zero)
    _odata[14] = total_syllables / word_count
    _odata[15] = word_length / word_count
    _odata[16] = len(vdictionary)  # Vocabulary size
    # Convert counts to %
    for i in range(3, 10 + 1):
        _odata[i] = (_odata[i] / word_count) * 100
    return _odata
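# Minimal sketch of the dictionary entry shape _get_data relies on (hypothetical stand-in;
# the real entries come from Loughran and McDonald's master dictionary loader):
#   from collections import namedtuple
#   LMEntry = namedtuple('LMEntry', ['positive', 'negative', 'uncertainty', 'litigious',
#                                    'weak_modal', 'moderate_modal', 'strong_modal',
#                                    'constraining', 'syllables'])
#   lm_dictionary = {'GROWTH': LMEntry(1, 0, 0, 0, 0, 0, 0, 0, 1)}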