Source code for secScraper.metrics

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import difflib
import re
import string
import math
from tqdm import tqdm
import nltk


# tokenize = lambda string_of_text: string_of_text.lower().split(" ")

def cosine_similarity(vector1, vector2):
    """
    Calculates the cosine similarity between two vectors.

    :param vector1: First vector.
    :param vector2: Second vector.
    :return: Cosine similarity of the two vectors, or 0 if either has zero magnitude.
    """
    dot_product = sum(p*q for p, q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product/magnitude

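# A minimal usage sketch for cosine_similarity (the vectors below are made up for illustration):
#   v1 = [1, 2, 0, 1]
#   v2 = [1, 1, 1, 0]
#   cosine_similarity(v1, v2)  # dot = 3, |v1|*|v2| = sqrt(6)*sqrt(3), result ~0.707
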
def diff_jaccard(str1, str2):
    """
    Calculates the Jaccard similarity between two tokenized strings.

    :param str1: First string, as a non-empty list of word tokens.
    :param str2: Second string, as a non-empty list of word tokens.
    :return: float in the [0, 1] interval
    """
    assert type(str1) == list and type(str2) == list
    assert type(str1[0]) == str and type(str2[0]) == str
    a = set(str1)
    b = set(str2)
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

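# A minimal usage sketch for diff_jaccard; it expects tokenized input (lists of words), e.g.:
#   diff_jaccard(['net', 'income', 'rose'], ['net', 'income', 'fell'])
#   # 2 shared tokens out of 4 distinct tokens -> 0.5
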
def diff_sk_cosine_tf(str1, str2, stop_words):
    """
    Calculates the Cosine TF similarity between two strings, directly via sklearn.

    :param str1: First string.
    :param str2: Second string.
    :param stop_words: Stop words passed to TfidfVectorizer.
    :return: float in the [0, 1] interval
    """
    # sklearn_tf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=tokenize)
    sklearn_tf = TfidfVectorizer(norm='l2', stop_words=stop_words, use_idf=False)
    sklearn_representation = sklearn_tf.fit_transform([str1, str2])
    x, y = sklearn_representation.toarray()
    cosine_similarity = linear_kernel(x.reshape(1, -1), y.reshape(1, -1))  # Only works with norm='l2'
    return cosine_similarity[0][0]

def diff_sk_cosine_tf_idf(str1, str2, stop_words):
    """
    Calculates the Cosine TF-IDF similarity between two strings, directly via sklearn.

    :param str1: First string.
    :param str2: Second string.
    :param stop_words: Stop words passed to TfidfVectorizer.
    :return: float in the [0, 1] interval
    """
    # sklearn_tfidf = TfidfVectorizer(norm='l2', min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
    sklearn_tfidf = TfidfVectorizer(norm='l2', stop_words=stop_words, use_idf=True)
    sklearn_representation = sklearn_tfidf.fit_transform([str1, str2])
    x, y = sklearn_representation.toarray()
    cosine_similarity = linear_kernel(x.reshape(1, -1), y.reshape(1, -1))  # Only works with norm='l2'
    return cosine_similarity[0][0]

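# Hedged usage sketch for the two sklearn-backed metrics above. `stop_words` is passed straight
# to TfidfVectorizer, so it can be the string 'english' or an explicit list of words; the sample
# sentences are made up.
#   diff_sk_cosine_tf("revenue grew this quarter", "revenue fell this quarter", 'english')
#   diff_sk_cosine_tf_idf("revenue grew this quarter", "revenue fell this quarter", 'english')
# Both return a float in [0, 1]; the TF-IDF variant also weights terms by inverse document frequency.
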
def diff_cosine_tf(str1, str2):
    """
    Calculates the Cosine TF similarity between two strings.

    :param str1: First string.
    :param str2: Second string.
    :return: float in the [0, 1] interval
    """
    vect = TfidfVectorizer(use_idf=False)  # Per paper
    tf = vect.fit_transform([str1, str2])
    tf_similarity = tf * tf.T
    return float(tf_similarity[0, 1])

def diff_cosine_tf_idf(str1, str2):
    """
    Calculates the Cosine TF-IDF similarity between two strings.

    :param str1: First string.
    :param str2: Second string.
    :return: float in the [0, 1] interval
    """
    vect = TfidfVectorizer(use_idf=True)  # Activate TF-IDF
    tfidf = vect.fit_transform([str1, str2])
    tfidf_similarity = tfidf * tfidf.T
    return float(tfidf_similarity[0, 1])

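# Hedged usage sketch for the two variants above (sentences made up). With TfidfVectorizer's
# default norm='l2', each row is a unit vector, so tf * tf.T directly yields cosine similarities.
#   diff_cosine_tf("the quick brown fox", "the quick red fox")
#   diff_cosine_tf_idf("the quick brown fox", "the quick red fox")
# Both return a float in [0, 1]; identical texts score ~1.0.
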
def diff_minEdit(str1, str2):
    """
    Calculates the minEdit similarity between two strings. This is word based.
    WARNING: VERY SLOW BEYOND ~10,000 CHAR TO COMPARE.

    :param str1: First string.
    :param str2: Second string.
    :return: float in the [0, 1] interval
    """
    f = difflib.SequenceMatcher(None, a=str1, b=str2)
    count_words_str1 = len(re.compile(r'\w+').findall(str1))
    count_words_str2 = len(re.compile(r'\w+').findall(str2))
    transformations = f.get_opcodes()  # Impossible to compute for larger texts
    transformations = [t for t in transformations if t[0] != 'equal']
    similarity = 1 - len(transformations)/(count_words_str1 + count_words_str2)
    similarity = min(1, abs(similarity))  # Prevent it from being negative
    # similarity = f.ratio()  # That could be another option
    return similarity

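# Hedged usage sketch for diff_minEdit (strings made up); it is meant for short raw strings only:
#   diff_minEdit("the cat sat on the mat", "the cat sat on the hat")
#   # one non-'equal' opcode over 12 words -> ~0.92
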
def diff_gfg_editDistDP(str1, str2):
    """
    Calculates an edit distance based similarity between two tokenized strings, via dynamic
    programming (adapted from a GeeksforGeeks implementation).
    WARNING: O(m x n) complexity in RAM & time (if enough RAM...).

    :param str1: First string, as a list of word tokens.
    :param str2: Second string, as a list of word tokens.
    :return: float in the [0, 1] interval
    """
    # str1 = tokenize(str1)
    # str2 = tokenize(str2)
    assert type(str1) == list and type(str2) == list
    assert type(str1[0]) == str and type(str2[0]) == str
    m = len(str1)
    n = len(str2)

    # Create a table to store results of subproblems
    dp = [[0 for x in range(n+1)] for x in range(m+1)]

    # Fill dp[][] in bottom up manner
    for i in range(m+1):
        for j in range(n+1):
            # If the first string is empty, the only option is to
            # insert all the tokens of the second string
            if i == 0:
                dp[i][j] = j  # Min. operations = j
            # If the second string is empty, the only option is to
            # remove all the tokens of the first string
            elif j == 0:
                dp[i][j] = i  # Min. operations = i
            # If the last tokens are the same, ignore them
            # and recur for the remaining strings
            elif str1[i-1] == str2[j-1]:
                dp[i][j] = dp[i-1][j-1]
            # If the last tokens differ, consider all
            # possibilities and keep the minimum
            else:
                dp[i][j] = 1 + min(dp[i][j-1],    # Insert
                                   dp[i-1][j],    # Remove
                                   dp[i-1][j-1])  # Replace
    # return dp[m][n]
    return 1 - dp[m][n]/(m+n)

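# Hedged usage sketch for diff_gfg_editDistDP; it expects tokenized input (e.g. via
# text.lower().split(" "), as in the commented-out tokenize lambda at the top of the module):
#   diff_gfg_editDistDP(['net', 'income', 'rose'], ['net', 'income', 'fell'])
#   # one word-level edit over 6 tokens -> 1 - 1/6 ~= 0.83
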
def diff_simple(str1, str2):
    """
    Calculates the simple difference similarity between two strings. This is character based.
    WARNING: VERY SLOW BEYOND ~10,000 CHAR TO COMPARE.

    :param str1: First string.
    :param str2: Second string.
    :return: float in the [0, 1] interval
    """
    d = difflib.Differ()
    comparison = list(d.compare(str1, str2))
    comparison = [change for change in comparison if change[0] != ' ']
    similarity = 1 - len(comparison)/(len(str1) + len(str2))
    return similarity

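# Hedged usage sketch for diff_simple (strings made up); the comparison is per character:
#   diff_simple("abcd", "abce")
#   # 2 changed characters ('- d', '+ e') over 8 characters -> 0.75
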
def diff_edit_distance(str1, str2):
    """
    Calculates the edit distance between two strings via nltk, normalized by their combined length.

    :param str1: First string (or list of word tokens).
    :param str2: Second string (or list of word tokens).
    :return: float in the [0, 1] interval; 0 means identical, higher means more edits are needed.
    """
    # str1 = tokenize(str1)
    # str2 = tokenize(str2)
    return nltk.edit_distance(str1, str2)/(len(str1) + len(str2))

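# Hedged usage sketch for diff_edit_distance. nltk.edit_distance works on plain strings
# (character level) or on lists of tokens (word level), so both call styles are possible:
#   diff_edit_distance("kitten", "sitting")   # 3 edits / 13 characters -> ~0.23
#   diff_edit_distance("net income rose".split(" "), "net income fell".split(" "))  # -> ~0.17
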
def composite_index(data):
    """
    Create a composite index based on the sentiment analysis based on Loughran and McDonald's
    dictionary and script.

    :param data: List of values produced by the sentiment analysis. See the unused variable
        OUTPUT_FIELDS below for what each entry means, there is a lot.
    :return: Composite index, a single float.
    """
    OUTPUT_FIELDS = ['file type', 'file size', 'number of words', '% positive', '% negative',
                     '% uncertainty', '% litigious', '% modal-weak', '% modal moderate',
                     '% modal strong', '% constraining', '# of alphabetic', '# of digits',
                     '# of numbers', 'avg # of syllables per word', 'average word length',
                     'vocabulary']

    # Sign will be that of the positive - negative proportion. Averaged by number of words.
    if data[2] + data[3] + data[4]:
        result = (data[3] - data[4])/data[2]
    else:
        # Avoid the case when the text is too short and a division by zero error is thrown
        result = 0
    return result

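# Hedged worked example for composite_index: only data[2] (word count), data[3] (positive count)
# and data[4] (negative count) drive the result; the remaining entries are placeholders here.
#   composite_index([None, None, 100, 6, 2] + [0] * 12)
#   # (6 - 2) / 100 -> 0.04
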
def sing_sentiment(text, lm_dictionary):
    """
    Run the Loughran and McDonald's sentiment analysis on a string.

    :param text: String to analyze.
    :param lm_dictionary: Sentiment dictionary.
    :return: Composite sentiment index for the text (see composite_index).
    """
    text_len = len(text)
    text = re.sub('(May|MAY)', ' ', text)  # drop all May month references
    text = text.upper()  # for this parse, caps aren't informative so shift to upper case
    output_data = _get_data(text, lm_dictionary)
    output_data[0] = type(text)
    output_data[1] = text_len
    result = composite_index(output_data)
    return result

def _get_data(text, lm_dictionary):
    """
    Internal function to load the data and process it - comes from Loughran and McDonald's work
    with light modifications to incorporate it in my script.

    :param text: String to analyze.
    :param lm_dictionary: Sentiment dictionary.
    :return: List of 17 computed fields (see OUTPUT_FIELDS in composite_index for their meaning).
    """
    vdictionary = {}
    _odata = [0] * 17
    total_syllables = 0
    word_length = 0

    tokens = re.findall(r'\w+', text)  # Note that \w+ splits hyphenated words
    for token in tokens:
        if not token.isdigit() and len(token) > 1 and token in lm_dictionary:
            _odata[2] += 1  # word count
            word_length += len(token)
            if token not in vdictionary:
                vdictionary[token] = 1
            if lm_dictionary[token].positive:
                _odata[3] += 1
            if lm_dictionary[token].negative:
                _odata[4] += 1
            if lm_dictionary[token].uncertainty:
                _odata[5] += 1
            if lm_dictionary[token].litigious:
                _odata[6] += 1
            if lm_dictionary[token].weak_modal:
                _odata[7] += 1
            if lm_dictionary[token].moderate_modal:
                _odata[8] += 1
            if lm_dictionary[token].strong_modal:
                _odata[9] += 1
            if lm_dictionary[token].constraining:
                _odata[10] += 1
            total_syllables += lm_dictionary[token].syllables

    _odata[11] = len(re.findall('[A-Z]', text))
    _odata[12] = len(re.findall('[0-9]', text))
    # drop punctuation within numbers for number count
    text = re.sub(r'(?!=[0-9])(\.|,)(?=[0-9])', '', text)
    text = text.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))
    _odata[13] = len(re.findall(r'\b[-+\(]?[$€£]?[-+(]?\d+\)?\b', text))
    _odata[14] = total_syllables / _odata[2]
    _odata[15] = word_length / _odata[2]
    _odata[16] = len(vdictionary)  # Vocabulary

    # Convert counts to %
    for i in range(3, 10 + 1):
        _odata[i] = (_odata[i] / _odata[2]) * 100

    return _odata

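# A small, self-contained demo, guarded so it only runs when the module is executed directly.
# In real use lm_dictionary is built from Loughran and McDonald's master dictionary; the mock
# below is purely illustrative and only provides the attributes _get_data actually reads.
if __name__ == '__main__':
    from collections import namedtuple

    MockEntry = namedtuple('MockEntry', ['positive', 'negative', 'uncertainty', 'litigious',
                                         'weak_modal', 'moderate_modal', 'strong_modal',
                                         'constraining', 'syllables'])

    def _mock_entry(positive=0, negative=0, syllables=1):
        # Every other sentiment category defaults to 0 in this toy dictionary
        return MockEntry(positive, negative, 0, 0, 0, 0, 0, 0, syllables)

    # Keys are upper case because sing_sentiment upper-cases the text before the lookup
    mock_lm_dictionary = {
        'REVENUE': _mock_entry(syllables=3),
        'GROWTH': _mock_entry(positive=1, syllables=1),
        'IMPROVED': _mock_entry(positive=1, syllables=2),
        'QUARTER': _mock_entry(syllables=2),
    }

    print(sing_sentiment("Revenue growth improved this quarter.", mock_lm_dictionary))

    # A few of the similarity metrics side by side, on two made-up sentences
    s1, s2 = "net income rose sharply", "net income fell sharply"
    print(diff_jaccard(s1.split(" "), s2.split(" ")))
    print(diff_cosine_tf(s1, s2))
    print(diff_minEdit(s1, s2))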