# complexity_measures.py
import wikipedia
import requests
#import spacy
import regex as re
import enchant
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.data import find
import numpy as np
import gensim
import textacy
from textacy import preprocessing
from textacy import extract, text_stats
# Register the textacy doc extensions used below (extract helpers and readability stats).
textacy.set_doc_extensions("extract")
textacy.set_doc_extensions("text_stats.readability")
textacy.remove_doc_extensions("extract.matches")
from scraping import scrape
"""
See also: https://github.com/tsproisl/textcomplexity
https://quanteda.io/articles/quickstart.html
"""
# US English dictionary used to filter out tokens that are not recognizable words.
DICTIONARY=enchant.Dict("en_US")
def true_unpack(text):
    """
    Takes a string of text.
    Returns (by_sentence, flat): by_sentence is a nested list whose inner lists are the
    words of each sentence; flat is a single list of all words, not separated into
    sentences. Hyphenated tokens are split into their parts.
    """
    by_sentence=[]
    flat=[]
    for sentence in sent_tokenize(text):
        sent=[]
        for word in word_tokenize(sentence):
            for subword in word.split('-'):
                sent.append(subword)
                flat.append(subword)
        by_sentence.append(sent)
    return by_sentence, flat
def unpack_by_word(text, dictionary):
    """
    Splits text on whitespace and basic punctuation and keeps only tokens that are
    dictionary words containing at least one alphanumeric character, or pure digits.
    Returns a flat list of words.
    """
    splitted=re.split(r'[ \n.,:?!]', text)
    refined=[]
    for word in splitted:
        letter_exists=len(re.findall(r'[a-z0-9]', word.lower()))
        if len(word)>0 and dictionary.check(word) and letter_exists:
            refined.append(word)
        elif word.isdigit():
            refined.append(word)
    return refined
def unpack_by_sentence(text, dictionary):
    """
    Splits text into sentences on newlines and terminal punctuation, then into words
    on spaces, keeping only dictionary words and digits.
    Returns a nested list: one inner list of words per non-empty sentence.
    """
    splitted=re.split(r'[\n.?!]', text)
    refined=[]
    for phrase in splitted:
        sentence=[]
        for word in phrase.split(' '):
            if len(word)>0 and dictionary.check(word):
                sentence.append(word)
            elif word.isdigit():
                sentence.append(word)
        if len(sentence)>0:
            refined.append(sentence)
    return refined
def average_length(sample):
    """
    For a flattened sample (from unpack_by_word). Returns the average word length.
    """
    return sum([len(word) for word in sample])/len(sample)
def lexical_diversity(sample, model):
    """
    For a flattened sample: a depth-1 list of all words in an article, not separated
    into sentences (generated by unpack_by_word).
    Returns the type-token ratio over words that are in the word-vector model or the
    dictionary, with add-one smoothing to avoid division by zero.
    """
    new_sample=[]
    for word in sample:
        if len(word)!=0 and (word in model or DICTIONARY.check(word)):
            new_sample.append(word)
    return (len(set(new_sample))+1)/(len(new_sample)+1)
def average_sentence_length(sample):
    """
    For a nested list sample, separated by sentence (generated by unpack_by_sentence).
    Returns the average number of words per sentence.
    """
    return sum([len(sentence) for sentence in sample])/len(sample)
def average_tree_depth(sample, parser):
    """
    Not yet implemented. Can be done with the spaCy library (https://spacy.io/).
    The sample would need to have its sentences separated out, i.e. the output of
    unpack_by_sentence. However, spaCy needs strings, not lists, so the inner sentence
    lists should be joined back into strings before feeding them to the spaCy model.
    """
    pass
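# A minimal sketch of the spaCy approach described above (not part of the original module).
# It assumes spaCy and the en_core_web_sm model are installed with the dependency parser
# enabled; the names spacy_tree_depth and _token_depth are illustrative, not original code.
def _token_depth(token):
    # Distance from a token to the root of its dependency tree (a root is its own head).
    depth=0
    while token.head is not token:
        token=token.head
        depth+=1
    return depth
def spacy_tree_depth(sample, nlp):
    """
    sample: nested list from unpack_by_sentence; nlp: e.g. spacy.load('en_core_web_sm').
    Joins each inner word list back into a string, parses it, and returns the average of
    the deepest token depth per sentence.
    """
    depths=[]
    for sentence in sample:
        doc=nlp(' '.join(sentence))
        depths.append(max((_token_depth(tok) for tok in doc), default=0))
    return sum(depths)/len(depths) if depths else None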
def lexical_cohesion(sample):
    """
    Not yet implemented. Sentence vectors can be generated with a variety of GPT- and
    BERT-based packages; sbert is a good option: https://www.sbert.net/
    A model tuned for semantic search works well, see:
    https://medium.com/mlearning-ai/semantic-search-with-s-bert-is-all-you-need-951bc710e160
    """
    pass
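# A minimal sketch of the SBERT approach described above (not part of the original module),
# assuming the sentence-transformers package is installed. It encodes sentences with a
# pretrained model and averages the cosine similarities of adjacent sentences, mirroring
# mock_lexical_cohesion below but with sentence embeddings instead of averaged word vectors.
# The function name and the default model choice are illustrative.
def sbert_lexical_cohesion(sample, model_name="all-MiniLM-L6-v2"):
    from sentence_transformers import SentenceTransformer
    sentences=[' '.join(s) for s in sample]
    if len(sentences)<2:
        return None
    embedder=SentenceTransformer(model_name)
    vecs=embedder.encode(sentences)
    sims=[np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
          for a, b in zip(vecs[:-1], vecs[1:])]
    return sum(sims)/len(sims)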
def num_verbs_per_sentence(tokenized_sample):
    """
    For a nested list sample, separated by sentences.
    Returns the average number of verbs (POS tags starting with 'V') per sentence.
    """
    verb_counter=0
    for sentence in tokenized_sample:
        for word in nltk.pos_tag(sentence):
            if word[1][0]=='V':
                verb_counter+=1
    return verb_counter/len(tokenized_sample)
def create_mock_vec(sample, model):
    """
    sample must be a single sentence, i.e. a list of the words in that sentence.
    Returns the average of the word vectors in the sentence, ignoring articles and
    prepositions (DT, TO, IN) so the sentence vectors are more distinctive, or None
    if no word in the sentence is in the model.
    """
    ignore=["DT", "TO", "IN"]
    summed=0
    len_words=0
    # Average the word vectors of the in-vocabulary, non-ignored words.
    for word in nltk.pos_tag(sample):
        if word[0] in model and word[1] not in ignore:
            summed+=model[word[0]]
            len_words+=1
    if len_words==0:
        return None
    return summed/len_words
def freq_unknowns(sample, model):
    """
    For a nested (depth-2) list sample, separated by sentences.
    Returns the average number of words per sentence that are not in the word-vector model.
    """
    return sum([1
        for sentence in sample for word in sentence if word not in model
    ])/len(sample)
def mock_lexical_cohesion(sample, model):
    """
    For a nested list sample, separated by sentences.
    Approximates lexical cohesion as the average cosine similarity between the averaged
    word vectors (create_mock_vec) of adjacent sentences.
    Returns None if no adjacent pair of sentences could be compared.
    """
    summed=0
    len_sentences=0
    previous=create_mock_vec(sample[0], model)
    for s in range(1, len(sample)):
        next_s=create_mock_vec(sample[s], model)
        if previous is None:
            previous=next_s
        elif next_s is not None:
            summed+=np.dot(previous, next_s)/(np.linalg.norm(previous)*np.linalg.norm(next_s))
            previous=next_s
            len_sentences+=1
    if len_sentences==0:
        return None
    return summed/len_sentences
def min_mlex_cohesion(sample, model):
    """
    Same as mock_lexical_cohesion, but returns the minimum cosine similarity between
    adjacent sentence vectors rather than the average (1 if no pair could be compared).
    """
    min_=1
    previous=create_mock_vec(sample[0], model)
    for s in range(1, len(sample)):
        next_s=create_mock_vec(sample[s], model)
        if previous is None:
            previous=next_s
        elif next_s is not None:
            min_=min(np.dot(previous, next_s)/(np.linalg.norm(previous)*np.linalg.norm(next_s)), min_)
            previous=next_s
    return min_
def fraction_sentences(sample, threshold=30):
    """
    For a nested list sample, separated by sentences.
    Returns the fraction of sentences with at least `threshold` words.
    """
    return sum([len(sentence)>=threshold for sentence in sample])/len(sample)
# Download the NLTK resources and the pruned word2vec sample used by the measures above.
nltk.download('word2vec_sample')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)
# Article to compare across regular and Simple English Wikipedia.
#art = 'Atomic_nucleus'
art = 'DNA'
wiki_page=scrape(f"https://en.wikipedia.org/wiki/{art}")
simple_wiki=scrape(f"https://simple.wikipedia.org/wiki/{art}")
text = preprocessing.normalize.whitespace(preprocessing.remove.punctuation(wiki_page))
# Readability scores via textacy's text_stats extensions.
en=textacy.load_spacy_lang("en_core_web_sm", disable=("parser",))
doc = textacy.make_spacy_doc(wiki_page, lang=en)
print(doc._.flesch_reading_ease())
doc = textacy.make_spacy_doc(simple_wiki, lang=en)
print(doc._.flesch_reading_ease())
sent_tokens_a, flat_a=true_unpack(wiki_page)
sent_tokens_s, flat_s=true_unpack(simple_wiki)
print(f"Regular Wikipedia scores (page: {art})")
print("average word length: ", average_length(flat_a))
print("lexical diversity: ", lexical_diversity(flat_a, model))
print("unknowns per sentence: ", freq_unknowns(sent_tokens_a, model))
print("average sentence length: ", average_sentence_length(sent_tokens_a))
print("fraction of sentences with 30 or more words: ", fraction_sentences(sent_tokens_a))
print("verbs per sentence: ", num_verbs_per_sentence(sent_tokens_a))
print("mock lexical cohesion: ", mock_lexical_cohesion(sent_tokens_a, model))
print("min lexical cohesion: ", min_mlex_cohesion(sent_tokens_a, model))
print()
print(f"Simple Wikipedia scores (page: {art})")
print("average word length: ", average_length(flat_s))
print("lexical diversity: ", lexical_diversity(flat_s, model))
print("unknowns per sentence: ", freq_unknowns(sent_tokens_s, model))
print("average sentence length: ", average_sentence_length(sent_tokens_s))
print("fraction of sentences with 30 or more words: ", fraction_sentences(sent_tokens_s))
print("verbs per sentence: ", num_verbs_per_sentence(sent_tokens_s))
print("mock lexical cohesion: ", mock_lexical_cohesion(sent_tokens_s, model))
print("min lexical cohesion: ", min_mlex_cohesion(sent_tokens_s, model))