Most frequently used words in a text with Python

The challenge

Write a function that, given a string of text (possibly with punctuation and line-breaks), returns an array of the top-3 most occurring words, in descending order of the number of occurrences.

Assumptions:

  • A word is a string of letters (A to Z) optionally containing one or more apostrophes (‘) in ASCII. (No need to handle fancy punctuation.)
  • Matches should be case-insensitive, and the words in the result should be lowercased.
  • Ties may be broken arbitrarily.
  • If a text contains fewer than three unique words, then either the top-2 or top-1 words should be returned, or an empty array if a text contains no words.

Examples:

top_3_words("In a village of La Mancha, the name of which I have no desire to call to mind, there lived not long since one of those gentlemen that keep a lance in the lance-rack, an old buckler, a lean hack, and a greyhound for coursing. An olla of rather more beef than mutton, a salad on most nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra on Sundays, made away with three-quarters of his income.") # => ["a", "of", "on"] top_3_words("e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e") # => ["e", "ddd", "aa"] top_3_words(" //wont won't won't") # => ["won't", "wont"]
Code language: Python (python)

Bonus points:

  1. Avoid creating an array whose memory footprint is roughly as big as the input text.
  2. Avoid sorting the entire array of unique words.

Test cases

from random import choice, randint, sample, shuffle, choices import re from collections import Counter def check(s, this=None): # this: only for debugging purpose returned_result = top_3_words(s) if this is None else this fs = Counter(w for w in re.findall(r"[a-zA-Z']+", s.lower()) if w != "'" * len(w)) exp,expected_frequencies = map(list,zip(*fs.most_common(3))) if fs else ([],[]) msg = '' wrong_words = [w for w in returned_result if not fs[w]] actual_freq = [fs[w] for w in returned_result] if wrong_words: msg = 'Incorrect match: words not present in the string. Your output: {}. One possible valid answer: {}'.format(returned_result, exp) elif len(set(returned_result)) != len(returned_result): msg = 'The result should not contain copies of the same word. Your output: {}. One possible output: {}'.format(returned_result, exp) elif actual_freq!=expected_frequencies: msg = "Incorrect frequencies: {} should be {}. Your output: {}. One possible output: {}".format(actual_freq, expected_frequencies, returned_result, exp) Test.expect(not msg, msg) @test.describe("Fixed tests") def fixed_tests(): TESTS = ( "a a a b c c d d d d e e e e e", "e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e", " //wont won't won't ", " , e .. ", " ... ", " ' ", " ''' ", """In a village of La Mancha, the name of which I have no desire to cao mind, there lived not long since one of those gentlemen that keep a lance in the lance-rack, an old buckler, a lean hack, and a greyhound for coursing. An olla of rather more beef than mutton, a salad on most nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra on Sundays, made away with three-quarters of his income.""", "a a a b c c X", "a a c b b", ) for s in TESTS: check(s) @test.describe("Random tests") def random_tests(): def gen_word(): return "".join(choice("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'") for _ in range(randint(3, 10))) def gen_string(): words = [] nums = choices(range(1, 31), k=20) for _ in range(randint(0, 20)): words += [gen_word()] * nums.pop() shuffle(words) s = "" while words: s += words.pop() + "".join(choice("-,.?!_:;/ ") for _ in range(randint(1, 5))) return s @test.it("Tests") def it_1(): for _ in range(100): check(gen_string())
Code language: Python (python)

The solution using Python

Option 1:

# use the Counter module from collections import Counter # use the regex module import re def top_3_words(text): # count the input, pass through a regex and lowercase it c = Counter(re.findall(r"[a-z']+", re.sub(r" '+ ", " ", text.lower()))) # return the `most common` 3 items return [w for w,_ in c.most_common(3)]
Code language: Python (python)

Option 2:

def top_3_words(text): # loop through each character in the string for c in text: # if it's not alphanumeric or an apostrophe if not (c.isalpha() or c=="'"): # replace with a space text = text.replace(c,' ') # create some `list` variables words,counts,out = [],[],[] # loop through the words in the text for word in list(filter(None,text.lower().split())): # if in all, then continue if all([not c.isalpha() for c in word]): continue # if the word is in the words list if word in words: # increment the count counts[words.index(word)] += 1 else: # otherwise create a new entry words.append(word); counts.append(0) # loop while bigger than 0 and less than 3 while len(words)>0 and len(out)<3: # append the counts out.append(words.pop(counts.index(max(counts))).lower()) counts.remove(max(counts)) # return the counts return out
Code language: Python (python)

Option 3:

def top_3_words(text): wrds = {} for p in r'!"#$%&()*+,./:;<=>[email protected][\]^_`{|}~-': text = text.replace(p, ' ') for w in text.lower().split(): if w.replace("'", '') != '': wrds[w] = wrds.get(w, 0) + 1 return [y[0] for y in sorted(wrds.items(), key=lambda x: x[1], reverse=True)[:3]]
Code language: Python (python)
Tags:
Subscribe
Notify of
guest
0 Comments
Inline Feedbacks
View all comments