The challenge
Write a function that, given a string of text (possibly with punctuation and line-breaks), returns an array of the top-3 most occurring words, in descending order of the number of occurrences.
Assumptions:
- A word is a string of letters (A to Z) optionally containing one or more apostrophes (‘) in ASCII. (No need to handle fancy punctuation.)
- Matches should be case-insensitive, and the words in the result should be lowercased.
- Ties may be broken arbitrarily.
- If a text contains fewer than three unique words, then either the top-2 or top-1 words should be returned, or an empty array if a text contains no words.
Examples:
top_3_words("In a village of La Mancha, the name of which I have no desire to call to mind, there lived not long since one of those gentlemen that keep a lance in the lance-rack, an old buckler, a lean hack, and a greyhound for coursing. An olla of rather more beef than mutton, a salad on most nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra on Sundays, made away with three-quarters of his income.") # => ["a", "of", "on"] top_3_words("e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e") # => ["e", "ddd", "aa"] top_3_words(" //wont won't won't") # => ["won't", "wont"]
Bonus points:
- Avoid creating an array whose memory footprint is roughly as big as the input text.
- Avoid sorting the entire array of unique words.
Test cases
from random import choice, randint, sample, shuffle, choices import re from collections import Counter def check(s, this=None): # this: only for debugging purpose returned_result = top_3_words(s) if this is None else this fs = Counter(w for w in re.findall(r"[a-zA-Z']+", s.lower()) if w != "'" * len(w)) exp,expected_frequencies = map(list,zip(*fs.most_common(3))) if fs else ([],[]) msg = '' wrong_words = [w for w in returned_result if not fs[w]] actual_freq = [fs[w] for w in returned_result] if wrong_words: msg = 'Incorrect match: words not present in the string. Your output: {}. One possible valid answer: {}'.format(returned_result, exp) elif len(set(returned_result)) != len(returned_result): msg = 'The result should not contain copies of the same word. Your output: {}. One possible output: {}'.format(returned_result, exp) elif actual_freq!=expected_frequencies: msg = "Incorrect frequencies: {} should be {}. Your output: {}. One possible output: {}".format(actual_freq, expected_frequencies, returned_result, exp) Test.expect(not msg, msg) @test.describe("Fixed tests") def fixed_tests(): TESTS = ( "a a a b c c d d d d e e e e e", "e e e e DDD ddd DdD: ddd ddd aa aA Aa, bb cc cC e e e", " //wont won't won't ", " , e .. ", " ... ", " ' ", " ''' ", """In a village of La Mancha, the name of which I have no desire to cao mind, there lived not long since one of those gentlemen that keep a lance in the lance-rack, an old buckler, a lean hack, and a greyhound for coursing. An olla of rather more beef than mutton, a salad on most nights, scraps on Saturdays, lentils on Fridays, and a pigeon or so extra on Sundays, made away with three-quarters of his income.""", "a a a b c c X", "a a c b b", ) for s in TESTS: check(s) @test.describe("Random tests") def random_tests(): def gen_word(): return "".join(choice("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'") for _ in range(randint(3, 10))) def gen_string(): words = [] nums = choices(range(1, 31), k=20) for _ in range(randint(0, 20)): words += [gen_word()] * nums.pop() shuffle(words) s = "" while words: s += words.pop() + "".join(choice("-,.?!_:;/ ") for _ in range(randint(1, 5))) return s @test.it("Tests") def it_1(): for _ in range(100): check(gen_string())
The solution using Python
Option 1:
# use the Counter module from collections import Counter # use the regex module import re def top_3_words(text): # count the input, pass through a regex and lowercase it c = Counter(re.findall(r"[a-z']+", re.sub(r" '+ ", " ", text.lower()))) # return the `most common` 3 items return [w for w,_ in c.most_common(3)]
Option 2:
def top_3_words(text): # loop through each character in the string for c in text: # if it's not alphanumeric or an apostrophe if not (c.isalpha() or c=="'"): # replace with a space text = text.replace(c,' ') # create some `list` variables words,counts,out = [],[],[] # loop through the words in the text for word in list(filter(None,text.lower().split())): # if in all, then continue if all([not c.isalpha() for c in word]): continue # if the word is in the words list if word in words: # increment the count counts[words.index(word)] += 1 else: # otherwise create a new entry words.append(word); counts.append(0) # loop while bigger than 0 and less than 3 while len(words)>0 and len(out)<3: # append the counts out.append(words.pop(counts.index(max(counts))).lower()) counts.remove(max(counts)) # return the counts return out
Option 3:
def top_3_words(text): wrds = {} for p in r'!"#$%&()*+,./:;<=>[email protected][\]^_`{|}~-': text = text.replace(p, ' ') for w in text.lower().split(): if w.replace("'", '') != '': wrds[w] = wrds.get(w, 0) + 1 return [y[0] for y in sorted(wrds.items(), key=lambda x: x[1], reverse=True)[:3]]