# Python/other/detecting_english_programmatically.py

import os

# The 26 upper-case Latin letters; the lower-case set is derived from it below.
UPPERLETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# Every character removeNonLetters keeps: A-Z, a-z, space, tab and newline.
LETTERS_AND_SPACE = "".join((UPPERLETTERS, UPPERLETTERS.lower(), " \t\n"))


def loadDictionary():
    """
    Load the word list stored in ``dictionary.txt`` next to this module.

    The file is read one word per line. Blank lines are skipped, so a
    trailing newline at the end of the file no longer adds an empty-string
    entry to the result.

    Returns:
        dict: every non-empty line of the file as a key mapped to ``None``.
        Only the keys matter; the mapping is used for membership tests.

    Raises:
        OSError: if ``dictionary.txt`` cannot be opened.
    """
    directory = os.path.dirname(os.path.realpath(__file__))
    englishWords = {}
    with open(os.path.join(directory, "dictionary.txt")) as dictionaryFile:
        for line in dictionaryFile:
            # Drop only the line terminator so the words themselves stay
            # exactly as written in the file.
            word = line.rstrip("\n")
            if word:
                englishWords[word] = None
    return englishWords


# Word list loaded once when the module is imported; getEnglishCount tests
# candidate words for membership in it.
ENGLISH_WORDS = loadDictionary()


def getEnglishCount(message):
    """
    Return the fraction (0.0 to 1.0) of words in ``message`` that appear in
    ENGLISH_WORDS.

    The message is upper-cased and reduced to letters and whitespace before
    it is split into words. A message that contains no words yields 0.0.
    """
    candidates = removeNonLetters(message.upper()).split()

    # Guard first: nothing to count, and it avoids dividing by zero below.
    if not candidates:
        return 0.0

    hits = sum(1 for candidate in candidates if candidate in ENGLISH_WORDS)
    return float(hits) / len(candidates)


def removeNonLetters(message):
    """
    Return ``message`` with every character that is not in LETTERS_AND_SPACE
    removed; the kept characters stay in their original order.
    """
    return "".join(char for char in message if char in LETTERS_AND_SPACE)


def isEnglish(message, wordPercentage=20, letterPercentage=85):
    """
    Decide whether ``message`` looks like English text.

    Both of the following must hold:

    * at least ``wordPercentage`` percent of its words are dictionary words
      (as measured by getEnglishCount), and
    * at least ``letterPercentage`` percent of its characters survive
      removeNonLetters, i.e. are letters, spaces, tabs or newlines.

    An empty message is reported as not English instead of raising
    ZeroDivisionError.

    >>> isEnglish('Hello World')
    True

    >>> isEnglish('llold HorWd')
    False

    >>> isEnglish('')
    False
    """
    # Without this guard the letter-percentage division below would divide
    # by len(message) == 0.
    if not message:
        return False

    wordsMatch = getEnglishCount(message) * 100 >= wordPercentage
    numLetters = len(removeNonLetters(message))
    messageLettersPercentage = (float(numLetters) / len(message)) * 100
    lettersMatch = messageLettersPercentage >= letterPercentage
    return wordsMatch and lettersMatch


import doctest

# doctest.testmod() with no argument examines the __main__ module, so run it
# only when this file is executed as a script. Importing the module no longer
# triggers a doctest run against the importing program.
if __name__ == "__main__":
    doctest.testmod()
Added test cases 2016-08-02 17:46:55 +00:00			`import os`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`UPPERLETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"`
			`LETTERS_AND_SPACE = UPPERLETTERS + UPPERLETTERS.lower() + " \t\n"`

Initial 2016-08-02 15:33:29 +00:00
			`def loadDictionary():`
Added test cases 2016-08-02 17:46:55 +00:00			`path = os.path.split(os.path.realpath(__file__))`
Initial 2016-08-02 15:33:29 +00:00			`englishWords = {}`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`with open(path[0] + "/dictionary.txt") as dictionaryFile:`
			`for word in dictionaryFile.read().split("\n"):`
Fix ResourceWarning: unclosed file (#681) Signed-off-by: Mickaël Schoentgen <contact@tiger-222.fr> 2019-01-08 08:59:23 +00:00			`englishWords[word] = None`
Initial 2016-08-02 15:33:29 +00:00			`return englishWords`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
Initial 2016-08-02 15:33:29 +00:00			`ENGLISH_WORDS = loadDictionary()`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
Initial 2016-08-02 15:33:29 +00:00			`def getEnglishCount(message):`
			`message = message.upper()`
			`message = removeNonLetters(message)`
			`possibleWords = message.split()`

			`if possibleWords == []:`
			`return 0.0`

			`matches = 0`
			`for word in possibleWords:`
			`if word in ENGLISH_WORDS:`
			`matches += 1`

			`return float(matches) / len(possibleWords)`

psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
Initial 2016-08-02 15:33:29 +00:00			`def removeNonLetters(message):`
			`lettersOnly = []`
			`for symbol in message:`
			`if symbol in LETTERS_AND_SPACE:`
			`lettersOnly.append(symbol)`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`return "".join(lettersOnly)`

Initial 2016-08-02 15:33:29 +00:00
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00			`def isEnglish(message, wordPercentage=20, letterPercentage=85):`
Added test cases 2016-08-02 17:46:55 +00:00			`"""`
			`>>> isEnglish('Hello World')`
			`True`

			`>>> isEnglish('llold HorWd')`
			`False`
			`"""`
Initial 2016-08-02 15:33:29 +00:00			`wordsMatch = getEnglishCount(message) * 100 >= wordPercentage`
			`numLetters = len(removeNonLetters(message))`
			`messageLettersPercentage = (float(numLetters) / len(message)) * 100`
			`lettersMatch = messageLettersPercentage >= letterPercentage`
			`return wordsMatch and lettersMatch`
Added test cases 2016-08-02 17:46:55 +00:00

			`import doctest`
psf/black code formatting (#1277) 2019-10-05 05:14:13 +00:00
Added test cases 2016-08-02 17:46:55 +00:00			`doctest.testmod()`