# Python/other/detecting_english_programmatically.py

import os

# The 26 upper-case letters of the English alphabet.
UPPERLETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# Every character removeNonLetters() keeps: both letter cases plus whitespace.
LETTERS_AND_SPACE = ''.join([UPPERLETTERS, UPPERLETTERS.lower(), ' \t\n'])

def loadDictionary():
    """Read ``dictionary.txt`` from this module's directory into a dict.

    The file's text is split on newline characters and every resulting piece
    becomes a key of the returned dict, each mapped to ``None``.
    """
    moduleDir = os.path.dirname(os.path.realpath(__file__))
    with open(moduleDir + '/dictionary.txt') as dictionaryFile:
        contents = dictionaryFile.read()
    # Only the keys matter for membership tests, so every value is None.
    return dict.fromkeys(contents.split('\n'))

# Built once at import time; getEnglishCount() tests words against its keys.
ENGLISH_WORDS = loadDictionary()

def getEnglishCount(message):
    """Return the fraction of words in ``message`` found in ENGLISH_WORDS.

    The message is upper-cased and passed through removeNonLetters() before
    being split on whitespace. The result is a float between 0.0 and 1.0; a
    message that contains no words at all yields 0.0.
    """
    candidates = removeNonLetters(message.upper()).split()
    if not candidates:
        return 0.0

    hits = sum(1 for candidate in candidates if candidate in ENGLISH_WORDS)
    return float(hits) / len(candidates)

def removeNonLetters(message):
    """Return ``message`` with every character not in LETTERS_AND_SPACE dropped.

    Characters that are kept stay in their original order.
    """
    return ''.join(ch for ch in message if ch in LETTERS_AND_SPACE)

def isEnglish(message, wordPercentage=20, letterPercentage=85):
    """Decide whether ``message`` looks like English text.

    A message passes when both of these hold:

    * at least ``wordPercentage`` percent of its words are dictionary words
      (as measured by getEnglishCount()), and
    * at least ``letterPercentage`` percent of its characters survive
      removeNonLetters(), i.e. are letters, spaces, tabs or newlines.

    An empty message is never English: it returns False instead of dividing
    by its zero length.

    >>> isEnglish('Hello World')
    True

    >>> isEnglish('llold HorWd')
    False

    >>> isEnglish('')
    False
    """
    if not message:
        # Nothing to measure, and len(message) == 0 would divide by zero below.
        return False

    wordsMatch = getEnglishCount(message) * 100 >= wordPercentage
    numLetters = len(removeNonLetters(message))
    messageLettersPercentage = (float(numLetters) / len(message)) * 100
    lettersMatch = messageLettersPercentage >= letterPercentage
    return wordsMatch and lettersMatch


if __name__ == '__main__':
    # Run the doctests only when executed as a script. Called at import time,
    # doctest.testmod() with no argument would test the importing program's
    # __main__ module instead of this one.
    import doctest
    doctest.testmod()
Added test cases 2016-08-02 17:46:55 +00:00			`import os`

Initial 2016-08-02 15:33:29 +00:00			`UPPERLETTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'`
			`LETTERS_AND_SPACE = UPPERLETTERS + UPPERLETTERS.lower() + ' \t\n'`

			`def loadDictionary():`
Added test cases 2016-08-02 17:46:55 +00:00			`path = os.path.split(os.path.realpath(__file__))`
Initial 2016-08-02 15:33:29 +00:00			`englishWords = {}`
Travis CI: Run black, doctest, flake8, mypy, and pytest (#964) * Travis CI: Add type checking with mypy * Create requirements.txt * script: mypy --ignore-missing-stubs=cv2,numpy . * Delete requirements.txt * script: mypy --ignore-missing-imports . * Run doctests * Disable doctest -v other/detecting_english_programmatically.py * Pytest * No \| * pytest \|\| true * Run black doctest flake8 mypy pytest * after_success: Build Directory.md * Typo in filename: Dictionary.txt --> dictionary.txt' Discovered via doctest run in #964 * python -m doctest -v * pip install black flake8 mypy pytest * pytest --doctest-glob='.py' pytest --doctest-modules * pytest --doctest-modules ./sorts * pytest --doctest-modules ./ciphers ./other ./searches ./sorts ./strings \|\| true * if __name__ == "__main__": * if __name__ == "__main__": * if __name__ == '__main__': * if __name__ == '__main__': * if __name__ == '__main__': * Create requirements.txt * Update requirements.txt * if __name__ == "__main__": * Lose the doctests * if __name__ == '__main__': * Remove print-a-tuple * doctest: Added missing spaces * Update tabu_search.py * The >>> are not doctests so change to >>) * Travis CI: Run black, doctest, flake8, mypy, and pytest * Link to the separate DIRECTORY.md file * Update README.md 2019-07-08 15:27:51 +00:00			`with open(path[0] + '/dictionary.txt') as dictionaryFile:`
Fix ResourceWarning: unclosed file (#681) Signed-off-by: Mickaël Schoentgen <contact@tiger-222.fr> 2019-01-08 08:59:23 +00:00			`for word in dictionaryFile.read().split('\n'):`
			`englishWords[word] = None`
Initial 2016-08-02 15:33:29 +00:00			`return englishWords`

			`ENGLISH_WORDS = loadDictionary()`

			`def getEnglishCount(message):`
			`message = message.upper()`
			`message = removeNonLetters(message)`
			`possibleWords = message.split()`

			`if possibleWords == []:`
			`return 0.0`

			`matches = 0`
			`for word in possibleWords:`
			`if word in ENGLISH_WORDS:`
			`matches += 1`

			`return float(matches) / len(possibleWords)`

			`def removeNonLetters(message):`
			`lettersOnly = []`
			`for symbol in message:`
			`if symbol in LETTERS_AND_SPACE:`
			`lettersOnly.append(symbol)`
			`return ''.join(lettersOnly)`

			`def isEnglish(message, wordPercentage = 20, letterPercentage = 85):`
Added test cases 2016-08-02 17:46:55 +00:00			`"""`
			`>>> isEnglish('Hello World')`
			`True`

			`>>> isEnglish('llold HorWd')`
			`False`
			`"""`
Initial 2016-08-02 15:33:29 +00:00			`wordsMatch = getEnglishCount(message) * 100 >= wordPercentage`
			`numLetters = len(removeNonLetters(message))`
			`messageLettersPercentage = (float(numLetters) / len(message)) * 100`
			`lettersMatch = messageLettersPercentage >= letterPercentage`
			`return wordsMatch and lettersMatch`
Added test cases 2016-08-02 17:46:55 +00:00

			`import doctest`
			`doctest.testmod()`