diff --git a/python_patterns/patterns.ipynb b/python_patterns/patterns.ipynb index fc3e45f..2a769ba 100644 --- a/python_patterns/patterns.ipynb +++ b/python_patterns/patterns.ipynb @@ -1,7 +1,7 @@ { "metadata": { "name": "", - "signature": "sha256:0c9d8c8b65b0eec5bb7c2a2790f08a1e49daf27dac2c9dcfe8d85ce958046a2c" + "signature": "sha256:714a46a359c5b1c3e7e7bd4d19d73221f9def5bcb806840be82541070041d29e" }, "nbformat": 3, "nbformat_minor": 0, @@ -57,6 +57,7 @@ "- [Differences between 2 files](#Differences-between-2-files)\n", "- [Differences between successive elements in a list](#Differences-between-successive-elements-in-a-list)\n", "- [Doctest example](#Doctest-example)\n", + "- [English language detection](#English-language-detection)\n", "- [File browsing basics](#File-browsing-basics)\n", "- [File reading basics](#File-reading-basics)\n", "- [Indices of min and max elements from a list](#Indices-of-min-and-max-elements-from-a-list)\n", @@ -595,6 +596,61 @@ "
# English language detection

import string

# Lazily-built cache of the lowercased NLTK `words` corpus; filled on first use
# so the (large) vocabulary set is constructed once instead of on every call.
_ENGLISH_VOCAB = None


def _default_english_vocab():
    ''' Returns the lowercased NLTK `words` corpus as a frozenset (built once, then cached) '''
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        # Imported here rather than at the top so that callers who pass their
        # own vocabulary to eng_ratio() do not need NLTK installed.
        import nltk
        _ENGLISH_VOCAB = frozenset(w.lower() for w in nltk.corpus.words.words())
    return _ENGLISH_VOCAB


def eng_ratio(text, english_vocab=None):
    ''' Returns the fraction of distinct words in a text that are not English.

    The text is split on whitespace; every token is stripped of leading and
    trailing punctuation and lowercased. Tokens that are not purely alphabetic
    after that step (numbers, "don't", ...) are ignored. The result is

        len(distinct non-English words) / len(distinct words)

    i.e. a share of the whole text vocabulary in the range [0.0, 1.0], not a
    ratio of non-English to English words.

    Keyword arguments:
        text: string to analyze.
        english_vocab: optional container of lowercase English words used for
            the membership test. If None (default), the NLTK `words` corpus is
            used; NLTK raises LookupError when that corpus has not been
            downloaded (see nltk.download('words')).

    Returns 0.0 if the text contains no alphabetic words.

    '''
    if english_vocab is None:
        english_vocab = _default_english_vocab()

    # Strip punctuation at the token edges so that "test." counts as "test"
    # instead of being discarded by the isalpha() filter.
    tokens = (w.strip(string.punctuation).lower() for w in text.split())
    text_vocab = set(w for w in tokens if w.isalpha())

    # Nothing to classify: avoid dividing by zero.
    if not text_vocab:
        return 0.0

    unusual = text_vocab.difference(english_vocab)
    # float() keeps true division under Python 2 as well.
    return float(len(unusual)) / len(text_vocab)


# In a notebook __name__ is '__main__', so the demo runs as before; the guard
# only keeps it from executing (and requiring the NLTK corpus) on import.
if __name__ == '__main__':
    text = 'This is a test fahrrad'

    print(eng_ratio(text))
\n", + "
" + ] + }, { "cell_type": "heading", "level": 2,