mirror of
https://github.com/rasbt/python_reference.git
synced 2025-02-17 13:58:13 +00:00
english language detect. snippet
This commit is contained in:
parent
14996364a2
commit
3f81eadf98
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"name": "",
|
"name": "",
|
||||||
"signature": "sha256:0c9d8c8b65b0eec5bb7c2a2790f08a1e49daf27dac2c9dcfe8d85ce958046a2c"
|
"signature": "sha256:714a46a359c5b1c3e7e7bd4d19d73221f9def5bcb806840be82541070041d29e"
|
||||||
},
|
},
|
||||||
"nbformat": 3,
|
"nbformat": 3,
|
||||||
"nbformat_minor": 0,
|
"nbformat_minor": 0,
|
||||||
|
@ -57,6 +57,7 @@
|
||||||
"- [Differences between 2 files](#Differences-between-2-files)\n",
|
"- [Differences between 2 files](#Differences-between-2-files)\n",
|
||||||
"- [Differences between successive elements in a list](#Differences-between-successive-elements-in-a-list)\n",
|
"- [Differences between successive elements in a list](#Differences-between-successive-elements-in-a-list)\n",
|
||||||
"- [Doctest example](#Doctest-example)\n",
|
"- [Doctest example](#Doctest-example)\n",
|
||||||
|
"- [English language detection](#English-language-detection)\n",
|
||||||
"- [File browsing basics](#File-browsing-basics)\n",
|
"- [File browsing basics](#File-browsing-basics)\n",
|
||||||
"- [File reading basics](#File-reading-basics)\n",
|
"- [File reading basics](#File-reading-basics)\n",
|
||||||
"- [Indices of min and max elements from a list](#Indices-of-min-and-max-elements-from-a-list)\n",
|
"- [Indices of min and max elements from a list](#Indices-of-min-and-max-elements-from-a-list)\n",
|
||||||
|
@ -595,6 +596,61 @@
|
||||||
"<br>"
|
"<br>"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "heading",
|
||||||
|
"level": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"English language detection"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"[back to top](#Table-of-Contents)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"collapsed": false,
|
||||||
|
"input": [
|
||||||
|
"import nltk\n",
|
||||||
|
"\n",
|
||||||
|
"def eng_ratio(text):\n",
|
||||||
|
" ''' Returns the fraction of unique alphabetic words in a text that are not in the English vocabulary '''\n",
|
||||||
|
"\n",
|
||||||
|
" english_vocab = set(w.lower() for w in nltk.corpus.words.words()) \n",
|
||||||
|
" text_vocab = set(w.lower() for w in text.split() if w.lower().isalpha()) \n",
|
||||||
|
" unusual = text_vocab.difference(english_vocab)\n",
|
||||||
|
" diff = len(unusual)/len(text_vocab)\n",
|
||||||
|
" return diff\n",
|
||||||
|
" \n",
|
||||||
|
"text = 'This is a test fahrrad'\n",
|
||||||
|
"\n",
|
||||||
|
"print(eng_ratio(text))"
|
||||||
|
],
|
||||||
|
"language": "python",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"stream": "stdout",
|
||||||
|
"text": [
|
||||||
|
"0.2\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"prompt_number": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<br>\n",
|
||||||
|
"<br>"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "heading",
|
"cell_type": "heading",
|
||||||
"level": 2,
|
"level": 2,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user