html tag regex

This commit is contained in:
rasbt 2014-07-06 12:36:55 -04:00
parent 6e5414c44d
commit 76da7ee6e0

View File

@ -1,7 +1,7 @@
{ {
"metadata": { "metadata": {
"name": "", "name": "",
"signature": "sha256:9fd7d5201ce5b97fadad65f2c30cfec993fc83907e04418b032bd1bbdac05ff4" "signature": "sha256:8f1ee7a7bfaeca0ee3e12b2387445faa10632d57277d59af6dbfdca9732e5910"
}, },
"nbformat": 3, "nbformat": 3,
"nbformat_minor": 0, "nbformat_minor": 0,
@ -41,7 +41,7 @@
"output_type": "stream", "output_type": "stream",
"stream": "stdout", "stream": "stdout",
"text": [ "text": [
"Last updated: 06/07/2014 10:07:02 EDT\n", "Last updated: 06/07/2014 12:24:58 EDT\n",
"\n", "\n",
"CPython 3.4.1\n", "CPython 3.4.1\n",
"IPython 2.1.0\n" "IPython 2.1.0\n"
@ -264,7 +264,9 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"A regular expression to check for file extensions." "A regular expression to check for file extensions. \n",
"\n",
"Note: This approach is not recommended for strictly restricting file types (parse the file header instead). However, this regex is still a useful alternative to, e.g., Python's `endswith` approach for quickly pre-filtering files of interest."
] ]
}, },
{ {
@ -746,14 +748,6 @@
"<br>" "<br>"
] ]
}, },
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Time"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
@ -810,6 +804,49 @@
"\n", "\n",
"for t in str_true:\n", "for t in str_true:\n",
" assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n", " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
"for f in str_false:\n",
" assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 18
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<br>\n",
"<br>"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Checking for HTML tags"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[[back to top](#Sections)]"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"pattern = r\"\"\"</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>\"\"\"\n",
"\n",
"str_true = ('<a>', '<a href=\"something\">', '</a>', '<img src>')\n",
" \n",
"str_false = ('a>', '<a ', '< a >')\n",
"\n",
"for t in str_true:\n",
" assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
"\n", "\n",
"for f in str_false:\n", "for f in str_false:\n",
" assert(bool(re.match(pattern, f)) == False), '%s is not False' %f" " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
@ -817,7 +854,14 @@
"language": "python", "language": "python",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"prompt_number": 33 "prompt_number": 16
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font size=\"1px\">source: [http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/](http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/)</font>"
]
} }
], ],
"metadata": {} "metadata": {}