mirror of
https://github.com/rasbt/python_reference.git
synced 2025-01-18 07:17:06 +00:00
html tag regex
This commit is contained in:
parent
6e5414c44d
commit
76da7ee6e0
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"metadata": {
|
||||
"name": "",
|
||||
"signature": "sha256:9fd7d5201ce5b97fadad65f2c30cfec993fc83907e04418b032bd1bbdac05ff4"
|
||||
"signature": "sha256:8f1ee7a7bfaeca0ee3e12b2387445faa10632d57277d59af6dbfdca9732e5910"
|
||||
},
|
||||
"nbformat": 3,
|
||||
"nbformat_minor": 0,
|
||||
|
@ -41,7 +41,7 @@
|
|||
"output_type": "stream",
|
||||
"stream": "stdout",
|
||||
"text": [
|
||||
"Last updated: 06/07/2014 10:07:02 EDT\n",
|
||||
"Last updated: 06/07/2014 12:24:58 EDT\n",
|
||||
"\n",
|
||||
"CPython 3.4.1\n",
|
||||
"IPython 2.1.0\n"
|
||||
|
@ -264,7 +264,9 @@
|
|||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"A regular expression to check for file extensions."
|
||||
"A regular expression to check for file extensions. \n",
|
||||
"\n",
|
||||
"Note: This approach is not recommended for strictly restricting file types (parse the file header instead). However, this regex is still a useful alternative to, e.g., Python's `endswith` approach for quickly pre-filtering certain files of interest."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -746,14 +748,6 @@
|
|||
"<br>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "heading",
|
||||
"level": 2,
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Time"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
@ -810,6 +804,49 @@
|
|||
"\n",
|
||||
"for t in str_true:\n",
|
||||
" assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
|
||||
"for f in str_false:\n",
|
||||
" assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
|
||||
],
|
||||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"prompt_number": 18
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<br>\n",
|
||||
"<br>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "heading",
|
||||
"level": 2,
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Checking for HTML tags"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"[[back to top](#Sections)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"collapsed": false,
|
||||
"input": [
|
||||
"pattern = r\"\"\"</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>\"\"\"\n",
|
||||
"\n",
|
||||
"str_true = ('<a>', '<a href=\"something\">', '</a>', '<img src>')\n",
|
||||
" \n",
|
||||
"str_false = ('a>', '<a ', '< a >')\n",
|
||||
"\n",
|
||||
"for t in str_true:\n",
|
||||
" assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
|
||||
"\n",
|
||||
"for f in str_false:\n",
|
||||
" assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
|
||||
|
@ -817,7 +854,14 @@
|
|||
"language": "python",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"prompt_number": 33
|
||||
"prompt_number": 16
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<font size=\"1px\">source: [http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/](http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/)</font>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {}
|
||||
|
|
Loading…
Reference in New Issue
Block a user