html tag regex

2025-04-15 00:57:38 +00:00 · 2014-07-06 12:36:55 -04:00 · 2014-07-06 12:36:55 -04:00 · 76da7ee6e0
commit 76da7ee6e0
parent 6e5414c44d
1 changed files with 56 additions and 12 deletions
--- a/tutorials/useful_regex.ipynb
+++ b/tutorials/useful_regex.ipynb
@ -1,7 +1,7 @@
 {
 "metadata": {
  "name": "",
-  "signature": "sha256:9fd7d5201ce5b97fadad65f2c30cfec993fc83907e04418b032bd1bbdac05ff4"
+  "signature": "sha256:8f1ee7a7bfaeca0ee3e12b2387445faa10632d57277d59af6dbfdca9732e5910"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
@ -41,7 +41,7 @@
       "output_type": "stream",
       "stream": "stdout",
       "text": [
-        "Last updated: 06/07/2014 10:07:02 EDT\n",
+        "Last updated: 06/07/2014 12:24:58 EDT\n",
        "\n",
        "CPython 3.4.1\n",
        "IPython 2.1.0\n"
@ -264,7 +264,9 @@
     "cell_type": "markdown",
     "metadata": {},
     "source": [
-      "A regular expression to check for file extensions."
+      "A regular expression to check for file extensions.  \n",
      "\n",
      "Note: This approach is not recommended for strictly restricting file types (parse the file header instead). However, this regex is still a useful alternative to, e.g., Python's `str.endswith` approach for quickly pre-filtering certain files of interest."
     ]
    },
    {
@ -746,14 +748,6 @@
      "<br>"
     ]
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Time"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
@ -810,6 +804,49 @@
      "\n",
      "for t in str_true:\n",
      "    assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
      "for f in str_false:\n",
      "    assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 18
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<br>\n",
      "<br>"
     ]
    },
    {
     "cell_type": "heading",
     "level": 2,
     "metadata": {},
     "source": [
      "Checking for HTML tags"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "[[back to top](#Sections)]"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "pattern = r\"\"\"</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>\"\"\"\n",
      "\n",
      "str_true = ('<a>', '<a href=\"something\">', '</a>', '<img src>')\n",
      "            \n",
      "str_false = ('a>', '<a ', '< a >')\n",
      "\n",
      "for t in str_true:\n",
      "    assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
      "\n",
      "for f in str_false:\n",
      "    assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
@ -817,7 +854,14 @@
     "language": "python",
     "metadata": {},
     "outputs": [],
-     "prompt_number": 33
+     "prompt_number": 16
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "<font size=\"1px\">source: [http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/](http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/)</font>"
     ]
    }
   ],
   "metadata": {}