66 lines
2 KiB
Python
66 lines
2 KiB
Python
|
"""
|
||
|
Test cases for html util.
|
||
|
"""
|
||
|
|
||
|
from unittest import TestCase
|
||
|
from atextcrawler.utils.html import clean_page
|
||
|
|
||
|
|
||
|
class CleanHtmlTest(TestCase):
    """
    Test clean_page.

    Each test passes an HTML fragment to clean_page and compares the
    string form of the result with the expected cleaned markup.

    Have an eye on self-closing tags (br, hr, ...).
    """

    def _assert_cleaned(self, source, expected):
        # Shared check: clean the fragment and compare its string form.
        self.assertEqual(str(clean_page(source)), expected)

    def test_clean_page_1(self):
        # The script element is expected to vanish together with its
        # content (which itself contains a quoted '<script>' string);
        # '<br>' is expected to be serialized as '<br/>'.
        source = (
            '<em>Hello</em><br><script>malicious="<script>"</script>anything'
        )
        expected = '<em>Hello</em><br/>anything'
        self._assert_cleaned(source, expected)

    def test_clean_page_2(self):
        # Same as above, but with '<br />' in the input and an unquoted
        # '<script>' inside the script content.
        source = (
            '<em>Hello</em><br /><script>malicious<script></script>anything'
        )
        expected = '<em>Hello</em><br/>anything'
        self._assert_cleaned(source, expected)

    def test_clean_page_3(self):
        # nesting: the outer figure is expected to be dropped with
        # everything inside it, including the inner figure
        source = '--<figure>xx<figure>yy</figure>zz</figure>..'
        expected = '--..'
        self._assert_cleaned(source, expected)

    def test_clean_page_4(self):
        # aria-hidden: only the value "true" (quoted or not) is expected
        # to cause removal; any other value keeps the element, with the
        # attribute value quoted in the output.
        cases = (
            ('--<p aria-hidden=true>xx</p>..', '--..'),
            ('--<p aria-hidden="true">xx</p>..', '--..'),
            (
                '--<p aria-hidden=false>xx</p>..',
                '--<p aria-hidden="false">xx</p>..',
            ),
            (
                '--<p aria-hidden="false">xx</p>..',
                '--<p aria-hidden="false">xx</p>..',
            ),
            (
                '--<p aria-hidden=??>xx</p>..',
                '--<p aria-hidden="??">xx</p>..',
            ),
        )
        for source, expected in cases:
            # subTest keeps the cases independent: one failing input
            # does not hide the outcome of the remaining ones.
            with self.subTest(source=source):
                self._assert_cleaned(source, expected)

    def test_clean_page_5(self):
        # no removal: the markup is expected to come back unchanged
        source = '--<p>xx<em>yy</em></p>..'
        expected = '--<p>xx<em>yy</em></p>..'
        self._assert_cleaned(source, expected)

    def test_clean_page_6(self):
        # self-closing tags to be removed: both 'area' tags are expected
        # to disappear while the surrounding content (including the
        # newline) stays in place
        source = '--<area /><p>xx</p>\n...<h1>tt<area /></h1>nn'
        expected = '--<p>xx</p>\n...<h1>tt</h1>nn'
        self._assert_cleaned(source, expected)

    def test_clean_page_7(self):
        # An element carrying rel=search is expected to be removed
        # together with its content (text and the nested 'area' tag).
        source = '--<p rel=search>tt<area /></p>nn'
        expected = '--nn'
        self._assert_cleaned(source, expected)
|