Put under version control
parent d26d23348b
commit a6af5b12d2
83 changed files with 20130 additions and 0 deletions

tests/__init__.py (new file, 7 lines)
@@ -0,0 +1,7 @@
from .annotation import AnnotateTest
from .date_finder import DateFinderTest
from .page import PageCleanTest
from .section import IterSectionTest, AggSectionTest
from .simhash import SimhashTest
from .text import CleanHtmlTest
from .durl import DurlTest
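
Since tests/__init__.py re-exports every TestCase, plain unittest discovery picks up the whole suite. A minimal runner sketch (run_tests.py is a hypothetical helper, not part of this file list; it assumes the project root is on sys.path so the atextcrawler package resolves):

# run_tests.py (hypothetical helper)
import unittest

if __name__ == '__main__':
    # Discover everything under tests/; equivalent to `python -m unittest discover -s tests`.
    suite = unittest.defaultTestLoader.discover('tests')
    unittest.TextTestRunner(verbosity=2).run(suite)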

tests/annotation.py (new file, 49 lines)
@@ -0,0 +1,49 @@
"""
Test cases for annotation.
"""

from unittest import TestCase

from atextcrawler.utils.annotation import annotate


class AnnotateTest(TestCase):
    """
    Test annotation.

    Consider that the <br> and <hr> tags are self-closing.
    """

    def test_annotate_1(self):
        s = '<em>Hello</em><br><strong>world</strong>'
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
        self.assertEqual(anns['section_ids'], {})

    def test_annotate_2(self):
        s = '<em> Hello </em><br><strong> world </strong>'
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
        self.assertEqual(anns['section_ids'], {})

    def test_annotate_3(self):
        s = '<p> Hello <em>world</em> </p> '
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 60})

    def test_annotate_4(self):
        s = '<div id = "ref1"><p>Hello <em>world</em> </p> </div>'
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 60})
        self.assertEqual(anns['section_ids'], {0: ['ref1']})

    def test_annotate_5(self):
        s = '<br id="ref2"> Hello <p>world </p> '
        text, anns = annotate(s)
        self.assertEqual(text, ' Hello world')
        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 60})
        self.assertEqual(anns['section_ids'], {1: ['ref2']})
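
Taken together, these assertions outline the contract of annotate: it returns the text with tag boundaries collapsed to single spaces, plus an annotations dict in which 'semantic_breaks' maps text offsets to a break level (80 for <br>-style breaks, 60 for <p>/<div> here) and 'section_ids' maps offsets to the HTML ids seen at those points. A quick interactive sketch along the same lines (only the call shape is taken from the tests; the printed values are whatever annotate assigns):

from atextcrawler.utils.annotation import annotate

text, anns = annotate('<div id="intro"><p>Hello <em>world</em></p></div>')
print(repr(text))               # e.g. ' Hello world', breaks collapsed to spaces
print(anns['semantic_breaks'])  # offset -> break level, e.g. {0: 60}
print(anns['section_ids'])      # offset -> list of element ids, e.g. {0: ['intro']}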

tests/date_finder.py (new file, 20 lines)
@@ -0,0 +1,20 @@
from datetime import datetime
from unittest import TestCase

from atextcrawler.utils.date_finder import extract_latest_date


class DateFinderTest(TestCase):
    def test_extract_latest_date(self):
        s = 'test 1987-2+1-no'
        r = datetime(1987, 2, 1)
        self.assertEqual(extract_latest_date(s), r)
        s = '2020-04-06, whatever and 1987-2-1, 1/20/2021'
        r = datetime(2020, 4, 6)
        self.assertEqual(extract_latest_date(s, lang='de'), r)
        s = 'test 2022-04-06, whatever and 1987-2-1, 1/20/2021'
        r = datetime(2021, 1, 20)
        self.assertEqual(extract_latest_date(s, lang='en'), r)
        s = ''
        r = None
        self.assertEqual(extract_latest_date(s), r)
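
The lang argument decides how ambiguous dates are read: with lang='de' the US-style 1/20/2021 does not become the latest date, with lang='en' it does, and an empty string yields None. A usage sketch mirroring the calls above:

from atextcrawler.utils.date_finder import extract_latest_date

# Latest date found in free text; None if nothing parseable is present.
print(extract_latest_date('2020-04-06, whatever and 1987-2-1, 1/20/2021', lang='de'))  # 2020-04-06
print(extract_latest_date('published 1/20/2021', lang='en'))                           # 2021-01-20
print(extract_latest_date(''))                                                         # None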

tests/durl.py (new file, 68 lines)
@@ -0,0 +1,68 @@
from unittest import IsolatedAsyncioTestCase
import asyncpg
from atextcrawler.utils.durl import Durl
from atextcrawler.config import Config
from atextcrawler.db import PGPool


class DurlTest(IsolatedAsyncioTestCase):
    async def asyncSetUp(self):
        config = Config().get()
        self.pool = PGPool(config['postgresql'])
        await self.pool.__aenter__()
        self.conn = await self.pool.pool.acquire()

    async def test_durl_basic(self):
        durl1 = await Durl('https://U:Pw@www.EXAMPLE.com:8000/hello?world#a')
        self.assertEqual(durl1.scheme, 'https')
        self.assertEqual(durl1.netloc, 'U:Pw@www.example.com:8000')
        self.assertEqual(durl1.port, 8000)
        self.assertEqual(durl1.path, '/hello')
        self.assertEqual(durl1.fragment, '')
        self.assertEqual(durl1.pwa(), 'hello?world')
        self.assertEqual(durl1.site(), 'https://U:Pw@www.example.com:8000/')
        self.assertEqual(
            durl1.url(), 'https://U:Pw@www.example.com:8000/' 'hello?world'
        )
        self.assertEqual(durl1.has_path(), True)
        durl2 = await Durl('http://www.example.com/')
        self.assertEqual(durl2.has_path(), False)
        durl3 = await Durl('ftp://www.example.com/')
        self.assertEqual(durl3, None)

    async def test_durl_with_base(self):
        durl1 = await Durl('https://www.example.com')
        self.assertEqual(durl1.path, '/')
        self.assertEqual(durl1.pwa(), '')
        self.assertEqual(durl1.has_path(), False)
        durl2 = await Durl('https://www.example.com/hello2', base=durl1)
        self.assertEqual(durl2.hostname, 'www.example.com')
        self.assertEqual(durl2.path, '/hello2')
        self.assertEqual(durl2.pwa(), 'hello2')
        durl3 = await Durl('/hello3?x=1', base=durl1)
        self.assertEqual(durl3.hostname, 'www.example.com')
        self.assertEqual(durl3.path, '/hello3')
        self.assertEqual(durl3.pwa(), 'hello3?x=1')
        self.assertEqual(durl3.site(), 'https://www.example.com/')
        durl4 = await Durl('https://www.kernel.org/', base=durl1)
        self.assertEqual(durl4, None)

    async def test_durl_with_base_and_match_base(self):
        durl1 = await Durl('https://www.example.com/base/path/')
        self.assertEqual(durl1.path, '/base/path/')
        self.assertEqual(durl1.pwa(), 'base/path/')
        self.assertEqual(durl1.has_path(), True)
        durl2 = await Durl(
            'https://www.example.com/base/', base=durl1, match_base=True
        )
        self.assertEqual(durl2, None)
        durl3 = await Durl(
            'https://www.example.com/base/path/whatever?x=1#a',
            base=durl1,
            match_base=True,
        )
        self.assertEqual(durl3.pwa(), 'whatever?x=1')

    async def asyncTearDown(self):
        await self.pool.pool.release(self.conn)
        await self.pool.pool.close()
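
asyncSetUp and asyncTearDown wrap each test in a PostgreSQL pool built from the project config, although the Durl calls above only need a running event loop; Durl itself appears to normalize the URL (lowercased host, explicit path), resolve relative URLs against a base, and return None for unsupported schemes, foreign hosts, or (with match_base=True) URLs outside the base path. A standalone sketch of the same calls (assuming, as in these tests, that no further arguments are required):

import asyncio

from atextcrawler.utils.durl import Durl

async def main():
    base = await Durl('https://www.example.com')
    durl = await Durl('/hello?x=1', base=base)
    print(durl.site())                           # 'https://www.example.com/'
    print(durl.pwa())                            # 'hello?x=1' (path with arguments)
    print(await Durl('ftp://www.example.com/'))  # None: unsupported scheme

asyncio.run(main())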

tests/page.py (new file, 24 lines)
@@ -0,0 +1,24 @@
"""
Test cases for resource type page.
"""

from unittest import TestCase
from atextcrawler.utils.html import clean_body

# from atextcrawler.utils.tag import drop_tags


class PageCleanTest(TestCase):
    def test_clean_body_1(self):
        s = ' <em>Hello</em> <strong>world</strong> '
        r = '<em>Hello</em> <strong>world</strong>'
        self.assertEqual(clean_body(s), r)


    # def test_drop_tags(self):
    #     s = '<figure what="ever">something<figure>else</figure>...</figure>'
    #     r = drop_tags(s)
    #     self.assertEqual(r, '')
    #     s = '<rt><rt><rt><rt>something</rt></rt></rt></rt>'
    #     r = drop_tags(s)
    #     self.assertEqual(r, '')

tests/section.py (new file, 105 lines)
@@ -0,0 +1,105 @@
from unittest import TestCase

from atextcrawler.utils.section import concat_section_texts, iter_sections


class IterSectionTest(TestCase):
    def test_iter_sections_1(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 80, 5: 2, 15: 1, 20: 3}
        sections1 = list(iter_sections(s, sb, max_level=100))
        sections2 = [
            (0, 5, 80, 'bcde'),
            (5, 15, 2, 'ghijklmno'),
            (15, 20, 1, 'qrst'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_iter_sections_2(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 4, 5: 2, 15: 1, 20: 3, 26: 9}
        sections1 = list(iter_sections(s, sb, max_level=100))
        sections2 = [
            (0, 5, 4, 'bcde'),
            (5, 15, 2, 'ghijklmno'),
            (15, 20, 1, 'qrst'),
            (20, 26, 3, 'vwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_iter_sections_3(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {5: 2, 15: 60, 18: 50, 20: 3}
        sections1 = list(iter_sections(s, sb, max_level=59))
        sections2 = [
            (0, 5, 80, 'bcde'),
            (5, 18, 2, 'ghijklmnopqr'),
            (18, 20, 50, 't'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_iter_sections_4(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {5: 2, 15: 60, 18: 50, 20: 3, 24: 60}
        sections1 = list(iter_sections(s, sb, max_level=59))
        sections2 = [
            (0, 5, 80, 'bcde'),
            (5, 18, 2, 'ghijklmnopqr'),
            (18, 20, 50, 't'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(sections1, sections2)


class AggSectionTest(TestCase):
    def test_concat_sections_1(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 5: 1, 15: 1, 20: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghijklmno'),
            ([2, 3], 'pqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_2(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 2: 1, 10: 1, 20: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghij'),
            ([2, 3, 4], 'klmnopqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_3(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 4: 1, 6: 1, 16: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1, 2], 'abcdefghijklmnop'),
            ([3, 4], 'qrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_4(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 5: 1, 15: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghijklmno'),
            ([2, 3], 'pqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)

    def test_concat_sections_5(self):
        s = 'abcdefghijklmnopqrstuvwxyz'
        sb = {0: 1, 5: 1, 12: 1, 22: 1, 23: 1, 24: 1, 26: 1}
        sections1 = list(concat_section_texts(s, sb, min_len=10))
        sections2 = [
            ([0, 1], 'abcdefghijkl'),
            ([2, 3, 4, 5, 6], 'mnopqrstuvwxyz'),
        ]
        self.assertEqual(sections1, sections2)
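
Both helpers cut an annotated text at its semantic breaks: iter_sections yields (start, end, level, text) tuples, ignoring breaks above max_level and assuming an implicit level-80 break at position 0 when none is given, while concat_section_texts greedily merges adjacent sections until each chunk is at least min_len characters long and yields (section_indexes, text) pairs. A small sketch reusing the fixtures above:

from atextcrawler.utils.section import concat_section_texts, iter_sections

s = 'abcdefghijklmnopqrstuvwxyz'

# Sections delimited by breaks up to max_level.
for start, end, level, text in iter_sections(s, {0: 80, 5: 2, 15: 1, 20: 3}, max_level=100):
    print(start, end, level, text)

# Adjacent sections merged into chunks of at least min_len characters.
for indexes, chunk in concat_section_texts(s, {0: 1, 5: 1, 15: 1, 20: 1}, min_len=10):
    print(indexes, chunk)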

tests/simhash.py (new file, 54 lines)
@@ -0,0 +1,54 @@
"""
Test cases for the similarity util.
"""

from unittest import TestCase
from simhash import Simhash, SimhashIndex
from atextcrawler.utils.similarity import (
    create_simhash,
    get_features,
    get_simhash,
    postgresql_bigint_offset,
    search_simhash,
)


class SimhashTest(TestCase):
    """
    Test simhash creation and search.
    """

    def test_search(self):
        n1 = int('1111111100000000', 2)
        n2 = int('1111111100000111', 2)
        n3 = int('1000000000000000', 2)
        n4 = int('1000000000000111', 2)
        n5 = int('1000001111000000', 2)
        objs = [
            ('1', Simhash(n1)),
            ('3', Simhash(n3)),
            ('4', Simhash(n4)),
        ]
        index = SimhashIndex(objs, k=3)
        found = search_simhash(index, Simhash(n5))
        self.assertEqual(found, [])
        found = search_simhash(index, Simhash(n1))
        self.assertEqual(found, [1])
        found = search_simhash(index, Simhash(n2))
        self.assertEqual(found, [1])
        found = search_simhash(index, Simhash(n4))
        self.assertEqual(found, [3, 4])

    def test_create(self):
        index = SimhashIndex([], k=3)
        hash_val_1 = create_simhash(index, 101, get_simhash('hello ' * 20))
        hash_val_2 = create_simhash(index, 102, get_simhash('another one'))
        simhash_1 = Simhash(hash_val_1 + postgresql_bigint_offset)
        simhash_2 = Simhash(hash_val_2 + postgresql_bigint_offset)
        found = search_simhash(index, simhash_1)
        self.assertEqual(found, [101])
        found = search_simhash(index, simhash_2)
        self.assertEqual(found, [102])
        simhash_3 = get_simhash('hello ' * 20 + 'X')
        found = search_simhash(index, simhash_3)
        self.assertEqual(found, [101])
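
These helpers wrap the simhash library: get_simhash hashes a text's features, create_simhash stores that hash in a SimhashIndex under an integer id and returns the value shifted by postgresql_bigint_offset so it fits a signed PostgreSQL bigint column, and search_simhash returns the ids of near-duplicate entries. A short sketch of the round trip, mirroring test_create:

from simhash import SimhashIndex

from atextcrawler.utils.similarity import create_simhash, get_simhash, search_simhash

index = SimhashIndex([], k=3)

# Register a document's simhash under id 101.
create_simhash(index, 101, get_simhash('hello ' * 20))

# A near-duplicate text hashes close enough to be found again.
print(search_simhash(index, get_simhash('hello ' * 20 + 'X')))  # [101]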

tests/text.py (new file, 65 lines)
@@ -0,0 +1,65 @@
"""
Test cases for text util.
"""

from unittest import TestCase
from atextcrawler.utils.html import clean_page


class CleanHtmlTest(TestCase):
    """
    Test clean_page.

    Have an eye on self-closing tags (br, hr, ...).
    """

    def test_clean_page_1(self):
        s = '<em>Hello</em><br><script>malicious="<script>"</script>anything'
        r = '<em>Hello</em><br/>anything'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_2(self):
        s = '<em>Hello</em><br /><script>malicious<script></script>anything'
        r = '<em>Hello</em><br/>anything'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_3(self):
        # nesting
        s = '--<figure>xx<figure>yy</figure>zz</figure>..'
        r = '--..'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_4(self):
        # aria-hidden
        s = '--<p aria-hidden=true>xx</p>..'
        r = '--..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden="true">xx</p>..'
        r = '--..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden=false>xx</p>..'
        r = '--<p aria-hidden="false">xx</p>..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden="false">xx</p>..'
        r = '--<p aria-hidden="false">xx</p>..'
        self.assertEqual(str(clean_page(s)), r)
        s = '--<p aria-hidden=??>xx</p>..'
        r = '--<p aria-hidden="??">xx</p>..'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_5(self):
        # no removal
        s = '--<p>xx<em>yy</em></p>..'
        r = '--<p>xx<em>yy</em></p>..'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_6(self):
        # self-closing tags to be removed
        s = '--<area /><p>xx</p>\n...<h1>tt<area /></h1>nn'
        r = '--<p>xx</p>\n...<h1>tt</h1>nn'
        self.assertEqual(str(clean_page(s)), r)

    def test_clean_page_7(self):
        s = '--<p rel=search>tt<area /></p>nn'
        r = '--nn'
        self.assertEqual(str(clean_page(s)), r)
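
clean_page evidently returns a parsed tree (the assertions go through str()) from which script, figure, area, aria-hidden="true" and rel=search elements are dropped together with their nested content, while the tags that stay are normalized (e.g. <br> to <br/>). A usage sketch:

from atextcrawler.utils.html import clean_page

html = '<em>Hello</em><br><script>alert(1)</script><p aria-hidden="true">menu</p>rest'
print(str(clean_page(html)))  # expected along the lines of '<em>Hello</em><br/>rest'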