Put under version control

This commit is contained in:
ibu 2021-11-29 09:16:31 +00:00
parent d26d23348b
commit a6af5b12d2
83 changed files with 20130 additions and 0 deletions

7
tests/__init__.py Normal file
View file

@ -0,0 +1,7 @@
from .annotation import AnnotateTest
from .date_finder import DateFinderTest
from .page import PageCleanTest
from .section import IterSectionTest, AggSectionTest
from .simhash import SimhashTest
from .text import CleanHtmlTest
from .durl import DurlTest

49
tests/annotation.py Normal file
View file

@ -0,0 +1,49 @@
"""
Test cases for the annotation utility.
"""
from unittest import TestCase
from atextcrawler.utils.annotation import annotate
class AnnotateTest(TestCase):
    """
    Check the plain text and the annotations returned by ``annotate``.

    Keep in mind that <br> and <hr> have no closing tag.
    """

    def _annotations_for(self, markup, expected_text):
        """
        Annotate *markup*, check the resulting text, return the annotations.
        """
        plain, annotations = annotate(markup)
        self.assertEqual(plain, expected_text)
        return annotations

    def test_annotate_1(self):
        """Two inline elements separated by <br>."""
        annotations = self._annotations_for(
            '<em>Hello</em><br><strong>world</strong>',
            ' Hello world',
        )
        self.assertEqual(annotations['semantic_breaks'], {0: 80, 6: 80})
        self.assertEqual(annotations['section_ids'], {})

    def test_annotate_2(self):
        """Same as case 1, with extra whitespace inside the inline tags."""
        annotations = self._annotations_for(
            '<em> Hello </em><br><strong> world </strong>',
            ' Hello world',
        )
        self.assertEqual(annotations['semantic_breaks'], {0: 80, 6: 80})
        self.assertEqual(annotations['section_ids'], {})

    def test_annotate_3(self):
        """A paragraph containing an inline element."""
        annotations = self._annotations_for(
            '<p> Hello <em>world</em> </p> ',
            ' Hello world',
        )
        self.assertEqual(annotations['semantic_breaks'], {0: 60})

    def test_annotate_4(self):
        """A paragraph wrapped in a <div> that carries an id."""
        annotations = self._annotations_for(
            '<div id = "ref1"><p>Hello <em>world</em> </p> </div>',
            ' Hello world',
        )
        self.assertEqual(annotations['semantic_breaks'], {0: 60})
        self.assertEqual(annotations['section_ids'], {0: ['ref1']})

    def test_annotate_5(self):
        """A <br> carrying an id, followed by text and a paragraph."""
        annotations = self._annotations_for(
            '<br id="ref2"> Hello <p>world </p> ',
            ' Hello world',
        )
        self.assertEqual(annotations['semantic_breaks'], {0: 80, 6: 60})
        self.assertEqual(annotations['section_ids'], {1: ['ref2']})

20
tests/date_finder.py Normal file
View file

@ -0,0 +1,20 @@
from datetime import datetime
from unittest import TestCase
from atextcrawler.utils.date_finder import extract_latest_date
class DateFinderTest(TestCase):
    """Check ``extract_latest_date`` against a few hand-written strings."""

    def test_extract_latest_date(self):
        """
        Compare the extracted date with the expected one for each sample.

        Every sample runs in its own ``subTest`` so that one failing
        sample does not hide the results of the others.
        """
        # Each entry: (input text, keyword arguments, expected result).
        cases = [
            ('test 1987-2+1-no', {}, datetime(1987, 2, 1)),
            (
                '2020-04-06, whatever and 1987-2-1, 1/20/2021',
                {'lang': 'de'},
                datetime(2020, 4, 6),
            ),
            # NOTE(review): 2022-04-06 is expected to be ignored here,
            # presumably because it lay in the future when this test was
            # written. If so, this sample depends on the current date;
            # confirm against extract_latest_date and consider deriving
            # the "future" date from the current year.
            (
                'test 2022-04-06, whatever and 1987-2-1, 1/20/2021',
                {'lang': 'en'},
                datetime(2021, 1, 20),
            ),
            ('', {}, None),
        ]
        for text, kwargs, expected in cases:
            with self.subTest(text=text, **kwargs):
                self.assertEqual(extract_latest_date(text, **kwargs), expected)

68
tests/durl.py Normal file
View file

@ -0,0 +1,68 @@
from unittest import IsolatedAsyncioTestCase
import asyncpg
from atextcrawler.utils.durl import Durl
from atextcrawler.config import Config
from atextcrawler.db import PGPool
class DurlTest(IsolatedAsyncioTestCase):
    """
    Check URL parsing by ``Durl``.

    Every test runs with an entered ``PGPool`` (configured from the
    'postgresql' section of the config) and one acquired connection.
    """

    async def asyncSetUp(self):
        """Enter the pool and acquire one connection from it."""
        config = Config().get()
        self.pool = PGPool(config['postgresql'])
        await self.pool.__aenter__()
        self.conn = await self.pool.pool.acquire()

    async def asyncTearDown(self):
        """Release the connection and close the pool."""
        try:
            await self.pool.pool.release(self.conn)
        finally:
            # Close the pool even when releasing the connection failed,
            # so that a failing release does not leak the pool.
            await self.pool.pool.close()

    async def test_durl_basic(self):
        """Check the parts of an absolute URL and the scheme handling."""
        durl1 = await Durl('https://U:Pw@www.EXAMPLE.com:8000/hello?world#a')
        self.assertEqual(durl1.scheme, 'https')
        self.assertEqual(durl1.netloc, 'U:Pw@www.example.com:8000')
        self.assertEqual(durl1.port, 8000)
        self.assertEqual(durl1.path, '/hello')
        self.assertEqual(durl1.fragment, '')
        self.assertEqual(durl1.pwa(), 'hello?world')
        self.assertEqual(durl1.site(), 'https://U:Pw@www.example.com:8000/')
        self.assertEqual(
            durl1.url(), 'https://U:Pw@www.example.com:8000/hello?world'
        )
        self.assertEqual(durl1.has_path(), True)

        durl2 = await Durl('http://www.example.com/')
        self.assertEqual(durl2.has_path(), False)

        # An ftp URL is expected to yield no Durl at all.
        durl3 = await Durl('ftp://www.example.com/')
        self.assertIsNone(durl3)

    async def test_durl_with_base(self):
        """Check absolute and relative URLs given together with a base."""
        durl1 = await Durl('https://www.example.com')
        self.assertEqual(durl1.path, '/')
        self.assertEqual(durl1.pwa(), '')
        self.assertEqual(durl1.has_path(), False)

        durl2 = await Durl('https://www.example.com/hello2', base=durl1)
        self.assertEqual(durl2.hostname, 'www.example.com')
        self.assertEqual(durl2.path, '/hello2')
        self.assertEqual(durl2.pwa(), 'hello2')

        durl3 = await Durl('/hello3?x=1', base=durl1)
        self.assertEqual(durl3.hostname, 'www.example.com')
        self.assertEqual(durl3.path, '/hello3')
        self.assertEqual(durl3.pwa(), 'hello3?x=1')
        self.assertEqual(durl3.site(), 'https://www.example.com/')

        # A URL on a different host than the base is expected to yield None.
        durl4 = await Durl('https://www.kernel.org/', base=durl1)
        self.assertIsNone(durl4)

    async def test_durl_with_base_and_match_base(self):
        """Check ``match_base=True`` against a base that has a path."""
        durl1 = await Durl('https://www.example.com/base/path/')
        self.assertEqual(durl1.path, '/base/path/')
        self.assertEqual(durl1.pwa(), 'base/path/')
        self.assertEqual(durl1.has_path(), True)

        # A URL above the base path is expected to yield None.
        durl2 = await Durl(
            'https://www.example.com/base/', base=durl1, match_base=True
        )
        self.assertIsNone(durl2)

        # A URL below the base path: pwa() is expected without the base path.
        durl3 = await Durl(
            'https://www.example.com/base/path/whatever?x=1#a',
            base=durl1,
            match_base=True,
        )
        self.assertEqual(durl3.pwa(), 'whatever?x=1')

24
tests/page.py Normal file
View file

@ -0,0 +1,24 @@
"""
Test cases for resource type page.
"""
from unittest import TestCase
from atextcrawler.utils.html import clean_body
# from atextcrawler.utils.tag import drop_tags
class PageCleanTest(TestCase):
    """Check ``clean_body``."""

    def test_clean_body_1(self):
        """Expect the surrounding whitespace gone and the markup kept."""
        raw = ' <em>Hello</em> <strong>world</strong> '
        self.assertEqual(
            clean_body(raw),
            '<em>Hello</em> <strong>world</strong>',
        )

    # Disabled test, kept for reference; it needs ``drop_tags``, whose
    # import is commented out as well.
    #
    # def test_drop_tags(self):
    #     s = '<figure what="ever">something<figure>else</figure>...</figure>'
    #     r = drop_tags(s)
    #     self.assertEqual(r, '')
    #     s = '<rt><rt><rt><rt>something</rt></rt></rt></rt>'
    #     r = drop_tags(s)
    #     self.assertEqual(r, '')

105
tests/section.py Normal file
View file

@ -0,0 +1,105 @@
from unittest import TestCase
from atextcrawler.utils.section import concat_section_texts, iter_sections
class IterSectionTest(TestCase):
    """Check the tuples produced by ``iter_sections``."""

    # Text shared by all cases.
    TEXT = 'abcdefghijklmnopqrstuvwxyz'

    def _sections(self, breaks, max_level):
        """Return all sections of ``TEXT`` for *breaks* as a list."""
        return list(iter_sections(self.TEXT, breaks, max_level=max_level))

    def test_iter_sections_1(self):
        """All break levels lie below max_level; no break at the text end."""
        actual = self._sections({0: 80, 5: 2, 15: 1, 20: 3}, 100)
        expected = [
            (0, 5, 80, 'bcde'),
            (5, 15, 2, 'ghijklmno'),
            (15, 20, 1, 'qrst'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(actual, expected)

    def test_iter_sections_2(self):
        """Like case 1, with an additional break at the text end (26)."""
        actual = self._sections({0: 4, 5: 2, 15: 1, 20: 3, 26: 9}, 100)
        expected = [
            (0, 5, 4, 'bcde'),
            (5, 15, 2, 'ghijklmno'),
            (15, 20, 1, 'qrst'),
            (20, 26, 3, 'vwxyz'),
        ]
        self.assertEqual(actual, expected)

    def test_iter_sections_3(self):
        """
        No break at 0, and one break (level 60) above max_level (59).

        The expected result starts with a level-80 section at 0 and has
        no section starting at the level-60 break.
        """
        actual = self._sections({5: 2, 15: 60, 18: 50, 20: 3}, 59)
        expected = [
            (0, 5, 80, 'bcde'),
            (5, 18, 2, 'ghijklmnopqr'),
            (18, 20, 50, 't'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(actual, expected)

    def test_iter_sections_4(self):
        """Like case 3, with a second level-60 break inside the last section."""
        actual = self._sections({5: 2, 15: 60, 18: 50, 20: 3, 24: 60}, 59)
        expected = [
            (0, 5, 80, 'bcde'),
            (5, 18, 2, 'ghijklmnopqr'),
            (18, 20, 50, 't'),
            (20, 26, 3, 'uvwxyz'),
        ]
        self.assertEqual(actual, expected)
class AggSectionTest(TestCase):
    """Check the (section indexes, text) pairs of ``concat_section_texts``."""

    # Text shared by all cases.
    TEXT = 'abcdefghijklmnopqrstuvwxyz'

    def _concat(self, breaks):
        """Return the concatenated sections of ``TEXT`` with min_len=10."""
        return list(concat_section_texts(self.TEXT, breaks, min_len=10))

    def test_concat_sections_1(self):
        """Four breaks, none at the text end."""
        actual = self._concat({0: 1, 5: 1, 15: 1, 20: 1})
        expected = [
            ([0, 1], 'abcdefghijklmno'),
            ([2, 3], 'pqrstuvwxyz'),
        ]
        self.assertEqual(actual, expected)

    def test_concat_sections_2(self):
        """A very short first section and a break at the text end."""
        actual = self._concat({0: 1, 2: 1, 10: 1, 20: 1, 26: 1})
        expected = [
            ([0, 1], 'abcdefghij'),
            ([2, 3, 4], 'klmnopqrstuvwxyz'),
        ]
        self.assertEqual(actual, expected)

    def test_concat_sections_3(self):
        """Two short sections at the start."""
        actual = self._concat({0: 1, 4: 1, 6: 1, 16: 1, 26: 1})
        expected = [
            ([0, 1, 2], 'abcdefghijklmnop'),
            ([3, 4], 'qrstuvwxyz'),
        ]
        self.assertEqual(actual, expected)

    def test_concat_sections_4(self):
        """Three sections plus a break at the text end."""
        actual = self._concat({0: 1, 5: 1, 15: 1, 26: 1})
        expected = [
            ([0, 1], 'abcdefghijklmno'),
            ([2, 3], 'pqrstuvwxyz'),
        ]
        self.assertEqual(actual, expected)

    def test_concat_sections_5(self):
        """Several very short sections near the text end."""
        actual = self._concat(
            {0: 1, 5: 1, 12: 1, 22: 1, 23: 1, 24: 1, 26: 1}
        )
        expected = [
            ([0, 1], 'abcdefghijkl'),
            ([2, 3, 4, 5, 6], 'mnopqrstuvwxyz'),
        ]
        self.assertEqual(actual, expected)

54
tests/simhash.py Normal file
View file

@ -0,0 +1,54 @@
"""
Test cases for the similarity (simhash) utilities.
"""
from unittest import TestCase
from simhash import Simhash, SimhashIndex
from atextcrawler.utils.similarity import (
create_simhash,
get_features,
get_simhash,
postgresql_bigint_offset,
search_simhash,
)
class SimhashTest(TestCase):
    """
    Check creating simhashes and searching them in a ``SimhashIndex``.
    """

    def test_search(self):
        """Search an index (k=3) holding three hashes with string ids."""
        n1 = int('1111111100000000', 2)
        n2 = int('1111111100000111', 2)
        n3 = int('1000000000000000', 2)
        n4 = int('1000000000000111', 2)
        n5 = int('1000001111000000', 2)
        # n2 and n5 are not indexed; they only serve as queries.
        index = SimhashIndex(
            [
                ('1', Simhash(n1)),
                ('3', Simhash(n3)),
                ('4', Simhash(n4)),
            ],
            k=3,
        )
        # Each entry: (queried value, ids expected from search_simhash).
        queries = (
            (n5, []),
            (n1, [1]),
            (n2, [1]),
            (n4, [3, 4]),
        )
        for value, expected_ids in queries:
            self.assertEqual(
                search_simhash(index, Simhash(value)), expected_ids
            )

    def test_create(self):
        """Add two texts to an empty index and find them again."""
        index = SimhashIndex([], k=3)
        stored_1 = create_simhash(index, 101, get_simhash('hello ' * 20))
        stored_2 = create_simhash(index, 102, get_simhash('another one'))

        # The values returned by create_simhash are turned back into
        # Simhash objects by adding postgresql_bigint_offset.
        restored_1 = Simhash(stored_1 + postgresql_bigint_offset)
        restored_2 = Simhash(stored_2 + postgresql_bigint_offset)
        self.assertEqual(search_simhash(index, restored_1), [101])
        self.assertEqual(search_simhash(index, restored_2), [102])

        # A text differing by one appended character is expected to
        # match the first entry.
        near_duplicate = get_simhash('hello ' * 20 + 'X')
        self.assertEqual(search_simhash(index, near_duplicate), [101])

65
tests/text.py Normal file
View file

@ -0,0 +1,65 @@
"""
Test cases for text util.
"""
from unittest import TestCase
from atextcrawler.utils.html import clean_page
class CleanHtmlTest(TestCase):
    """
    Check the serialized result of ``clean_page``.

    Watch out for tags without a closing tag (br, hr, ...).
    """

    def _assert_cleaned(self, html, expected):
        """Assert that ``str(clean_page(html))`` equals *expected*."""
        self.assertEqual(str(clean_page(html)), expected)

    def test_clean_page_1(self):
        """A <script> whose content contains a quoted '<script>'."""
        self._assert_cleaned(
            '<em>Hello</em><br><script>malicious="<script>"</script>anything',
            '<em>Hello</em><br/>anything',
        )

    def test_clean_page_2(self):
        """A <script> whose content contains an unquoted '<script>'."""
        self._assert_cleaned(
            '<em>Hello</em><br /><script>malicious<script></script>anything',
            '<em>Hello</em><br/>anything',
        )

    def test_clean_page_3(self):
        """Nested <figure> elements are expected to vanish completely."""
        self._assert_cleaned(
            '--<figure>xx<figure>yy</figure>zz</figure>..',
            '--..',
        )

    def test_clean_page_4(self):
        """Handling of the aria-hidden attribute for several values."""
        # Each entry: (input markup, expected output).
        samples = (
            ('--<p aria-hidden=true>xx</p>..', '--..'),
            ('--<p aria-hidden="true">xx</p>..', '--..'),
            (
                '--<p aria-hidden=false>xx</p>..',
                '--<p aria-hidden="false">xx</p>..',
            ),
            (
                '--<p aria-hidden="false">xx</p>..',
                '--<p aria-hidden="false">xx</p>..',
            ),
            (
                '--<p aria-hidden=??>xx</p>..',
                '--<p aria-hidden="??">xx</p>..',
            ),
        )
        for html, expected in samples:
            self._assert_cleaned(html, expected)

    def test_clean_page_5(self):
        """Markup with nothing to remove is expected to stay unchanged."""
        markup = '--<p>xx<em>yy</em></p>..'
        self._assert_cleaned(markup, '--<p>xx<em>yy</em></p>..')

    def test_clean_page_6(self):
        """Self-closing <area /> tags are expected to be removed."""
        self._assert_cleaned(
            '--<area /><p>xx</p>\n...<h1>tt<area /></h1>nn',
            '--<p>xx</p>\n...<h1>tt</h1>nn',
        )

    def test_clean_page_7(self):
        """A <p rel=search> is expected to be removed with its content."""
        self._assert_cleaned('--<p rel=search>tt<area /></p>nn', '--nn')