54 lines
1.7 KiB
Python
54 lines
1.7 KiB
Python
"""
|
|
Test cases for the similarity utilities (simhash creation and search).
|
|
"""
|
|
|
|
from unittest import TestCase
|
|
from simhash import Simhash, SimhashIndex
|
|
from atextcrawler.utils.similarity import (
|
|
create_simhash,
|
|
get_features,
|
|
get_simhash,
|
|
postgresql_bigint_offset,
|
|
search_simhash,
|
|
)
|
|
|
|
|
|
class SimhashTest(TestCase):
    """
    Exercise the simhash helpers: searching an index and adding to it.
    """

    def test_search(self):
        """
        Query a pre-populated index and compare the returned ids.
        """
        # 16-bit patterns; `near_a` differs from `base_a` in the three
        # lowest bits, and `near_b` differs from `base_b` the same way.
        # `distant` differs from every indexed value in more than three
        # bit positions.
        base_a, near_a, base_b, near_b, distant = (
            int(bits, 2)
            for bits in (
                '1111111100000000',
                '1111111100000111',
                '1000000000000000',
                '1000000000000111',
                '1000001111000000',
            )
        )
        entries = [
            ('1', Simhash(base_a)),
            ('3', Simhash(base_b)),
            ('4', Simhash(near_b)),
        ]
        # NOTE(review): k is presumably the tolerated bit distance of the
        # simhash package's index -- confirm against its documentation.
        index = SimhashIndex(entries, k=3)

        # (query value, ids expected from search_simhash), checked in order.
        expectations = (
            (distant, []),
            (base_a, [1]),
            (near_a, [1]),
            (near_b, [3, 4]),
        )
        for query_value, expected_ids in expectations:
            found = search_simhash(index, Simhash(query_value))
            self.assertEqual(found, expected_ids)

    def test_create(self):
        """
        Add two hashes to an empty index and look them up again.
        """
        index = SimhashIndex([], k=3)
        repeated_text = 'hello ' * 20
        stored_1 = create_simhash(index, 101, get_simhash(repeated_text))
        stored_2 = create_simhash(index, 102, get_simhash('another one'))

        # The values returned by create_simhash are shifted by
        # postgresql_bigint_offset before being wrapped in Simhash again.
        probes = [
            Simhash(stored + postgresql_bigint_offset)
            for stored in (stored_1, stored_2)
        ]
        for probe, expected_id in zip(probes, (101, 102)):
            found = search_simhash(index, probe)
            self.assertEqual(found, [expected_id])

        # The first text with one extra character appended is expected
        # to be reported as matching the first entry.
        near_duplicate = get_simhash(repeated_text + 'X')
        found = search_simhash(index, near_duplicate)
        self.assertEqual(found, [101])
|