diff --git a/app/Search/SearchIndex.php b/app/Search/SearchIndex.php
index c7d9d6502e2..36f71f6ccc7 100644
--- a/app/Search/SearchIndex.php
+++ b/app/Search/SearchIndex.php
@@ -16,7 +16,13 @@ class SearchIndex
     /**
      * A list of delimiter characters used to break-up parsed content into terms for indexing.
      */
-    public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
+    public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
+
+    /**
+     * A list of delimiters which are commonly used within a single term but can also indicate a break between terms.
+     * The indexer will index the full term with these delimiters, plus the terms split via these delimiters.
+     */
+    public static string $softDelimiters = ".-";
 
     public function __construct(
         protected EntityProvider $entityProvider
@@ -196,15 +202,39 @@ protected function generateTermScoreMapFromTags(array $tags): array
     protected function textToTermCountMap(string $text): array
     {
         $tokenMap = []; // {TextToken => OccurrenceCount}
-        $splitChars = static::$delimiters;
-        $token = strtok($text, $splitChars);
+        $softDelims = static::$softDelimiters;
+        $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
+        $extendedToken = '';
+        $extendedLen = 0;
+
+        $token = $tokenizer->next();
 
         while ($token !== false) {
-            if (!isset($tokenMap[$token])) {
-                $tokenMap[$token] = 0;
+            $delim = $tokenizer->previousDelimiter();
+
+            // A soft delimiter directly before a non-empty token extends the pending compound term.
+            if ($delim !== '' && str_contains($softDelims, $delim) && $token !== '') {
+                $extendedToken .= $delim . $token;
+                $extendedLen++;
+            } else {
+                if ($extendedLen > 1) {
+                    $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
+                }
+                $extendedToken = $token;
+                $extendedLen = 1;
             }
-            $tokenMap[$token]++;
-            $token = strtok($splitChars);
+
+            // Strict comparison so that the term "0" is indexed; only empty tokens are skipped.
+            if ($token !== '') {
+                $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
+            }
+
+            $token = $tokenizer->next();
+        }
+
+        // Record any compound term still pending once the text is exhausted.
+        if ($extendedLen > 1) {
+            $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
         }
 
         return $tokenMap;
diff --git a/app/Search/SearchOptions.php b/app/Search/SearchOptions.php
index a6f82029920..bf527d9c305 100644
--- a/app/Search/SearchOptions.php
+++ b/app/Search/SearchOptions.php
@@ -181,7 +181,9 @@ protected static function decodeEscapes(string $input): string
     protected static function parseStandardTermString(string $termString): array
     {
         $terms = explode(' ', $termString);
-        $indexDelimiters = SearchIndex::$delimiters;
+        // Split by character (not byte) so multi-byte delimiters survive removal of the soft delimiters.
+        $softDelimiters = mb_str_split(SearchIndex::$softDelimiters);
+        $indexDelimiters = implode('', array_diff(mb_str_split(SearchIndex::$delimiters), $softDelimiters));
         $parsed = [
             'terms' => [],
             'exacts' => [],
diff --git a/app/Search/SearchTextTokenizer.php b/app/Search/SearchTextTokenizer.php
new file mode 100644
index 00000000000..f43fd56f113
--- /dev/null
+++ b/app/Search/SearchTextTokenizer.php
@@ -0,0 +1,88 @@
+length = strlen($this->text);
+    }
+
+    /**
+     * Get the current delimiter to be found.
+     */
+    public function currentDelimiter(): string
+    {
+        return $this->currentDelimiter;
+    }
+
+    /**
+     * Get the previous delimiter found.
+     */
+    public function previousDelimiter(): string
+    {
+        return $this->previousDelimiter;
+    }
+
+    /**
+     * Get the next token between delimiters.
+     * The text is read one whole UTF-8 character at a time so that multi-byte delimiters
+     * (such as guillemets) match exactly, and never split other multi-byte characters.
+     * An empty string is returned when two delimiters are adjacent.
+     * Returns false if there are no further tokens.
+     */
+    public function next(): string|false
+    {
+        $token = '';
+        // Whole-character lookup set; a byte-wise str_contains() check would treat
+        // single bytes of multi-byte delimiters as delimiters in their own right.
+        $delimiterSet = array_flip(mb_str_split($this->delimiters));
+        $index = $this->currentIndex;
+
+        while ($index < $this->length) {
+            // The lead byte gives the byte-length of the UTF-8 character at this offset.
+            $leadByte = ord($this->text[$index]);
+            $charLen = match (true) {
+                $leadByte >= 0xF0 => 4,
+                $leadByte >= 0xE0 => 3,
+                $leadByte >= 0xC0 => 2,
+                default => 1,
+            };
+            $char = substr($this->text, $index, $charLen);
+            $index += $charLen;
+
+            if (isset($delimiterSet[$char])) {
+                $this->previousDelimiter = $this->currentDelimiter;
+                $this->currentDelimiter = $char;
+                $this->currentIndex = $index;
+                return $token;
+            }
+
+            $token .= $char;
+        }
+
+        // Strict comparison so that a trailing "0" token is returned rather than dropped.
+        if ($token !== '') {
+            $this->currentIndex = $this->length;
+            $this->previousDelimiter = $this->currentDelimiter;
+            $this->currentDelimiter = '';
+            return $token;
+        }
+
+        return false;
+    }
+}
diff --git a/tests/Entity/EntitySearchTest.php b/tests/Search/EntitySearchTest.php similarity index 74% rename from tests/Entity/EntitySearchTest.php rename to tests/Search/EntitySearchTest.php index 5ace70e3ab2..9c76d0f7136 100644 --- a/tests/Entity/EntitySearchTest.php +++ b/tests/Search/EntitySearchTest.php @@ -1,12 +1,9 @@ assertDontSee($templatePage->name); } - public function test_sibling_search_for_pages() - { - $chapter = $this->entities->chapterHasPages(); - $this->assertGreaterThan(2, count($chapter->pages), 'Ensure we\'re testing with at least 1 sibling'); - $page = $chapter->pages->first(); - - $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$page->id}&entity_type=page"); - $search->assertSuccessful(); - foreach ($chapter->pages as $page) { - $search->assertSee($page->name); - } - - $search->assertDontSee($chapter->name); - } - - public function test_sibling_search_for_pages_without_chapter() - { - $page = $this->entities->pageNotWithinChapter(); - $bookChildren = $page->book->getDirectVisibleChildren(); - $this->assertGreaterThan(2, count($bookChildren), 'Ensure we\'re testing with at least 1 sibling'); - - $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$page->id}&entity_type=page"); - $search->assertSuccessful(); - foreach ($bookChildren as $child) { - $search->assertSee($child->name); - } - - 
$search->assertDontSee($page->book->name); - } - - public function test_sibling_search_for_chapters() - { - $chapter = $this->entities->chapter(); - $bookChildren = $chapter->book->getDirectVisibleChildren(); - $this->assertGreaterThan(2, count($bookChildren), 'Ensure we\'re testing with at least 1 sibling'); - - $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$chapter->id}&entity_type=chapter"); - $search->assertSuccessful(); - foreach ($bookChildren as $child) { - $search->assertSee($child->name); - } - - $search->assertDontSee($chapter->book->name); - } - - public function test_sibling_search_for_books() - { - $books = Book::query()->take(10)->get(); - $book = $books->first(); - $this->assertGreaterThan(2, count($books), 'Ensure we\'re testing with at least 1 sibling'); - - $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$book->id}&entity_type=book"); - $search->assertSuccessful(); - foreach ($books as $expectedBook) { - $search->assertSee($expectedBook->name); - } - } - - public function test_sibling_search_for_shelves() - { - $shelves = Bookshelf::query()->take(10)->get(); - $shelf = $shelves->first(); - $this->assertGreaterThan(2, count($shelves), 'Ensure we\'re testing with at least 1 sibling'); - - $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$shelf->id}&entity_type=bookshelf"); - $search->assertSuccessful(); - foreach ($shelves as $expectedShelf) { - $search->assertSee($expectedShelf->name); - } - } - - public function test_sibling_search_for_books_provides_results_in_alphabetical_order() - { - $contextBook = $this->entities->book(); - $searchBook = $this->entities->book(); - - $searchBook->name = 'Zebras'; - $searchBook->save(); - - $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextBook->id}&entity_type=book"); - 
$this->withHtml($search)->assertElementNotContains('a:first-child', 'Zebras'); - - $searchBook->name = '1AAAAAAArdvarks'; - $searchBook->save(); - - $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextBook->id}&entity_type=book"); - $this->withHtml($search)->assertElementContains('a:first-child', '1AAAAAAArdvarks'); - } - - public function test_sibling_search_for_shelves_provides_results_in_alphabetical_order() - { - $contextShelf = $this->entities->shelf(); - $searchShelf = $this->entities->shelf(); - - $searchShelf->name = 'Zebras'; - $searchShelf->save(); - - $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextShelf->id}&entity_type=bookshelf"); - $this->withHtml($search)->assertElementNotContains('a:first-child', 'Zebras'); - - $searchShelf->name = '1AAAAAAArdvarks'; - $searchShelf->save(); - - $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextShelf->id}&entity_type=bookshelf"); - $this->withHtml($search)->assertElementContains('a:first-child', '1AAAAAAArdvarks'); - } - public function test_search_works_on_updated_page_content() { $page = $this->entities->page(); @@ -453,75 +343,6 @@ public function test_search_ranks_common_words_lower() $this->withHtml($search)->assertElementContains('.entity-list > .page:nth-child(2)', 'Test page A'); } - public function test_terms_in_headers_have_an_adjusted_index_score() - { - $page = $this->entities->newPage(['name' => 'Test page A', 'html' => ' -
TermA
-' . $text . '
']); - - $termCount = $page->searchTerms()->count(); - - // Expect at least 90% unique rate - $this->assertGreaterThan($count * 0.9, $termCount); - } - - public function test_name_and_content_terms_are_merged_to_single_score() - { - $page = $this->entities->newPage(['name' => 'TermA', 'html' => ' -TermA
- ']); - - $scoreByTerm = $page->searchTerms()->pluck('score', 'term'); - - // Scores 40 for being in the name then 1 for being in the content - $this->assertEquals(41, $scoreByTerm->get('TermA')); - } - - public function test_tag_names_and_values_are_indexed_for_search() - { - $page = $this->entities->newPage(['name' => 'PageA', 'html' => 'content
', 'tags' => [ - ['name' => 'Animal', 'value' => 'MeowieCat'], - ['name' => 'SuperImportant'], - ]]); - - $scoreByTerm = $page->searchTerms()->pluck('score', 'term'); - $this->assertEquals(5, $scoreByTerm->get('MeowieCat')); - $this->assertEquals(3, $scoreByTerm->get('Animal')); - $this->assertEquals(3, $scoreByTerm->get('SuperImportant')); - } - public function test_matching_terms_in_search_results_are_highlighted() { $this->entities->newPage(['name' => 'My Meowie Cat', 'html' => 'A superimportant page about meowieable animals
', 'tags' => [ diff --git a/tests/Search/SearchIndexingTest.php b/tests/Search/SearchIndexingTest.php new file mode 100644 index 00000000000..57cf412e1b7 --- /dev/null +++ b/tests/Search/SearchIndexingTest.php @@ -0,0 +1,109 @@ +entities->newPage(['name' => 'Test page A', 'html' => ' +TermA
+' . $text . '
']); + + $termCount = $page->searchTerms()->count(); + + // Expect at least 90% unique rate + $this->assertGreaterThan($count * 0.9, $termCount); + } + + public function test_name_and_content_terms_are_merged_to_single_score() + { + $page = $this->entities->newPage(['name' => 'TermA', 'html' => ' +TermA
+ ']); + + $scoreByTerm = $page->searchTerms()->pluck('score', 'term'); + + // Scores 40 for being in the name then 1 for being in the content + $this->assertEquals(41, $scoreByTerm->get('TermA')); + } + + public function test_tag_names_and_values_are_indexed_for_search() + { + $page = $this->entities->newPage(['name' => 'PageA', 'html' => 'content
', 'tags' => [ + ['name' => 'Animal', 'value' => 'MeowieCat'], + ['name' => 'SuperImportant'], + ]]); + + $scoreByTerm = $page->searchTerms()->pluck('score', 'term'); + $this->assertEquals(5, $scoreByTerm->get('MeowieCat')); + $this->assertEquals(3, $scoreByTerm->get('Animal')); + $this->assertEquals(3, $scoreByTerm->get('SuperImportant')); + } + + public function test_terms_containing_guillemets_handled() + { + $page = $this->entities->newPage(['html' => '«Hello there» and « there »
']); + + $scoreByTerm = $page->searchTerms()->pluck('score', 'term'); + $expected = ['Hello', 'there', 'and']; + foreach ($expected as $term) { + $this->assertNotNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is indexed"); + } + + $nonExpected = ['«', '»']; + foreach ($nonExpected as $term) { + $this->assertNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is not indexed"); + } + } + + public function test_terms_containing_punctuation_within_retain_original_form_and_split_form_in_index() + { + $page = $this->entities->newPage(['html' => 'super.duper awesome-beans big- barry cheese.
biscuits
a-bs
']); + + $scoreByTerm = $page->searchTerms()->pluck('score', 'term'); + $expected = ['super', 'duper', 'super.duper', 'awesome-beans', 'awesome', 'beans', 'big', 'barry', 'cheese', 'biscuits', 'a-bs', 'a', 'bs']; + foreach ($expected as $term) { + $this->assertNotNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is indexed"); + } + + $nonExpected = ['big-', 'big-barry', 'cheese.', 'cheese.biscuits']; + foreach ($nonExpected as $term) { + $this->assertNull($scoreByTerm->get($term), "Failed asserting that \"$term\" is not indexed"); + } + } +} diff --git a/tests/Entity/SearchOptionsTest.php b/tests/Search/SearchOptionsTest.php similarity index 99% rename from tests/Entity/SearchOptionsTest.php rename to tests/Search/SearchOptionsTest.php index 0c2ad271c58..39c20c19591 100644 --- a/tests/Entity/SearchOptionsTest.php +++ b/tests/Search/SearchOptionsTest.php @@ -1,6 +1,6 @@ entities->chapterHasPages(); + $this->assertGreaterThan(2, count($chapter->pages), 'Ensure we\'re testing with at least 1 sibling'); + $page = $chapter->pages->first(); + + $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$page->id}&entity_type=page"); + $search->assertSuccessful(); + foreach ($chapter->pages as $page) { + $search->assertSee($page->name); + } + + $search->assertDontSee($chapter->name); + } + + public function test_sibling_search_for_pages_without_chapter() + { + $page = $this->entities->pageNotWithinChapter(); + $bookChildren = $page->book->getDirectVisibleChildren(); + $this->assertGreaterThan(2, count($bookChildren), 'Ensure we\'re testing with at least 1 sibling'); + + $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$page->id}&entity_type=page"); + $search->assertSuccessful(); + foreach ($bookChildren as $child) { + $search->assertSee($child->name); + } + + $search->assertDontSee($page->book->name); + } + + public function test_sibling_search_for_chapters() + { + $chapter = 
$this->entities->chapter(); + $bookChildren = $chapter->book->getDirectVisibleChildren(); + $this->assertGreaterThan(2, count($bookChildren), 'Ensure we\'re testing with at least 1 sibling'); + + $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$chapter->id}&entity_type=chapter"); + $search->assertSuccessful(); + foreach ($bookChildren as $child) { + $search->assertSee($child->name); + } + + $search->assertDontSee($chapter->book->name); + } + + public function test_sibling_search_for_books() + { + $books = Book::query()->take(10)->get(); + $book = $books->first(); + $this->assertGreaterThan(2, count($books), 'Ensure we\'re testing with at least 1 sibling'); + + $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$book->id}&entity_type=book"); + $search->assertSuccessful(); + foreach ($books as $expectedBook) { + $search->assertSee($expectedBook->name); + } + } + + public function test_sibling_search_for_shelves() + { + $shelves = Bookshelf::query()->take(10)->get(); + $shelf = $shelves->first(); + $this->assertGreaterThan(2, count($shelves), 'Ensure we\'re testing with at least 1 sibling'); + + $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$shelf->id}&entity_type=bookshelf"); + $search->assertSuccessful(); + foreach ($shelves as $expectedShelf) { + $search->assertSee($expectedShelf->name); + } + } + + public function test_sibling_search_for_books_provides_results_in_alphabetical_order() + { + $contextBook = $this->entities->book(); + $searchBook = $this->entities->book(); + + $searchBook->name = 'Zebras'; + $searchBook->save(); + + $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextBook->id}&entity_type=book"); + $this->withHtml($search)->assertElementNotContains('a:first-child', 'Zebras'); + + $searchBook->name = '1AAAAAAArdvarks'; + $searchBook->save(); + + $search = 
$this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextBook->id}&entity_type=book"); + $this->withHtml($search)->assertElementContains('a:first-child', '1AAAAAAArdvarks'); + } + + public function test_sibling_search_for_shelves_provides_results_in_alphabetical_order() + { + $contextShelf = $this->entities->shelf(); + $searchShelf = $this->entities->shelf(); + + $searchShelf->name = 'Zebras'; + $searchShelf->save(); + + $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextShelf->id}&entity_type=bookshelf"); + $this->withHtml($search)->assertElementNotContains('a:first-child', 'Zebras'); + + $searchShelf->name = '1AAAAAAArdvarks'; + $searchShelf->save(); + + $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextShelf->id}&entity_type=bookshelf"); + $this->withHtml($search)->assertElementContains('a:first-child', '1AAAAAAArdvarks'); + } +}