Skip to content

Commit

Permalink
Small to big retrieval - part 1
Browse files Browse the repository at this point in the history
  • Loading branch information
f-lombardo committed Aug 7, 2024
1 parent fc3aafc commit e1ee96a
Show file tree
Hide file tree
Showing 11 changed files with 197 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/Embeddings/DataReader/FileDataReader.php
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ private function getDocument(string $content, string $entry): mixed
$document->content = $content;
$document->sourceType = $this->sourceType;
$document->sourceName = $entry;
$document->hash = \hash('sha256', $content);

return $document;
}
Expand Down
1 change: 0 additions & 1 deletion src/Embeddings/DocumentSplitter/DocumentSplitter.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ public static function splitDocument(Document $document, int $maxLength = 1000,
return [$document];
}

$chunks = [];
$words = explode($separator, $text);
if ($wordOverlap > 0) {
$chunks = self::createChunksWithOverlap($words, $maxLength, $separator, $wordOverlap);
Expand Down
13 changes: 13 additions & 0 deletions src/Embeddings/DocumentStore/DocumentStore.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?php

namespace LLPhant\Embeddings\DocumentStore;

use LLPhant\Embeddings\Document;

/**
* Contract for a storage backend that can return the chunks of a source document
* selected by their chunk position.
*/
interface DocumentStore
{
/**
* Fetches the chunks belonging to the source identified by $sourceType and $sourceName
* whose chunk numbers fall between $leftIndex and $rightIndex.
*
* NOTE(review): both bounds are presumably inclusive — confirm against the concrete implementations.
*
* @param string $sourceType type of the source the chunks were produced from
* @param string $sourceName name of the source the chunks were produced from
* @param int $leftIndex lowest chunk number of the requested range
* @param int $rightIndex highest chunk number of the requested range
* @return Document[]
*/
public function fetchDocumentsByChunkRange(string $sourceType, string $sourceName, int $leftIndex, int $rightIndex): array;
}
30 changes: 30 additions & 0 deletions src/Query/SemanticSearch/PipeDocumentsTransformer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<?php

namespace LLPhant\Query\SemanticSearch;

/**
 * Composite transformer: feeds the retrieved documents through a fixed sequence of
 * transformers, handing the output of each one to the next.
 */
class PipeDocumentsTransformer implements RetrievedDocumentsTransformer
{
    /**
     * Transformers in the order they are applied.
     *
     * @var RetrievedDocumentsTransformer[]
     */
    private readonly array $transformers;

    /**
     * @param RetrievedDocumentsTransformer ...$transformers steps of the pipeline, applied left to right
     */
    public function __construct(RetrievedDocumentsTransformer ...$transformers)
    {
        $this->transformers = $transformers;
    }

    /**
     * {@inheritDoc}
     */
    public function transformDocuments(array $questions, array $retrievedDocs): array
    {
        // Fold the document list through every step; with no steps the input is returned untouched.
        return \array_reduce(
            $this->transformers,
            static fn (array $currentDocs, RetrievedDocumentsTransformer $step): array => $step->transformDocuments($questions, $currentDocs),
            $retrievedDocs
        );
    }
}
47 changes: 47 additions & 0 deletions src/Query/SemanticSearch/SiblingsDocumentTransformer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<?php

namespace LLPhant\Query\SemanticSearch;

use LLPhant\Embeddings\Document;
use LLPhant\Embeddings\DocumentStore\DocumentStore;

/**
 * Replaces every retrieved chunk with a window of neighbouring chunks ("siblings")
 * loaded from a DocumentStore.
 *
 * For each retrieved document a range of $nrOfSiblings chunk numbers is computed around
 * its chunkNumber and the corresponding chunks are fetched from the store. Chunks that
 * appear in more than one window are returned only once.
 */
class SiblingsDocumentTransformer implements RetrievedDocumentsTransformer
{
    /**
     * @param DocumentStore $documentStore store the sibling chunks are fetched from
     * @param int $nrOfSiblings size of the chunk window requested for each retrieved document
     */
    public function __construct(private readonly DocumentStore $documentStore, private readonly int $nrOfSiblings)
    {
    }

    /**
     * {@inheritDoc}
     *
     * @throws \InvalidArgumentException when a retrieved chunk number is negative or the
     *                                   configured number of siblings is not positive
     */
    public function transformDocuments(array $questions, array $retrievedDocs): array
    {
        /** @var array<string, Document> $uniqueDocs */
        $uniqueDocs = [];

        foreach ($retrievedDocs as $retrievedDoc) {
            [$leftIndex, $rightIndex] = $this->getIndices($retrievedDoc->chunkNumber, $this->nrOfSiblings);

            $siblings = $this->documentStore->fetchDocumentsByChunkRange(
                $retrievedDoc->sourceType,
                $retrievedDoc->sourceName,
                $leftIndex,
                $rightIndex
            );

            foreach ($siblings as $sibling) {
                // Windows of nearby hits overlap; key each chunk by its identity so it is
                // kept once, at the position where it was first seen.
                $identity = \serialize([$sibling->sourceType, $sibling->sourceName, $sibling->chunkNumber]);
                $uniqueDocs[$identity] ??= $sibling;
            }
        }

        return \array_values($uniqueDocs);
    }

    /**
     * Computes the chunk range [left, right] of $numElements consecutive chunk numbers
     * around $position.
     *
     * The window is roughly centred on $position; when $position is too close to 0 the
     * window is shifted right so that it starts at 0 and still spans $numElements chunks.
     *
     * @return int[] two elements: the left index and the right index
     *
     * @throws \InvalidArgumentException when $position is negative or $numElements is not positive
     */
    private function getIndices(int $position, int $numElements): array
    {
        if ($position < 0 || $numElements <= 0) {
            throw new \InvalidArgumentException('Both position and numElements must be positive integers.');
        }

        // Never reach further left than chunk 0.
        $halfDistance = min($position, intdiv($numElements - 1, 2));
        $leftIndex = $position - $halfDistance;
        // Whatever was not taken on the left is added on the right to keep the window size.
        $rightIndex = $position + ($numElements - 1 - $halfDistance);

        return [$leftIndex, $rightIndex];
    }
}
12 changes: 12 additions & 0 deletions tests/Fixtures/DocumentFixtures.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,16 @@ public static function documents(string ...$contents): array

return $result;
}

/**
 * Builds a Document fixture representing chunk number $i of the given source.
 *
 * The content is "Document <i>" and the hash is the SHA-256 digest of that content,
 * the same algorithm FileDataReader uses for real documents.
 */
public static function documentChunk(int $i, string $sourceType, string $sourceName): Document
{
    $document = new Document();
    $document->sourceName = $sourceName;
    $document->sourceType = $sourceType;
    $document->chunkNumber = $i;
    $document->content = 'Document '.$i;
    // Keep fixtures aligned with production hashing (sha256) rather than md5.
    $document->hash = \hash('sha256', $document->content);

    return $document;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?php

declare(strict_types=1);

namespace Tests\Integration\Query\SemanticSearch;

// Placeholder test: the closure is intentionally empty and performs no assertions yet.
// TODO: replace it with a real integration test for the semantic-search query code.
it('Todo: write this test', function () {
});
10 changes: 10 additions & 0 deletions tests/Unit/Embeddings/DataReader/FileDataReaderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,16 @@
['data-pdf.pdf', 'This data is from a pdf'],
]);

// Each document produced by the reader must carry the SHA-256 digest of its own content.
it('computes the hash of the content', function (string $docName) {
    $reader = new FileDataReader(__DIR__.'/FilesTestDirectory/'.$docName);
    $firstDocument = $reader->getDocuments()[0];

    $expectedHash = \hash('sha256', $firstDocument->content);

    expect($firstDocument->hash)->toBe($expectedHash);
})->with([
    'hello.txt', 'hello2.txt',
]);

it('can read pdf and texts ', function () {
$filePath = __DIR__.'/FilesTestDirectory/';
$reader = new FileDataReader($filePath);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@
expect($result[1]->content)->toBe('with one overlapping');
expect($result[2]->content)->toBe('overlapping word');

expect($result[0]->chunkNumber)->toBe(0);
expect($result[1]->chunkNumber)->toBe(1);
expect($result[2]->chunkNumber)->toBe(2);

$document = new Document();
$document->content = 'This is a test with two overlapping words';
$result = DocumentSplitter::splitDocument($document, 20, ' ', 2);
Expand Down
36 changes: 36 additions & 0 deletions tests/Unit/Query/SemanticSearch/PipeDocumentTransformerTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

declare(strict_types=1);

namespace Tests\Unit\Query\SemanticSearch;

use LLPhant\Query\SemanticSearch\PipeDocumentsTransformer;
use LLPhant\Query\SemanticSearch\RetrievedDocumentsTransformer;
use Tests\Fixtures\DocumentFixtures;

/**
 * Creates a stub transformer that appends a space followed by $color to the content
 * of every document it receives and returns the same documents.
 */
function transformer(string $color): RetrievedDocumentsTransformer
{
    return new class($color) implements RetrievedDocumentsTransformer
    {
        public function __construct(private readonly string $color)
        {
        }

        public function transformDocuments(array $questions, array $retrievedDocs): array
        {
            $suffix = ' '.$this->color;

            // Documents are objects, so they are modified in place.
            \array_walk($retrievedDocs, static function ($doc) use ($suffix): void {
                $doc->content .= $suffix;
            });

            return $retrievedDocs;
        }
    };
}

// The pipe must apply its transformers in declaration order to every document.
it('can pipe transformations', function () {
    $pipeline = new PipeDocumentsTransformer(transformer('green'), transformer('white'), transformer('red'));
    $input = DocumentFixtures::documents('one', 'two', 'three');

    $output = $pipeline->transformDocuments(['sample'], $input);

    expect($output[0]->content)->toBe('one green white red');
    expect($output[1]->content)->toBe('two green white red');
    expect($output[2]->content)->toBe('three green white red');
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

declare(strict_types=1);

namespace Tests\Unit\Query\SemanticSearch;

use LLPhant\Embeddings\Document;
use LLPhant\Embeddings\DocumentStore\DocumentStore;
use LLPhant\Query\SemanticSearch\SiblingsDocumentTransformer;
use Tests\Fixtures\DocumentFixtures;

/**
 * Creates an in-memory DocumentStore stub that fabricates one fixture chunk for every
 * chunk number from $leftIndex to $rightIndex (both included).
 */
function documentStore(): DocumentStore
{
    return new class implements DocumentStore
    {
        public function fetchDocumentsByChunkRange(string $sourceType, string $sourceName, int $leftIndex, int $rightIndex): array
        {
            // range() would count downwards for an inverted interval; an inverted interval yields nothing.
            if ($leftIndex > $rightIndex) {
                return [];
            }

            return \array_map(
                static fn (int $chunkNumber): Document => DocumentFixtures::documentChunk($chunkNumber, $sourceType, $sourceName),
                \range($leftIndex, $rightIndex)
            );
        }
    };
}

// Chunk 7 with a window of 20 cannot be centred (it would start below 0),
// so the window is expected to cover chunks 0 through 19.
it('can extract right data from document store', function () {
    $retrieved = [DocumentFixtures::documentChunk(7, 'txt', 'test')];
    $siblingsTransformer = new SiblingsDocumentTransformer(documentStore(), 20);

    $expanded = $siblingsTransformer->transformDocuments(['Sample question'], $retrieved);

    expect(count($expanded))->toBe(20);
    expect($expanded[0]->chunkNumber)->toBe(0);
    expect($expanded[19]->chunkNumber)->toBe(19);
});

0 comments on commit e1ee96a

Please sign in to comment.