diff --git a/src/DocumentProcessor.php b/src/DocumentProcessor.php index 4c6a9d8..3ca8fbe 100644 --- a/src/DocumentProcessor.php +++ b/src/DocumentProcessor.php @@ -24,4 +24,11 @@ enum DocumentProcessor: string * Uses LLamaCloud https://cloud.llamaindex.ai/ as document processor to extract text */ case LLAMAPARSE = 'llama'; + + /** + * The Unstructured processor + * + * Uses Unstructured https://unstructured.io/ as document processor to extract text + */ + case UNSTRUCTURED = 'unstructured'; } diff --git a/tests/ParseProcessorSelectionTest.php b/tests/ParseProcessorSelectionTest.php index b929c99..8791e61 100644 --- a/tests/ParseProcessorSelectionTest.php +++ b/tests/ParseProcessorSelectionTest.php @@ -99,3 +99,34 @@ $mockClient->assertSentCount(1); }); + +test('unstructured can be selected as processor', function () { + $mockClient = MockClient::global([ + ExtractTextRequest::class => MockResponse::fixture('extract-text-empty'), + ]); + + $connector = new ParseConnector('fake', 'http://localhost:5002'); + $connector->withMockClient($mockClient); + + $connector->parse( + url: 'http://localhost/empty.pdf', + options: new ParseOption(DocumentProcessor::UNSTRUCTURED), + ); + + $mockClient->assertSent(ExtractTextRequest::class); + + $mockClient->assertSent(function (Request $request, Response $response) { + if (! $request instanceof ExtractTextRequest) { + return false; + } + + /** @var array */ + $body = $request->body()->all(); + + return $body['url'] === 'http://localhost/empty.pdf' + && $body['mime_type'] === 'application/pdf' + && $body['driver'] === 'unstructured'; + }); + + $mockClient->assertSentCount(1); +});