-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #89 from Encamina/@lmarcos/add_semantic_document_extractor
@lmarcos/add semantic document extractor
- Loading branch information
Showing
13 changed files
with
181 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
37 changes: 37 additions & 0 deletions
37
...na.Enmarcha.SemanticKernel.Connectors.Document/DefaultDocumentContentSemanticExtractor.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
using Encamina.Enmarcha.AI.Abstractions; | ||
|
||
using Microsoft.SemanticKernel.Plugins.Document; | ||
|
||
namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document; | ||
|
||
/// <summary>
/// Document content semantic extractor that resolves its document connectors from the library defaults.
/// </summary>
/// <remarks>
/// The file extensions supported by this default implementation are:
/// <list type="bullet">
///   <item><c>docx</c></item>
///   <item><c>md</c></item>
///   <item><c>pdf</c></item>
///   <item><c>pptx</c></item>
///   <item><c>txt</c></item>
///   <item><c>vtt</c></item>
/// </list>
/// </remarks>
internal sealed class DefaultDocumentContentSemanticExtractor : DocumentContentSemanticExtractorBase
{
    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultDocumentContentSemanticExtractor"/> class.
    /// </summary>
    /// <param name="semanticTextSplitter">The <see cref="ISemanticTextSplitter"/> forwarded to the base class to split the text read from documents.</param>
    /// <param name="embeddingsGeneratorFunction">The embeddings function forwarded to the base class and handed to the splitter.</param>
    public DefaultDocumentContentSemanticExtractor(
        ISemanticTextSplitter semanticTextSplitter,
        Func<IList<string>, CancellationToken, Task<IList<ReadOnlyMemory<float>>>> embeddingsGeneratorFunction)
        : base(semanticTextSplitter, embeddingsGeneratorFunction)
    {
    }

    /// <inheritdoc/>
    public override IDocumentConnector GetDocumentConnector(string fileExtension)
        => IDocumentConnectorUtils.GetDefaultDocumentConnector(fileExtension);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
52 changes: 52 additions & 0 deletions
52
...amina.Enmarcha.SemanticKernel.Connectors.Document/DocumentContentSemanticExtractorBase.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
using Encamina.Enmarcha.AI.Abstractions; | ||
|
||
using Microsoft.SemanticKernel.Plugins.Document; | ||
|
||
namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document; | ||
|
||
/// <summary>
/// Base class for document content semantic extractors.
/// </summary>
/// <remarks>
/// The extraction reads the text of a document through the <see cref="IDocumentConnector"/> returned by
/// <see cref="GetDocumentConnector(string)"/> and then splits that text with the configured
/// <see cref="ISemanticTextSplitter"/> and embeddings function.
/// </remarks>
public abstract class DocumentContentSemanticExtractorBase : IDocumentConnectorProvider, IDocumentContentExtractor
{
    /// <summary>
    /// Initializes a new instance of the <see cref="DocumentContentSemanticExtractorBase"/> class.
    /// </summary>
    /// <param name="semanticTextSplitter">A valid instance of <see cref="ISemanticTextSplitter"/> to use when extracting semantic content from documents.</param>
    /// <param name="embeddingsGeneratorFunction">An embeddings function to use when extracting semantic content from documents.</param>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="semanticTextSplitter"/> or <paramref name="embeddingsGeneratorFunction"/> is <see langword="null"/>.
    /// </exception>
    protected DocumentContentSemanticExtractorBase(ISemanticTextSplitter semanticTextSplitter, Func<IList<string>, CancellationToken, Task<IList<ReadOnlyMemory<float>>>> embeddingsGeneratorFunction)
    {
        // Fail at construction time instead of with a NullReferenceException on the first extraction.
        SemanticTextSplitter = semanticTextSplitter ?? throw new ArgumentNullException(nameof(semanticTextSplitter));
        EmbeddingsGeneratorFunction = embeddingsGeneratorFunction ?? throw new ArgumentNullException(nameof(embeddingsGeneratorFunction));
    }

    /// <summary>
    /// Gets the text semantic splitter used by this instance of a document content extractor.
    /// </summary>
    protected ISemanticTextSplitter SemanticTextSplitter { get; }

    /// <summary>
    /// Gets the function for generating embeddings from a list of strings.
    /// </summary>
    protected Func<IList<string>, CancellationToken, Task<IList<ReadOnlyMemory<float>>>> EmbeddingsGeneratorFunction { get; }

    /// <inheritdoc/>
    /// <remarks>
    /// This method blocks the calling thread until <see cref="GetDocumentContentAsync(Stream, string, CancellationToken)"/> completes.
    /// Consider using the asynchronous version directly to avoid the risk of blocking the calling thread.
    /// </remarks>
    public IEnumerable<string> GetDocumentContent(Stream stream, string fileExtension)
    {
        return GetDocumentContentAsync(stream, fileExtension, default).GetAwaiter().GetResult();
    }

    /// <inheritdoc/>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="stream"/> is <see langword="null"/>.</exception>
    public virtual Task<IEnumerable<string>> GetDocumentContentAsync(Stream stream, string fileExtension, CancellationToken cancellationToken)
    {
        if (stream is null)
        {
            throw new ArgumentNullException(nameof(stream));
        }

        // Validation of the file extension is left to the connector provider, since derived classes decide what they support.
        var connector = GetDocumentConnector(fileExtension);

        // The connector reads the whole text synchronously; only the semantic split is asynchronous.
        var content = connector.ReadText(stream);

        return SemanticTextSplitter.SplitAsync(content, EmbeddingsGeneratorFunction, cancellationToken);
    }

    /// <inheritdoc/>
    public abstract IDocumentConnector GetDocumentConnector(string fileExtension);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
36 changes: 36 additions & 0 deletions
36
src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/IDocumentConnectorUtils.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
using System.Text; | ||
|
||
using Encamina.Enmarcha.Core.Extensions; | ||
using Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors; | ||
using Encamina.Enmarcha.SemanticKernel.Connectors.Document.Resources; | ||
|
||
using Microsoft.SemanticKernel.Plugins.Document; | ||
using Microsoft.SemanticKernel.Plugins.Document.OpenXml; | ||
|
||
namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document; | ||
|
||
/// <summary>
/// Utility class providing methods for working with document connectors.
/// </summary>
internal static class IDocumentConnectorUtils
{
    /// <summary>
    /// Gets the default document connector based on the specified file extension.
    /// </summary>
    /// <remarks>
    /// The match is case-insensitive and the extension may be given with or without its leading period
    /// (for example, both <c>.pdf</c> and <c>pdf</c> are accepted).
    /// </remarks>
    /// <param name="fileExtension">The file extension for which to retrieve the connector.</param>
    /// <returns>An instance of the default document connector for the specified file extension.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="fileExtension"/> is <see langword="null"/>.</exception>
    /// <exception cref="NotSupportedException">Thrown when the file extension is not supported.</exception>
    internal static IDocumentConnector GetDefaultDocumentConnector(string fileExtension)
    {
        if (fileExtension is null)
        {
            throw new ArgumentNullException(nameof(fileExtension));
        }

        // Tolerate extensions passed without the leading period so that "pdf" resolves like ".pdf".
        var hasLeadingPeriod = fileExtension.Length > 0 && fileExtension[0] == '.';
        var normalizedExtension = hasLeadingPeriod ? fileExtension : "." + fileExtension;

        return normalizedExtension.ToUpperInvariant() switch
        {
            @".DOCX" => new WordDocumentConnector(),
            @".PDF" => new CleanPdfDocumentConnector(),
            @".PPTX" => new ParagraphPptxDocumentConnector(),
            @".TXT" => new TxtDocumentConnector(Encoding.UTF8),
            @".MD" => new TxtDocumentConnector(Encoding.UTF8), // Markdown is read through the same connector as plain text.
            @".VTT" => new VttDocumentConnector(Encoding.UTF8),

            // The message reports the extension exactly as the caller supplied it.
            _ => throw new NotSupportedException(ExceptionMessages.ResourceManager.GetFormattedStringByCurrentCulture(nameof(ExceptionMessages.FileExtensionNotSupported), fileExtension)),
        };
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters