Skip to content

Commit

Permalink
Merge pull request #89 from Encamina/@lmarcos/add_semantic_document_e…
Browse files Browse the repository at this point in the history
…xtractor

@lmarcos/add semantic document extractor
  • Loading branch information
LuisM000 authored Mar 12, 2024
2 parents 03b558a + 9b1de7f commit 0d801ab
Show file tree
Hide file tree
Showing 13 changed files with 181 additions and 23 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,16 @@ Previous classification is not required if changes are simple or all belong to t

- In `AzureOpenAIOptions` the default value of `ServiceVersion` changes from `V2023_12_01_Preview` to `V2024_02_15_Preview` since the former is **deprecated**.
- In the `QuestionAnsweringFromMemoryQuery` function of the `QuestionAnsweringPlugin`, a `null` value is no longer returned when there are no relevant memory results. Instead, the execution flow continues, prompting a message with empty context information, ultimately resulting in a response such as "I don't know" or a similar message.
- Added a new method `GetDocumentContentAsync` to the `IDocumentContentExtractor` interface, which is now required to be implemented.

### Major Changes

- In interface type `IChatHistoryProvider` added new method `DeleteChatMessagesHistoryAsync` to delete a user's chat history messages. This method is implemented in `ChatHistoryProvider`.
- Added new interface `Encamina.Enmarcha.AI.Abstractions.ISemanticTextSplitter` and its implementations `Encamina.Enmarcha.AI.SemanticTextSplitter` to split a text into meaningful chunks based on embeddings.
- Added a new utility class for mathematical operations `Encamina.Enmarcha.Core.MathUtils`.
- Fixed `DeleteAsync<TEntityId>` method in `CosmosRepository<T>`. This method was always throwing exceptions because the partition key value was always `null`. It is fixed by considering the `Id` to delete the whole partition. If a specific item in the partition should be removed, then use the `DeleteAsync` non-generic method.
- Added `DefaultDocumentContentSemanticExtractor` to retrieve semantic chunks from documents.
- Bug fix in the `MathUtils.Quartiles` method.
- Updated dependencies:
- Updated `Bogus` from `35.4.0` to `35.4.1`.
- Updated `Azure.Core` from `1.37.0` to `1.38.0`.
Expand Down
2 changes: 1 addition & 1 deletion Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

<PropertyGroup>
<VersionPrefix>8.1.5</VersionPrefix>
<VersionSuffix>preview-07</VersionSuffix>
<VersionSuffix>preview-08</VersionSuffix>
</PropertyGroup>

<!--
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,13 @@ public interface IDocumentContentExtractor
/// <param name="fileExtension">The extension of the document file.</param>
/// <returns>The text content of the document.</returns>
IEnumerable<string> GetDocumentContent(Stream stream, string fileExtension);

/// <summary>
/// Asynchronously extracts the text content from a document stream.
/// </summary>
/// <param name="stream">The document stream to read the content from.</param>
/// <param name="fileExtension">The extension of the document file, which identifies the document format.</param>
/// <param name="cancellationToken">A token to cancel the asynchronous operation.</param>
/// <returns>
/// A task that represents the asynchronous operation. The task result contains the text content of the document
/// as a sequence of strings.
/// </returns>
Task<IEnumerable<string>> GetDocumentContentAsync(Stream stream, string fileExtension, CancellationToken cancellationToken);
}
8 changes: 4 additions & 4 deletions src/Encamina.Enmarcha.Core/MathUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ public static double StandardDeviation(IEnumerable<double> values)
/// <returns>A tuple containing the first quartile (Q1) and third quartile (Q3).</returns>
public static (double Q1, double Q3) Quartiles(IEnumerable<double> values)
{
    // Quartiles are positional statistics: the halves below are only meaningful on ordered data,
    // so the input is sorted into a private list first. Materializing it also guarantees that
    // the caller's sequence is enumerated exactly once.
    var sortedValues = values.OrderBy(x => x).ToList();
    var count = sortedValues.Count;

    // Lower half: the first count / 2 items.
    // Upper half: everything after the first (count + 1) / 2 items, which leaves the middle
    // item out of both halves when the number of values is odd.
    var q1 = CalculateMedian(sortedValues.Take(count / 2));
    var q3 = CalculateMedian(sortedValues.Skip((count + 1) / 2));

    return (q1, q3);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
using System.Text;

using Encamina.Enmarcha.AI.Abstractions;
using Encamina.Enmarcha.Core.Extensions;
using Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
using Encamina.Enmarcha.SemanticKernel.Connectors.Document.Resources;
using Encamina.Enmarcha.AI.Abstractions;

using Microsoft.SemanticKernel.Plugins.Document;
using Microsoft.SemanticKernel.Plugins.Document.OpenXml;

namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document;

Expand All @@ -26,22 +20,18 @@ namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document;
/// </remarks>
internal sealed class DefaultDocumentContentExtractor : DocumentContentExtractorBase
{
    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultDocumentContentExtractor"/> class.
    /// </summary>
    /// <param name="textSplitter">The text splitter used by this instance.</param>
    /// <param name="lengthFunction">The function for determining the length of a string.</param>
    public DefaultDocumentContentExtractor(ITextSplitter textSplitter, Func<string, int> lengthFunction)
        : base(textSplitter, lengthFunction)
    {
    }

    /// <inheritdoc/>
    /// <exception cref="NotSupportedException">Thrown when the file extension is not supported.</exception>
    public override IDocumentConnector GetDocumentConnector(string fileExtension)
    {
        // The extension-to-connector mapping lives in a single shared helper so that every default
        // extractor resolves exactly the same connector for a given extension.
        return IDocumentConnectorUtils.GetDefaultDocumentConnector(fileExtension);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
using Encamina.Enmarcha.AI.Abstractions;

using Microsoft.SemanticKernel.Plugins.Document;

namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document;

/// <summary>
/// Out-of-the-box document content extractor that returns the semantic chunks of a document.
/// </summary>
/// <remarks>
/// Documents with the following file extensions are supported by this default implementation:
/// <list type="bullet">
/// <item><c>docx</c></item>
/// <item><c>md</c></item>
/// <item><c>pdf</c></item>
/// <item><c>pptx</c></item>
/// <item><c>txt</c></item>
/// <item><c>vtt</c></item>
/// </list>
/// </remarks>
internal sealed class DefaultDocumentContentSemanticExtractor : DocumentContentSemanticExtractorBase
{
    /// <summary>
    /// Initializes a new instance of the <see cref="DefaultDocumentContentSemanticExtractor"/> class.
    /// </summary>
    /// <param name="semanticTextSplitter">The <see cref="ISemanticTextSplitter"/> that splits the extracted text into semantic chunks.</param>
    /// <param name="embeddingsGeneratorFunction">The function that generates the embeddings used while splitting the text.</param>
    public DefaultDocumentContentSemanticExtractor(
        ISemanticTextSplitter semanticTextSplitter,
        Func<IList<string>, CancellationToken, Task<IList<ReadOnlyMemory<float>>>> embeddingsGeneratorFunction)
        : base(semanticTextSplitter, embeddingsGeneratorFunction)
    {
    }

    /// <inheritdoc/>
    public override IDocumentConnector GetDocumentConnector(string fileExtension)
        => IDocumentConnectorUtils.GetDefaultDocumentConnector(fileExtension);
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ public virtual IEnumerable<string> GetDocumentContent(Stream stream, string file
return TextSplitter.Split(content, LengthFunction);
}

/// <inheritdoc/>
// The synchronous extraction in GetDocumentContent may be slow. Wrapping its result with Task.FromResult
// would still run it inline on the calling thread, so the work is handed to Task.Run instead, which
// executes it on a separate thread and keeps the caller responsive.
public Task<IEnumerable<string>> GetDocumentContentAsync(Stream stream, string fileExtension, CancellationToken cancellationToken)
    => Task.Run(() => GetDocumentContent(stream, fileExtension), cancellationToken);

/// <inheritdoc/>
public abstract IDocumentConnector GetDocumentConnector(string fileExtension);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
using Encamina.Enmarcha.AI.Abstractions;

using Microsoft.SemanticKernel.Plugins.Document;

namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document;

/// <summary>
/// Base class for document content semantic extractors: the text of a document is read through an
/// <see cref="IDocumentConnector"/> and then split into chunks by an <see cref="ISemanticTextSplitter"/>.
/// </summary>
public abstract class DocumentContentSemanticExtractorBase : IDocumentConnectorProvider, IDocumentContentExtractor
{
    /// <summary>
    /// Initializes a new instance of the <see cref="DocumentContentSemanticExtractorBase"/> class.
    /// </summary>
    /// <param name="semanticTextSplitter">A valid instance of <see cref="ISemanticTextSplitter"/> to use when extracting semantic content from documents.</param>
    /// <param name="embeddingsGeneratorFunction">An embeddings function to use when extracting semantic content from documents.</param>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="semanticTextSplitter"/> or <paramref name="embeddingsGeneratorFunction"/> is <see langword="null"/>.
    /// </exception>
    protected DocumentContentSemanticExtractorBase(ISemanticTextSplitter semanticTextSplitter, Func<IList<string>, CancellationToken, Task<IList<ReadOnlyMemory<float>>>> embeddingsGeneratorFunction)
    {
        // Fail fast at construction time instead of with a NullReferenceException on first use.
        SemanticTextSplitter = semanticTextSplitter ?? throw new ArgumentNullException(nameof(semanticTextSplitter));
        EmbeddingsGeneratorFunction = embeddingsGeneratorFunction ?? throw new ArgumentNullException(nameof(embeddingsGeneratorFunction));
    }

    /// <summary>
    /// Gets the text semantic splitter used by this instance of a document content extractor.
    /// </summary>
    protected ISemanticTextSplitter SemanticTextSplitter { get; }

    /// <summary>
    /// Gets the function for generating embeddings from a list of strings.
    /// </summary>
    protected Func<IList<string>, CancellationToken, Task<IList<ReadOnlyMemory<float>>>> EmbeddingsGeneratorFunction { get; }

    /// <inheritdoc/>
    /// <remarks>
    /// This method blocks the calling thread until the asynchronous extraction completes.
    /// Consider using the asynchronous version directly to avoid the risk of blocking the calling thread.
    /// </remarks>
    public IEnumerable<string> GetDocumentContent(Stream stream, string fileExtension)
    {
        // No cancellation is available on the synchronous path.
        return GetDocumentContentAsync(stream, fileExtension, CancellationToken.None).GetAwaiter().GetResult();
    }

    /// <inheritdoc/>
    /// <remarks>
    /// Reading the text from the stream is done synchronously through the document connector; only the
    /// semantic splitting is awaited. Failures (including an unsupported file extension) are reported
    /// through the returned task.
    /// </remarks>
    /// <exception cref="OperationCanceledException">Thrown when <paramref name="cancellationToken"/> is already canceled.</exception>
    public virtual async Task<IEnumerable<string>> GetDocumentContentAsync(Stream stream, string fileExtension, CancellationToken cancellationToken)
    {
        // Do not start the (potentially expensive) synchronous read if the caller already gave up.
        cancellationToken.ThrowIfCancellationRequested();

        var connector = GetDocumentConnector(fileExtension);

        var content = connector.ReadText(stream);

        // ConfigureAwait(false): library code that does not need the caller's context to resume; it also
        // keeps this continuation from waiting on a context blocked by the synchronous wrapper above.
        return await SemanticTextSplitter.SplitAsync(content, EmbeddingsGeneratorFunction, cancellationToken).ConfigureAwait(false);
    }

    /// <inheritdoc/>
    public abstract IDocumentConnector GetDocumentConnector(string fileExtension);
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
<ItemGroup>
<ProjectReference Include="..\Encamina.Enmarcha.AI.Abstractions\Encamina.Enmarcha.AI.Abstractions.csproj" />
<ProjectReference Include="..\Encamina.Enmarcha.Core\Encamina.Enmarcha.Core.csproj" />
<ProjectReference Include="..\Encamina.Enmarcha.DependencyInjection\Encamina.Enmarcha.DependencyInjection.csproj" />
</ItemGroup>

<ItemGroup>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,27 @@ public static IServiceCollection AddDefaultDocumentContentExtractor(this IServic
return services.AddDefaultDocumentContentExtractor();
}

/// <summary>
/// Registers the default semantic implementation of <see cref="IDocumentContentExtractor"/> in the specified
/// <see cref="IServiceCollection"/> as a singleton service.
/// </summary>
/// <param name="services">The <see cref="IServiceCollection"/> to add services to.</param>
/// <returns>The <see cref="IServiceCollection"/> so that additional calls can be chained.</returns>
public static IServiceCollection AddDefaultDocumentContentSemanticExtractor(this IServiceCollection services)
    => services.AddDefaultDocumentContentSemanticExtractor(ServiceLifetime.Singleton);

/// <summary>
/// Registers the default semantic implementation of <see cref="IDocumentContentExtractor"/> in the specified
/// <see cref="IServiceCollection"/> using the given service lifetime.
/// </summary>
/// <param name="services">The <see cref="IServiceCollection"/> to add services to.</param>
/// <param name="serviceLifetime">The lifetime for the registered services.</param>
/// <returns>The <see cref="IServiceCollection"/> so that additional calls can be chained.</returns>
public static IServiceCollection AddDefaultDocumentContentSemanticExtractor(this IServiceCollection services, ServiceLifetime serviceLifetime)
    => services.AddType<IDocumentContentExtractor, DefaultDocumentContentSemanticExtractor>(serviceLifetime);

/// <summary>
/// Adds a default implementation of <see cref="IDocumentConnectorProvider"/> to the specified <see cref="IServiceCollection"/> as a singleton service.
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document;
public interface IDocumentConnectorProvider
{
/// <summary>
/// Determines the most appropriate document connector from an specified file extension.
/// Determines the most appropriate document connector from a specified file extension.
/// </summary>
/// <param name="fileExtension">The file extension.</param>
/// <returns>A valid instance of <see cref="IDocumentConnector"/> that could handle documents from the given file extension.</returns>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
using System.Text;

using Encamina.Enmarcha.Core.Extensions;
using Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
using Encamina.Enmarcha.SemanticKernel.Connectors.Document.Resources;

using Microsoft.SemanticKernel.Plugins.Document;
using Microsoft.SemanticKernel.Plugins.Document.OpenXml;

namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document;

/// <summary>
/// Helper methods shared by the default document content extractors to resolve document connectors.
/// </summary>
internal static class IDocumentConnectorUtils
{
    /// <summary>
    /// Resolves the default document connector for a file extension.
    /// </summary>
    /// <remarks>
    /// The extension is matched case-insensitively and is expected to include the leading dot (for example <c>.pdf</c>).
    /// </remarks>
    /// <param name="fileExtension">The file extension for which to retrieve the connector.</param>
    /// <returns>A new instance of the default document connector for the specified file extension.</returns>
    /// <exception cref="NotSupportedException">Thrown when the file extension is not supported.</exception>
    internal static IDocumentConnector GetDefaultDocumentConnector(string fileExtension)
    {
        // Upper-case once with the invariant culture so the comparison below ignores casing.
        var normalizedExtension = fileExtension.ToUpperInvariant();

        switch (normalizedExtension)
        {
            case @".DOCX":
                return new WordDocumentConnector();

            case @".PDF":
                return new CleanPdfDocumentConnector();

            case @".PPTX":
                return new ParagraphPptxDocumentConnector();

            // Plain text and Markdown files are both read as UTF-8 text.
            case @".TXT":
            case @".MD":
                return new TxtDocumentConnector(Encoding.UTF8);

            case @".VTT":
                return new VttDocumentConnector(Encoding.UTF8);

            default:
                throw new NotSupportedException(ExceptionMessages.ResourceManager.GetFormattedStringByCurrentCulture(nameof(ExceptionMessages.FileExtensionNotSupported), fileExtension));
        }
    }
}
1 change: 1 addition & 0 deletions tst/Encamina.Enmarcha.Core.Tests/MathUtilsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public void Calculate_StandardDeviation_Succeeds(double[] values, double expecte
[InlineData(new double[] { 1, 15, 19, 64 }, 33.5)]
[InlineData(new double[] { 10, 20, 30, 40, 50, 60 }, 30)]
[InlineData(new double[] { 0.1, 0.2, 0.3, 0.4, 0.5 }, 0.3)]
[InlineData(new double[] { 0.3, 0.5, 0.4, 0.2, 0.1 }, 0.3)]
public void Calculate_InterquartileRange_Succeeds(double[] values, double expected)
{
var result = MathUtils.InterquartileRange(values);
Expand Down

0 comments on commit 0d801ab

Please sign in to comment.