From f2c3419dfe96df58d9751f346c91074959f68dbc Mon Sep 17 00:00:00 2001 From: Luis Marcos Rivera Date: Thu, 7 Mar 2024 14:52:40 +0100 Subject: [PATCH 1/4] Added initial version of SemanticTextChunker --- Enmarcha.sln | 7 + .../BreakpointThresholdType.cs | 22 ++ .../Encamina.Enmarcha.AI.Abstractions.csproj | 1 + .../ISemanticTextSplitter.cs | 16 ++ .../SemanticTextSplitterOptions.cs | 41 ++++ .../TextSplitters/SemanticTextSplitter.cs | 198 ++++++++++++++++++ src/Encamina.Enmarcha.Core/MathUtils.cs | 82 ++++++++ .../Encamina.Enmarcha.Core.Tests.csproj | 11 + .../MathUtilsTests.cs | 44 ++++ tst/Encamina.Enmarcha.Core.Tests/Usings.cs | 1 + 10 files changed, 423 insertions(+) create mode 100644 src/Encamina.Enmarcha.AI.Abstractions/BreakpointThresholdType.cs create mode 100644 src/Encamina.Enmarcha.AI.Abstractions/ISemanticTextSplitter.cs create mode 100644 src/Encamina.Enmarcha.AI.Abstractions/SemanticTextSplitterOptions.cs create mode 100644 src/Encamina.Enmarcha.AI/TextSplitters/SemanticTextSplitter.cs create mode 100644 src/Encamina.Enmarcha.Core/MathUtils.cs create mode 100644 tst/Encamina.Enmarcha.Core.Tests/Encamina.Enmarcha.Core.Tests.csproj create mode 100644 tst/Encamina.Enmarcha.Core.Tests/MathUtilsTests.cs create mode 100644 tst/Encamina.Enmarcha.Core.Tests/Usings.cs diff --git a/Enmarcha.sln b/Enmarcha.sln index 89ef828..1e9db22 100644 --- a/Enmarcha.sln +++ b/Enmarcha.sln @@ -148,6 +148,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Encamina.Enmarcha.Data.Azur EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Encamina.Enmarcha.AspNet.OpenApi", "src\Encamina.Enmarcha.AspNet.OpenApi\Encamina.Enmarcha.AspNet.OpenApi.csproj", "{0EFAA5CF-7106-40E0-A427-1CFBFFAEA3EC}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Encamina.Enmarcha.Core.Tests", "tst\Encamina.Enmarcha.Core.Tests\Encamina.Enmarcha.Core.Tests.csproj", "{0516ADAE-C543-4B48-94EE-AC535DEFED0E}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -370,6 +372,10 @@ Global {0EFAA5CF-7106-40E0-A427-1CFBFFAEA3EC}.Debug|Any CPU.Build.0 = Debug|Any CPU {0EFAA5CF-7106-40E0-A427-1CFBFFAEA3EC}.Release|Any CPU.ActiveCfg = Release|Any CPU {0EFAA5CF-7106-40E0-A427-1CFBFFAEA3EC}.Release|Any CPU.Build.0 = Release|Any CPU + {0516ADAE-C543-4B48-94EE-AC535DEFED0E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0516ADAE-C543-4B48-94EE-AC535DEFED0E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0516ADAE-C543-4B48-94EE-AC535DEFED0E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {0516ADAE-C543-4B48-94EE-AC535DEFED0E}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -387,6 +393,7 @@ Global {AA1E5E93-FE02-4395-9260-C7C869F22785} = {43252034-27E2-4981-AC2D-EA986B287863} {7F3ECD81-28E6-4000-9005-1B2ABA8EC1C5} = {CBD50B5F-AFB8-4DA1-9FD7-17D98EB3ED78} {7B6F4DC4-74E2-4013-8DBA-12B7AAAD5278} = {CBD50B5F-AFB8-4DA1-9FD7-17D98EB3ED78} + {0516ADAE-C543-4B48-94EE-AC535DEFED0E} = {CBD50B5F-AFB8-4DA1-9FD7-17D98EB3ED78} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {F30DF47A-541C-4383-BCEB-E4108D06A70E} diff --git a/src/Encamina.Enmarcha.AI.Abstractions/BreakpointThresholdType.cs b/src/Encamina.Enmarcha.AI.Abstractions/BreakpointThresholdType.cs new file mode 100644 index 0000000..f2925e8 --- /dev/null +++ b/src/Encamina.Enmarcha.AI.Abstractions/BreakpointThresholdType.cs @@ -0,0 +1,22 @@ +namespace Encamina.Enmarcha.AI.Abstractions; + +/// +/// Type of thresholds used for breakpoints in . +/// +public enum BreakpointThresholdType +{ + /// + /// Threshold based on percentiles for breakpoints. + /// + Percentile, + + /// + /// Threshold based on standard deviations for breakpoints. + /// + StandardDeviation, + + /// + /// Threshold based on interquartile range for breakpoints. + /// + Interquartile, +} diff --git a/src/Encamina.Enmarcha.AI.Abstractions/Encamina.Enmarcha.AI.Abstractions.csproj b/src/Encamina.Enmarcha.AI.Abstractions/Encamina.Enmarcha.AI.Abstractions.csproj index 82fe126..2f303e9 100644 --- a/src/Encamina.Enmarcha.AI.Abstractions/Encamina.Enmarcha.AI.Abstractions.csproj +++ b/src/Encamina.Enmarcha.AI.Abstractions/Encamina.Enmarcha.AI.Abstractions.csproj @@ -11,6 +11,7 @@ + diff --git a/src/Encamina.Enmarcha.AI.Abstractions/ISemanticTextSplitter.cs b/src/Encamina.Enmarcha.AI.Abstractions/ISemanticTextSplitter.cs new file mode 100644 index 0000000..0593008 --- /dev/null +++ b/src/Encamina.Enmarcha.AI.Abstractions/ISemanticTextSplitter.cs @@ -0,0 +1,16 @@ +namespace Encamina.Enmarcha.AI.Abstractions; + +/// +/// Represents a semantic text splitter, which splits a text into semantic chunks based on embeddings. +/// +public interface ISemanticTextSplitter +{ + /// + /// Splits the input text based on semantic content. + /// + /// The input text to be split. + /// A function to generate embeddings for a list of strings. + /// The to monitor for cancellation requests. The default is . + /// A collection of text splits. + Task> SplitAsync(string text, Func, CancellationToken, Task>>> embeddingsGenerator, CancellationToken cancellationToken = default); +} diff --git a/src/Encamina.Enmarcha.AI.Abstractions/SemanticTextSplitterOptions.cs b/src/Encamina.Enmarcha.AI.Abstractions/SemanticTextSplitterOptions.cs new file mode 100644 index 0000000..c0d8e05 --- /dev/null +++ b/src/Encamina.Enmarcha.AI.Abstractions/SemanticTextSplitterOptions.cs @@ -0,0 +1,41 @@ +using System.ComponentModel.DataAnnotations; + +namespace Encamina.Enmarcha.AI.Abstractions; + +/// +/// Options for semantic text splitters. +/// +public class SemanticTextSplitterOptions +{ + /// + /// Gets size of the buffer used in semantic text splitting. It represents the number of sentences to include on each side of the current sentence within the buffer. + /// + [Required] + [Range(0, int.MaxValue)] + public int BufferSize { get; init; } = 1; + + /// + /// Gets type of threshold used for identifying breakpoints in the text. It can be based on percentiles, standard deviations, or interquartile range. + /// + [Required] + public BreakpointThresholdType BreakpointThresholdType { get; init; } = BreakpointThresholdType.Percentile; + + /// + /// Gets amount used in the threshold calculation for identifying breakpoints. The interpretation depends on the selected threshold type. + /// + /// + /// + /// + /// For BreakpointThresholdType.Percentile, a valid value is 95. + /// + /// + /// For BreakpointThresholdType.StandardDeviation, a valid value is 3. + /// + /// + /// For BreakpointThresholdType.Interquartile, a valid value is 1.5. + /// + /// + /// + [Required] + public float BreakpointThresholdAmount { get; init; } = 95; +} diff --git a/src/Encamina.Enmarcha.AI/TextSplitters/SemanticTextSplitter.cs b/src/Encamina.Enmarcha.AI/TextSplitters/SemanticTextSplitter.cs new file mode 100644 index 0000000..9272325 --- /dev/null +++ b/src/Encamina.Enmarcha.AI/TextSplitters/SemanticTextSplitter.cs @@ -0,0 +1,198 @@ +using System.Numerics.Tensors; +using System.Text; +using System.Text.RegularExpressions; + +using Encamina.Enmarcha.AI.Abstractions; +using Encamina.Enmarcha.Core; + +using Microsoft.Extensions.Options; + +namespace Encamina.Enmarcha.AI.TextSplitters; + +/// +/// Implementation of the interface that utilizes semantic analysis to split a given text into meaningful chunks. +/// It employs a combination of sentence embeddings and cosine similarity to identify breakpoints and create cohesive sentence groups. +/// +public class SemanticTextSplitter : ISemanticTextSplitter +{ + private static readonly Regex SentenceSplitRegex = new(@"(?<=[.?!])\s+", RegexOptions.Compiled, TimeSpan.FromSeconds(30)); + + private SemanticTextSplitterOptions options; + + /// + /// Initializes a new instance of the class. + /// + /// The options to use when configuring the semantic text splitter. + public SemanticTextSplitter(IOptionsMonitor options) + { + this.options = options.CurrentValue; + + options.OnChange(newOptions => this.options = newOptions); + } + + /// + public async Task> SplitAsync(string text, Func, CancellationToken, Task>>> embeddingsGenerator, CancellationToken cancellationToken = default) + { + // Code inspired by + // https://github.com/run-llama/llama_index/blob/8ed753df970f068f6afc8a83fd51a1f40880de9e/llama-index-packs/llama-index-packs-node-parser-semantic-chunking/llama_index/packs/node_parser_semantic_chunking/base.py + // https://github.com/langchain-ai/langchain/blob/ced5e7bae790cd9ec4e5374f5d070d9f23d6457b/libs/experimental/langchain_experimental/text_splitter.py + + // Splitting the text on '.', '?', and '!' + var sentences = SentenceSplitRegex.Split(text).Where(t => !string.IsNullOrEmpty(t)).ToList(); + if (sentences.Count == 1) + { + return sentences; + } + + // Combine sentences based on buffer size + var combinedSentences = CreateCombinedSentences(sentences, options.BufferSize); + + // Generate embeddings for combined sentences + var combinedSentencesEmbeddings = await embeddingsGenerator(combinedSentences, cancellationToken); + + // Calculate cosine distances between consecutive sentence embeddings + var distancesToNextSentence = CalculateDistancesToNextSentence(combinedSentencesEmbeddings); + + // Calculate threshold for identifying breakpoints + var breakpointDistanceThreshold = CalculateBreakpointThreshold(distancesToNextSentence, options.BreakpointThresholdType, options.BreakpointThresholdAmount); + + // Identify indexes above the threshold as breakpoints + var indexesAboveThreshold = distancesToNextSentence + .Select((distance, index) => new { Index = index, Distance = distance }) + .Where(item => item.Distance > breakpointDistanceThreshold) + .Select(item => item.Index) + .ToList(); + + // Slice sentences based on identified breakpoints + var chunks = SliceSentences(sentences, indexesAboveThreshold); + + return chunks; + } + + /// + /// Combines sentences based on a specified buffer size, creating cohesive groups for further analysis. + /// Each combined sentence is formed by including neighboring sentences within the specified buffer size before and after the current sentence. + /// + /// The list of sentences to be combined. + /// The number of sentences to include on each side of the current sentence within the buffer size. + /// A list of combined sentences. + private static List CreateCombinedSentences(IReadOnlyList sentences, int bufferSize) + { + var combinedSentences = new List(sentences.Count); + + // Iterate through each sentence in the input list to create combined sentences + for (var i = 0; i < sentences.Count; i++) + { + var combinedSentenceBuilder = new StringBuilder(); + + // Add sentences before the current one, based on the buffer size. + for (var j = i - bufferSize; j < i; j++) + { + if (j >= 0) + { + combinedSentenceBuilder.Append(sentences[j]).Append(' '); + } + } + + // Add the current sentence + combinedSentenceBuilder.Append(sentences[i]); + + // Add sentences after the current one, based on the buffer size + for (var j = i + 1; j < i + 1 + bufferSize; j++) + { + if (j < sentences.Count) + { + combinedSentenceBuilder.Append(' ').Append(sentences[j]); + } + } + + combinedSentences.Add(combinedSentenceBuilder.ToString()); + } + + return combinedSentences; + } + + /// + /// Calculates the cosine distances between consecutive sentence embeddings. + /// + /// The list of sentence embeddings to calculate distances. + /// A list of cosine distances between consecutive sentence embeddings. + private static List CalculateDistancesToNextSentence(IList> embeddings) + { + var distances = new List(embeddings.Count - 1); + + for (var i = 0; i < embeddings.Count - 1; i++) + { + var embeddingCurrent = embeddings[i]; + var embeddingNext = embeddings[i + 1]; + + // Calculate cosine similarity + var similarity = TensorPrimitives.CosineSimilarity(embeddingCurrent.Span, embeddingNext.Span); + + // Convert to cosine distance + var distance = 1 - similarity; + + distances.Add(distance); + } + + return distances; + } + + /// + /// Calculates the threshold for identifying breakpoints based on the specified percentile of sorted cosine distances. + /// + /// The list of cosine distances between sentence embeddings. + /// The type of threshold calculation to be applied. + /// The amount used in the threshold calculation. + /// The calculated threshold for identifying breakpoints. + private static double CalculateBreakpointThreshold(IList distances, BreakpointThresholdType breakpointThresholdType, float breakpointThresholdAmount) + { + switch (breakpointThresholdType) + { + case BreakpointThresholdType.Percentile: + return MathUtils.Percentile(distances, breakpointThresholdAmount); + case BreakpointThresholdType.StandardDeviation: + return (MathUtils.StandardDeviation(distances) * breakpointThresholdAmount) + distances.Average(); + case BreakpointThresholdType.Interquartile: + var iqr = MathUtils.InterquartileRange(distances); + return distances.Average() + (breakpointThresholdAmount * iqr); + default: + throw new ArgumentOutOfRangeException(nameof(breakpointThresholdType), breakpointThresholdType, null); + } + } + + /// + /// Slices the sentences based on the provided indexes, creating chunks of text between breakpoints. + /// + /// The list of sentences to be sliced. + /// The list of indexes indicating breakpoints in the sentences. + /// A list of sliced text chunks. + private static IEnumerable SliceSentences(IReadOnlyCollection sentences, List indexes) + { + var chunks = new List(); + var startIndex = 0; + + // Iterate through the breakpoints to slice the sentences + foreach (var index in indexes) + { + // Slice the sentences from the current start index to the end index + var group = sentences.Skip(startIndex).Take(index - startIndex + 1).ToList(); + + chunks.Add(string.Join(" ", group)); + + // Update the start index for the next group + startIndex = index + 1; + } + + // The last group, if any sentences remain + if (startIndex < sentences.Count) + { + // Get the remaining sentences after the last breakpoint + var remainingGroup = sentences.Skip(startIndex).ToList(); + + chunks.Add(string.Join(" ", remainingGroup)); + } + + return chunks; + } +} \ No newline at end of file diff --git a/src/Encamina.Enmarcha.Core/MathUtils.cs b/src/Encamina.Enmarcha.Core/MathUtils.cs new file mode 100644 index 0000000..9810769 --- /dev/null +++ b/src/Encamina.Enmarcha.Core/MathUtils.cs @@ -0,0 +1,82 @@ +namespace Encamina.Enmarcha.Core; + +/// +/// A utility class containing methods for mathematical operations. +/// +public static class MathUtils +{ + /// + /// Calculates the percentile value of a given sequence of doubles. + /// + /// The input sequence of double values. + /// The desired percentile value (between 0 and 100). + /// The calculated percentile value. + public static double Percentile(IEnumerable values, double percentile) + { + var sortedValues = values.OrderBy(x => x).ToList(); + var count = sortedValues.Count; + var realIndex = percentile / 100.0 * (count - 1); + + var index = (int)realIndex; + var fraction = realIndex - index; + + return ((1 - fraction) * sortedValues[index]) + (fraction * sortedValues[index + 1]); + } + + /// + /// Calculates the standard deviation of a given sequence of doubles. + /// + /// The input sequence of double values. + /// The calculated standard deviation. + public static double StandardDeviation(IEnumerable values) + { + var listValues = values.ToList(); + + var mean = listValues.Average(); + var variance = listValues.Select(val => Math.Pow(val - mean, 2)).Average(); + + return Math.Sqrt(variance); + } + + /// + /// Calculates the first and third quartiles of a given sequence of doubles. + /// + /// The input sequence of double values. + /// A tuple containing the first quartile (Q1) and third quartile (Q3). + public static (double Q1, double Q3) Quartiles(IEnumerable values) + { + var listValues = values.ToList(); + var count = listValues.Count; + + var q1 = CalculateMedian(listValues.Take(count / 2)); + var q3 = CalculateMedian(listValues.Skip((count + 1) / 2)); + + return (q1, q3); + } + + /// + /// Calculates the Interquartile Range (IQR) of a given sequence of doubles. + /// + /// The input sequence of double values. + /// The calculated Interquartile Range (IQR). + public static double InterquartileRange(IEnumerable values) + { + var (q1, q3) = Quartiles(values); + + return q3 - q1; + } + + /// + /// Calculates the median of a collection of double values. + /// + /// The collection of double values. + /// The median of the given collection. + public static double CalculateMedian(IEnumerable values) + { + var listValues = values.ToList(); + var count = listValues.Count; + var middle = count / 2; + + return count % 2 == 0 ? (listValues[middle - 1] + listValues[middle]) / 2.0 : listValues[middle]; + } +} diff --git a/tst/Encamina.Enmarcha.Core.Tests/Encamina.Enmarcha.Core.Tests.csproj b/tst/Encamina.Enmarcha.Core.Tests/Encamina.Enmarcha.Core.Tests.csproj new file mode 100644 index 0000000..ae8ee43 --- /dev/null +++ b/tst/Encamina.Enmarcha.Core.Tests/Encamina.Enmarcha.Core.Tests.csproj @@ -0,0 +1,11 @@ + + + + net8.0 + + + + + + + diff --git a/tst/Encamina.Enmarcha.Core.Tests/MathUtilsTests.cs b/tst/Encamina.Enmarcha.Core.Tests/MathUtilsTests.cs new file mode 100644 index 0000000..b7bbcd3 --- /dev/null +++ b/tst/Encamina.Enmarcha.Core.Tests/MathUtilsTests.cs @@ -0,0 +1,44 @@ +namespace Encamina.Enmarcha.Core.Tests; + +public class MathUtilsTests +{ + [Theory] + [InlineData(new double[] { 5, 5, 5, 5, 5 }, 50, 5)] + [InlineData(new double[] { 1, 2, 3, 4, 5 }, 50, 3)] + [InlineData(new double[] { 1, 2, 3, 4, 5, 6 }, 25, 2.25)] + [InlineData(new double[] { 1, 2, 3, 4, 5, 6 }, 75, 4.75)] + [InlineData(new double[] { 0.5, 1.5, 2.5, 3.5, 4.5, 5.5 }, 25, 1.75)] + public void CalculatesPercentile_Successfully(double[] values, double percentile, double expected) + { + var result = MathUtils.Percentile(values, percentile); + + Assert.Equal(expected, result, precision: 5); + } + + [Theory] + [InlineData(new double[] { 5, 5, 5, 5, 5 }, 0)] + [InlineData(new double[] { 1, 2, 3, 4, 5 }, 1.41421)] + [InlineData(new double[] { 1, 2, 3, 4, 5, 6 }, 1.70783)] + [InlineData(new double[] { 10, 20, 30, 40, 50 }, 14.14214)] + [InlineData(new double[] { 0.1, 0.2, 0.3, 0.4, 0.5 }, 0.14142)] + public void CalculatesStandardDeviation_Successfully(double[] values, double expected) + { + var result = MathUtils.StandardDeviation(values); + + Assert.Equal(expected, result, precision: 5); + } + + [Theory] + [InlineData(new double[] { 0, 5, 5, 5, 5 }, 2.5)] + [InlineData(new double[] { 5, 5, 5, 5, 5 }, 0)] + [InlineData(new double[] { 1, 2, 3, 4, 5 }, 3)] + [InlineData(new double[] { 1, 15, 19, 64 }, 33.5)] + [InlineData(new double[] { 10, 20, 30, 40, 50, 60 }, 30)] + [InlineData(new double[] { 0.1, 0.2, 0.3, 0.4, 0.5 }, 0.3)] + public void CalculatesInterquartileRange_Successfully(double[] values, double expected) + { + var result = MathUtils.InterquartileRange(values); + + Assert.Equal(expected, result, precision: 5); + } +} \ No newline at end of file diff --git a/tst/Encamina.Enmarcha.Core.Tests/Usings.cs b/tst/Encamina.Enmarcha.Core.Tests/Usings.cs new file mode 100644 index 0000000..8c927eb --- /dev/null +++ b/tst/Encamina.Enmarcha.Core.Tests/Usings.cs @@ -0,0 +1 @@ +global using Xunit; \ No newline at end of file From 7fd1d6b13115de1ec198ccb676158b7cdc230db6 Mon Sep 17 00:00:00 2001 From: Luis Marcos Rivera Date: Thu, 7 Mar 2024 16:26:42 +0100 Subject: [PATCH 2/4] Minor improvements. Added tests. --- .../Encamina.Enmarcha.AI.Abstractions.csproj | 1 - .../Encamina.Enmarcha.AI.csproj | 1 + .../IServiceCollectionExtensions.cs | 12 +++- .../TextSplitters/SemanticTextSplitter.cs | 4 +- .../SemanticTextSplitterTests.cs | 69 +++++++++++++++++++ .../MathUtilsTests.cs | 6 +- 6 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 tst/Encamina.Enmarcha.AI.Tests/SemanticTextSplitterTests.cs diff --git a/src/Encamina.Enmarcha.AI.Abstractions/Encamina.Enmarcha.AI.Abstractions.csproj b/src/Encamina.Enmarcha.AI.Abstractions/Encamina.Enmarcha.AI.Abstractions.csproj index 2f303e9..82fe126 100644 --- a/src/Encamina.Enmarcha.AI.Abstractions/Encamina.Enmarcha.AI.Abstractions.csproj +++ b/src/Encamina.Enmarcha.AI.Abstractions/Encamina.Enmarcha.AI.Abstractions.csproj @@ -11,7 +11,6 @@ - diff --git a/src/Encamina.Enmarcha.AI/Encamina.Enmarcha.AI.csproj b/src/Encamina.Enmarcha.AI/Encamina.Enmarcha.AI.csproj index 08ec9b8..a932565 100644 --- a/src/Encamina.Enmarcha.AI/Encamina.Enmarcha.AI.csproj +++ b/src/Encamina.Enmarcha.AI/Encamina.Enmarcha.AI.csproj @@ -19,6 +19,7 @@ + diff --git a/src/Encamina.Enmarcha.AI/Extensions/IServiceCollectionExtensions.cs b/src/Encamina.Enmarcha.AI/Extensions/IServiceCollectionExtensions.cs index 5c9f7d4..77a333d 100644 --- a/src/Encamina.Enmarcha.AI/Extensions/IServiceCollectionExtensions.cs +++ b/src/Encamina.Enmarcha.AI/Extensions/IServiceCollectionExtensions.cs @@ -12,7 +12,7 @@ namespace Microsoft.Extensions.DependencyInjection; public static class IServiceCollectionExtensions { /// - /// Adds a defult cognitive service provider to the as singleton. + /// Adds a default cognitive service provider to the as singleton. /// /// The to add services to. /// The so that additional calls can be chained. @@ -45,4 +45,14 @@ public static IServiceCollection AddRecursiveCharacterTextSplitter(this IService { return services.AddSingleton(); } + + /// + /// Adds a «Semantic Text Splitter» service as singleton instance of to the . + /// + /// The to add services to. + /// The so that additional calls can be chained. + public static IServiceCollection AddSemanticTextSplitter(this IServiceCollection services) + { + return services.AddSingleton(); + } } diff --git a/src/Encamina.Enmarcha.AI/TextSplitters/SemanticTextSplitter.cs b/src/Encamina.Enmarcha.AI/TextSplitters/SemanticTextSplitter.cs index 9272325..82f82cc 100644 --- a/src/Encamina.Enmarcha.AI/TextSplitters/SemanticTextSplitter.cs +++ b/src/Encamina.Enmarcha.AI/TextSplitters/SemanticTextSplitter.cs @@ -76,7 +76,7 @@ public async Task> SplitAsync(string text, FuncThe list of sentences to be combined. /// The number of sentences to include on each side of the current sentence within the buffer size. /// A list of combined sentences. - private static List CreateCombinedSentences(IReadOnlyList sentences, int bufferSize) + private static List CreateCombinedSentences(IList sentences, int bufferSize) { var combinedSentences = new List(sentences.Count); @@ -167,7 +167,7 @@ private static double CalculateBreakpointThreshold(IList distances, Brea /// The list of sentences to be sliced. /// The list of indexes indicating breakpoints in the sentences. /// A list of sliced text chunks. - private static IEnumerable SliceSentences(IReadOnlyCollection sentences, List indexes) + private static IEnumerable SliceSentences(IList sentences, List indexes) { var chunks = new List(); var startIndex = 0; diff --git a/tst/Encamina.Enmarcha.AI.Tests/SemanticTextSplitterTests.cs b/tst/Encamina.Enmarcha.AI.Tests/SemanticTextSplitterTests.cs new file mode 100644 index 0000000..629a1b5 --- /dev/null +++ b/tst/Encamina.Enmarcha.AI.Tests/SemanticTextSplitterTests.cs @@ -0,0 +1,69 @@ +using Encamina.Enmarcha.AI.Abstractions; +using Encamina.Enmarcha.AI.TextSplitters; +using Encamina.Enmarcha.Testing; + +using Moq; + +namespace Encamina.Enmarcha.AI.Tests; + +public sealed class SemanticTextSplitterTests +{ + private readonly Mock, CancellationToken, Task>>>> embeddingsGeneratorMock = new(MockBehavior.Strict); + + [Fact] + public async Task SplitText_Succeeds() + { + // Arrange... + const string text = "This is a text that has 5 sentences. This one here is the second. This is the third. Here we have the fourth! And finally, the last one"; + var semanticTextSplitterOptions = GivenASemanticTextSplitterOptions(); + var optionsMonitor = new TestOptionsMonitor(semanticTextSplitterOptions); + var semanticTextSplitter = new SemanticTextSplitter(optionsMonitor); + + embeddingsGeneratorMock + .Setup(generator => generator(It.Is>(data => data.SequenceEqual(new[] { "This is a text that has 5 sentences.", "This one here is the second.", "This is the third.", "Here we have the fourth!", "And finally, the last one" })), It.IsAny())) + .ReturnsAsync(new List>(5) + { + new([99999999.0f, 98100f]), // This represents that the first sentence is very different + new([2.0f, 1.0f]), + new([25.0f, 8.0f]), + new([3.0f, 1.0f]), + new([7.0f, 1.0f]), + new([10.0f, 4.0f]), + }); + + // Act... + var splits = (await semanticTextSplitter.SplitAsync(text, embeddingsGeneratorMock.Object)).ToList(); + + // Assert... + Assert.Equal(2, splits.Count); + Assert.Equal("This is a text that has 5 sentences.", splits[0]); + Assert.Equal("This one here is the second. This is the third. Here we have the fourth! And finally, the last one", splits[1]); + } + + [Fact] + public async Task SplitText_With_SingleSentence_Returns_OriginalText() + { + // Arrange... + const string singleSentence = "This is a single sentence."; + var semanticTextSplitterOptions = GivenASemanticTextSplitterOptions(); + var optionsMonitor = new TestOptionsMonitor(semanticTextSplitterOptions); + var semanticTextSplitter = new SemanticTextSplitter(optionsMonitor); + + // Act... + var splits = (await semanticTextSplitter.SplitAsync(singleSentence, embeddingsGeneratorMock.Object)).ToList(); + + // Assert... + Assert.Single(splits); + Assert.Equal(singleSentence, splits[0]); + } + + private static SemanticTextSplitterOptions GivenASemanticTextSplitterOptions() + { + return new SemanticTextSplitterOptions() + { + BufferSize = 0, + BreakpointThresholdAmount = 95, + BreakpointThresholdType = BreakpointThresholdType.Percentile, + }; + } +} \ No newline at end of file diff --git a/tst/Encamina.Enmarcha.Core.Tests/MathUtilsTests.cs b/tst/Encamina.Enmarcha.Core.Tests/MathUtilsTests.cs index b7bbcd3..841652e 100644 --- a/tst/Encamina.Enmarcha.Core.Tests/MathUtilsTests.cs +++ b/tst/Encamina.Enmarcha.Core.Tests/MathUtilsTests.cs @@ -8,7 +8,7 @@ public class MathUtilsTests [InlineData(new double[] { 1, 2, 3, 4, 5, 6 }, 25, 2.25)] [InlineData(new double[] { 1, 2, 3, 4, 5, 6 }, 75, 4.75)] [InlineData(new double[] { 0.5, 1.5, 2.5, 3.5, 4.5, 5.5 }, 25, 1.75)] - public void CalculatesPercentile_Successfully(double[] values, double percentile, double expected) + public void Calculate_Percentile_Succeeds(double[] values, double percentile, double expected) { var result = MathUtils.Percentile(values, percentile); @@ -21,7 +21,7 @@ public void CalculatesPercentile_Successfully(double[] values, double percentile [InlineData(new double[] { 1, 2, 3, 4, 5, 6 }, 1.70783)] [InlineData(new double[] { 10, 20, 30, 40, 50 }, 14.14214)] [InlineData(new double[] { 0.1, 0.2, 0.3, 0.4, 0.5 }, 0.14142)] - public void CalculatesStandardDeviation_Successfully(double[] values, double expected) + public void Calculate_StandardDeviation_Succeeds(double[] values, double expected) { var result = MathUtils.StandardDeviation(values); @@ -35,7 +35,7 @@ public void CalculatesStandardDeviation_Successfully(double[] values, double exp [InlineData(new double[] { 1, 15, 19, 64 }, 33.5)] [InlineData(new double[] { 10, 20, 30, 40, 50, 60 }, 30)] [InlineData(new double[] { 0.1, 0.2, 0.3, 0.4, 0.5 }, 0.3)] - public void CalculatesInterquartileRange_Successfully(double[] values, double expected) + public void Calculate_InterquartileRange_Succeeds(double[] values, double expected) { var result = MathUtils.InterquartileRange(values); From 4c2ab1dc5144040614322713f315d9d1ef1959d3 Mon Sep 17 00:00:00 2001 From: Luis Marcos Rivera Date: Thu, 7 Mar 2024 16:35:18 +0100 Subject: [PATCH 3/4] Updated version. Updated CHANGELOG.md --- CHANGELOG.md | 2 ++ Directory.Build.props | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11e1c31..0d336cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,8 @@ Previous classification is not required if changes are simple or all belong to t - Updated `xunit.analyzers` from `1.10.0` to `1.11.0`. - Updated `xunit.extensibility.core` from `2.6.6` to `2.7.0`. - Updated `xunit.runner.visualstudio` from `2.5.6` to `2.5.7`. +- Added new interface `Encamina.Enmarcha.AI.Abstractions.ISemanticTextSplitter` and its implementations `Encamina.Enmarcha.AI.SemanticTextSplitter` to split a text into meaningful chunks based on embeddings. +- Added a new utility class for mathematical operations `Encamina.Enmarcha.Core.MathUtils`. ### Minor Changes diff --git a/Directory.Build.props b/Directory.Build.props index 7f025a4..fc7fbc0 100644 --- a/Directory.Build.props +++ b/Directory.Build.props @@ -17,7 +17,7 @@ 8.1.5 - preview-04 + preview-05