Merge pull request #129 from Encamina/@mramos/add-csv-document-connector

v8.1.8 preview-01: Implement new document connector interface and add Image, CSV and TSV document connectors
Encamina · Aug 12, 2024 · 23bbce2 · 23bbce2
2 parents 4c7ef49 + 455931c
commit 23bbce2
Show file tree

Hide file tree

Showing 22 changed files with 506 additions and 94 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,27 @@ Also, any bug fix must start with the prefix "Bug fix:" followed by the desc
 
 Previous classification is not required if changes are simple or all belong to the same category.
 
+## [8.1.8]
+
+### Breaking Changes 
+- `DocumentConnectorProviderBase` no longer automatically registers document connectors. Instead, it registers only the connectors that are available in the dependency container.
+  This means that document connectors must now be registered manually in the dependency container. For this purpose, new extension methods have been added to `IServiceCollection` that allow registering document connectors in the dependency container.
+  In addition, the `AddDefaultDocumentConnectors` method has been added to `IServiceCollectionExtensions` to register the document connectors that were previously registered by default.
+
+### Major Changes
+- Added the `IEnmarchaDocumentConnector` interface that extends the existing `IDocumentConnector`. For now, this interface adds a `CompatibleFileFormats` property that returns the file formats supported by the connector. Existing document connectors have been updated to implement this interface.
+- Added the `CsvTsvDocumentConnector` document connector, which allows reading CSV and TSV files while repeating the headers in each chunk.
+- Added `SkVisionImageDocumentConnector`, which allows reading images and extracting text from them using Semantic Kernel vision capabilities.
+- The `IDocumentConnectorProvider` interface now works with the `IEnmarchaDocumentConnector` interface instead of `IDocumentConnector`.
+    - The `AddDocumentConnector` function has been modified by removing the `fileExtension` parameter, which will now come in the `CompatibleFileFormats` property of the document connector.
+- The `ParagraphPptxDocumentConnector` class is no longer sealed, allowing the creation of derived classes.
+- The `SlidePptxDocumentConnector` class is no longer sealed, allowing the creation of derived classes.
+- The `TxtDocumentConnector` class is no longer sealed, allowing the creation of derived classes.
+- The `VttDocumentConnector` class is no longer sealed, allowing the creation of derived classes.
+- Updated dependencies:
+    - Updated `MimeKit` from `4.5.0` to `4.7.1` in `Encamina.Enmarcha.Testing.Smtp`.
+    - Updated `System.Text.Json` from `8.0.3` to `8.0.4`.
+
 ## [8.1.7]
 
 ### Major Changes

diff --git a/Directory.Build.props b/Directory.Build.props
@@ -16,8 +16,8 @@
   </PropertyGroup>
 
   <PropertyGroup>
-    <VersionPrefix>8.1.7</VersionPrefix>
-    <VersionSuffix></VersionSuffix>
+    <VersionPrefix>8.1.8</VersionPrefix>
+    <VersionSuffix>preview-01</VersionSuffix>
   </PropertyGroup>
 
   <!--

diff --git a/...amina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/BasePptxDocumentConnector.cs b/...amina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/BasePptxDocumentConnector.cs
@@ -6,15 +6,16 @@
 
 using DocumentFormat.OpenXml.Packaging;
 
-using Microsoft.SemanticKernel.Plugins.Document;
-
 namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
 
 /// <summary>
 /// Base abstract class that defines a Microsoft PowerPoint (<c>.pptx</c>) document connector.
 /// </summary>
-public abstract class BasePptxDocumentConnector : IDocumentConnector
+public abstract class BasePptxDocumentConnector : IEnmarchaDocumentConnector
 {
+    /// <inheritdoc/>
+    public IReadOnlyList<string> CompatibleFileFormats => [".PPTX"];
+
     /// <inheritdoc/>
     public string ReadText(Stream stream)
     {

diff --git a/...ncamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/CsvTsvDocumentConnector.cs b/...ncamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/CsvTsvDocumentConnector.cs
@@ -0,0 +1,112 @@
+using System.Text;
+
+using CommunityToolkit.Diagnostics;
+
+using Encamina.Enmarcha.AI.Abstractions;
+
+using Microsoft.Extensions.Options;
+
+namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
+
+/// <summary>
+/// Extracts the text from CSV and TSV files, taking into account the size of the chunks and duplicating the headers for each one.
+/// This avoids losing the context given by the headers when the content is chunked.
+/// </summary>
+public class CsvTsvDocumentConnector : IEnmarchaDocumentConnector
+{
+    // Characters that can start a line terminator ("\r\n", "\n" or "\r").
+    private static readonly char[] LineBreakCharacters = ['\r', '\n'];
+
+    private readonly ITextSplitter textSplitter;
+    private readonly Func<string, int> lengthFunction;
+    private TextSplitterOptions textSplitterOptions;
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="CsvTsvDocumentConnector"/> class.
+    /// </summary>
+    /// <param name="textSplitter">A valid instance of <see cref="ITextSplitter"/> to use when extracting content from documents.</param>
+    /// <param name="lengthFunction">Function used to measure the length of the headers, and handed to the text splitter to measure the content.</param>
+    /// <param name="textSplitterOptions">Options for the text splitter. Changes notified by the monitor replace the options used by later reads.</param>
+    public CsvTsvDocumentConnector(ITextSplitter textSplitter, Func<string, int> lengthFunction, IOptionsMonitor<TextSplitterOptions> textSplitterOptions)
+    {
+        this.textSplitter = textSplitter;
+        this.lengthFunction = lengthFunction;
+
+        this.textSplitterOptions = textSplitterOptions.CurrentValue;
+        textSplitterOptions.OnChange(newOptions => this.textSplitterOptions = newOptions);
+    }
+
+    /// <inheritdoc/>
+    public IReadOnlyList<string> CompatibleFileFormats => [".CSV", ".TSV"];
+
+    /// <summary>
+    /// Gets the encoding used for reading the text from the stream.
+    /// </summary>
+    protected virtual Encoding Encoding => Encoding.UTF8;
+
+    /// <summary>
+    /// Reads the whole stream and returns its text split in chunks, each one preceded by the header (first) line.
+    /// </summary>
+    /// <param name="stream">The stream with the CSV or TSV content. It is disposed together with the reader created over it.</param>
+    /// <returns>
+    /// The trimmed text as-is when it has a single line; otherwise the chunks of the content, each one preceded by the
+    /// headers and separated from the next one by a blank line.
+    /// </returns>
+    public string ReadText(Stream stream)
+    {
+        Guard.IsNotNull(stream);
+
+        using var streamReader = new StreamReader(stream, Encoding);
+        var allText = streamReader.ReadToEnd().Trim();
+
+        var firstEndOfLineIndex = GetFirstEndOfLineIndex(allText);
+
+        if (firstEndOfLineIndex == -1)
+        {
+            return allText; // There is just one line. Nothing to do.
+        }
+
+        // Skip the complete line terminator so a "\r\n" does not leave a stray '\n' at the start of the content.
+        var contentStartIndex = firstEndOfLineIndex + GetEndOfLineLength(allText, firstEndOfLineIndex);
+
+        var headers = allText[..firstEndOfLineIndex];
+        var content = allText[contentStartIndex..];
+        var headersLength = lengthFunction(headers);
+
+        // Take a single snapshot of the options: the change callback may replace the field at any time.
+        var currentOptions = textSplitterOptions;
+
+        // Split the content into chunks, leaving room to duplicate the headers on each one.
+        // NOTE(review): when the headers are as long as (or longer than) the chunk size, the adjusted size is zero or negative;
+        // how the text splitter reacts to that is not visible from here — confirm.
+        var adjustedTextSplitterOptions = new TextSplitterOptions()
+        {
+            ChunkOverlap = currentOptions.ChunkOverlap,
+            ChunkSize = currentOptions.ChunkSize - headersLength,
+            Separators = currentOptions.Separators,
+        };
+        var contentChunks = textSplitter.Split(content, lengthFunction, adjustedTextSplitterOptions);
+
+        // Rebuild the text, duplicating the headers for each chunk.
+        var sbResult = new StringBuilder();
+        foreach (var contentChunk in contentChunks)
+        {
+            sbResult.AppendLine(headers);
+            sbResult.AppendLine(contentChunk);
+
+            sbResult.AppendLine(); // Add a blank line between chunks.
+        }
+
+        return sbResult.ToString().Trim();
+    }
+
+    /// <inheritdoc/>
+    public virtual void Initialize(Stream stream)
+    {
+        // Intentionally not implemented to comply with the Liskov Substitution Principle...
+    }
+
+    /// <inheritdoc/>
+    public virtual void AppendText(Stream stream, string text)
+    {
+        // Intentionally not implemented to comply with the Liskov Substitution Principle...
+    }
+
+    /// <summary>
+    /// Gets the index of the first line break character (<c>\r</c> or <c>\n</c>) found in the text.
+    /// </summary>
+    /// <param name="text">The text to inspect.</param>
+    /// <returns>The index of the first line break character, or <c>-1</c> if the text has a single line.</returns>
+    private static int GetFirstEndOfLineIndex(string text)
+    {
+        // Look for the earliest terminator of any kind, so files with mixed line endings still yield only the first line
+        // as headers. The char-based search is ordinal, so it does not depend on the current culture.
+        return text.IndexOfAny(LineBreakCharacters);
+    }
+
+    /// <summary>
+    /// Gets the length of the line terminator that starts at the given index: two for <c>\r\n</c>, one otherwise.
+    /// </summary>
+    /// <param name="text">The text that contains the line terminator.</param>
+    /// <param name="endOfLineIndex">The index of the first character of the line terminator.</param>
+    /// <returns>The number of characters of the line terminator.</returns>
+    private static int GetEndOfLineLength(string text, int endOfLineIndex)
+    {
+        var nextIndex = endOfLineIndex + 1;
+        var isCarriageReturnLineFeed = text[endOfLineIndex] == '\r' && nextIndex < text.Length && text[nextIndex] == '\n';
+
+        return isCarriageReturnLineFeed ? 2 : 1;
+    }
+}
diff --git a/...nmarcha.SemanticKernel.Connectors.Document/Connectors/ExcelToMarkdownDocumentConnector.cs b/...nmarcha.SemanticKernel.Connectors.Document/Connectors/ExcelToMarkdownDocumentConnector.cs
@@ -2,15 +2,16 @@
 
 using Encamina.Enmarcha.SemanticKernel.Connectors.Document.Models.Excel;
 
-using Microsoft.SemanticKernel.Plugins.Document;
-
 namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
 
 /// <summary>
 /// Extracts text from an Excel file (<c>.xlsx</c>) and exports to Markdown table format.
 /// </summary>
-public class ExcelToMarkdownDocumentConnector : IDocumentConnector
+public class ExcelToMarkdownDocumentConnector : IEnmarchaDocumentConnector
 {
+    /// <inheritdoc/>
+    public IReadOnlyList<string> CompatibleFileFormats => [".XLSX"];
+
     /// <summary>
     /// Gets the options for loading the Excel document.
     /// </summary>

diff --git a/....Enmarcha.SemanticKernel.Connectors.Document/Connectors/ParagraphPptxDocumentConnector.cs b/....Enmarcha.SemanticKernel.Connectors.Document/Connectors/ParagraphPptxDocumentConnector.cs
@@ -11,7 +11,7 @@ namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
 /// <summary>
 /// Extracts the text from a Microsoft PowerPoint (<c>.pptx</c>) file, one line per paragraph found in each slide.
 /// </summary>
-public sealed class ParagraphPptxDocumentConnector : BasePptxDocumentConnector
+public class ParagraphPptxDocumentConnector : BasePptxDocumentConnector
 {
     /// <inheritdoc/>
     protected override IEnumerable<string> GetAllTextInSlide(SlidePart slidePart)

diff --git a/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/PdfDocumentConnector.cs b/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/PdfDocumentConnector.cs
@@ -2,8 +2,6 @@
 
 using CommunityToolkit.Diagnostics;
 
-using Microsoft.SemanticKernel.Plugins.Document;
-
 using UglyToad.PdfPig;
 
 namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
@@ -17,8 +15,11 @@ namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
 /// each page generates text as if it were a single paragraph, and line breaks are added between different pages for separation.
 /// For strict format text extraction, consider using <see cref="StrictFormatCleanPdfDocumentConnector"/>.
 /// </remarks>
-public class PdfDocumentConnector : IDocumentConnector
+public class PdfDocumentConnector : IEnmarchaDocumentConnector
 {
+    /// <inheritdoc/>
+    public IReadOnlyList<string> CompatibleFileFormats => [".PDF"];
+
     /// <inheritdoc/>
     public virtual string ReadText(Stream stream)
     {

diff --git a/....Enmarcha.SemanticKernel.Connectors.Document/Connectors/SkVisionImageDocumentConnector.cs b/....Enmarcha.SemanticKernel.Connectors.Document/Connectors/SkVisionImageDocumentConnector.cs
@@ -0,0 +1,98 @@
+using CommunityToolkit.Diagnostics;
+
+using Encamina.Enmarcha.SemanticKernel.Connectors.Document.Utils;
+
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.ChatCompletion;
+
+namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
+
+/// <summary>
+/// Document connector for images. It sends the image to the chat completion service obtained from Semantic Kernel, together with
+/// a system prompt that asks for a Markdown transcription (OCR) of the text, diagrams and unstructured information found in it.
+/// </summary>
+public class SkVisionImageDocumentConnector : IEnmarchaDocumentConnector
+{
+    // Instructions sent as the system message of every request made by this connector.
+    private const string SystemPrompt = """
+        You are an expert OCR (Optical Character Recognition) Specialist
+
+        You will receive a PDF page containing a mix of text, diagrams, images, tables, and possibly unstructured data.
+
+        Your task is to generate a complete transcription of the PDF page in Markdown format, capturing all content in detail.
+
+        Guidelines:
+
+        - Ensure complete accuracy in transcription; no information should be lost or omitted.
+        - Efficiently represent images, diagrams, and unstructured data for later processing by a Language Model.
+        - Don't generate any "![]()" links for photos.
+        - Extract the data from the graphs as a table in markdown.
+        - Approach this task methodically, thinking through each step carefully.
+        - This task is crucial for my career success; I rely on your expertise and precision.
+        - Ignore any icon image.
+        - Never add the following texts:
+            - ```markdown
+            - ```
+        - Specific Instructions for Formatting:
+
+        1. Represent diagrams or schemes using a dedicated section with discrete data in a table in Markdown format, formatted as follows:
+        [IMAGE]
+
+        2. Represent images or photos using a dedicated Markdown section with a full description of what you see, formatted as follows:
+        [PHOTO]
+
+        3. Restrain from adding any additional information or commentary. If the page is empty do not transcribe anything and just return an empty string.
+
+        4. Transcribe only in Markdown format.
+
+        Importance: The fidelity of this transcription is critical. It is essential that the content from the PDF is transcribed exactly as it appears, with no summarization or imprecise descriptions. Accuracy in representing the mixture of text, visual elements, and data is paramount for the success of my project.
+        
+        """;
+
+    private readonly IChatCompletionService chatCompletionService;
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="SkVisionImageDocumentConnector"/> class.
+    /// </summary>
+    /// <param name="kernel">A valid <see cref="Kernel"/> instance, from which the required <see cref="IChatCompletionService"/> is resolved.</param>
+    public SkVisionImageDocumentConnector(Kernel kernel)
+        => chatCompletionService = kernel.GetRequiredService<IChatCompletionService>();
+
+    /// <inheritdoc/>
+    public IReadOnlyList<string> CompatibleFileFormats => [".JPEG", ".JPG", ".PNG"];
+
+    /// <summary>
+    /// Sends the image in the stream to the chat completion service and returns the content of its answer.
+    /// </summary>
+    /// <param name="stream">The stream with the image. Its position is set back to the start before the image is read.</param>
+    /// <returns>The content of the chat completion answer, or an empty string when there is no answer or it has no content.</returns>
+    public virtual string ReadText(Stream stream)
+    {
+        Guard.IsNotNull(stream);
+
+        // Resolve the MIME type first, then rewind so the image is read from its first byte
+        // (presumably the MIME type detection advances the stream — confirm against ImageHelper).
+        var imageMimeType = ImageHelper.GetMimeType(stream);
+        stream.Position = 0;
+
+        var imageContent = new ImageContent(BinaryData.FromStream(stream), imageMimeType);
+
+        var chatHistory = new ChatHistory(SystemPrompt);
+        chatHistory.AddUserMessage(new ChatMessageContentItemCollection { imageContent });
+
+        // TODO: We can improve that making an async version of IEnmarchaDocumentConnector.
+        var completion = chatCompletionService.GetChatMessageContentAsync(chatHistory).GetAwaiter().GetResult();
+
+        if (completion?.Content is { } transcription)
+        {
+            return transcription;
+        }
+
+        return string.Empty;
+    }
+
+    /// <inheritdoc/>
+    public virtual void Initialize(Stream stream)
+    {
+        // Intentionally not implemented to comply with the Liskov Substitution Principle...
+    }
+
+    /// <inheritdoc/>
+    public virtual void AppendText(Stream stream, string text)
+    {
+        // Intentionally not implemented to comply with the Liskov Substitution Principle...
+    }
+}
diff --git a/...mina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/SlidePptxDocumentConnector.cs b/...mina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/SlidePptxDocumentConnector.cs
@@ -11,7 +11,7 @@ namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
 /// <summary>
 /// Extracts the text from a Microsoft PowerPoint (<c>.pptx</c>) file, just one line for each slide found.
 /// </summary>
-public sealed class SlidePptxDocumentConnector : BasePptxDocumentConnector
+public class SlidePptxDocumentConnector : BasePptxDocumentConnector
 {
     /// <inheritdoc/>
     protected override IEnumerable<string> GetAllTextInSlide(SlidePart slidePart)

diff --git a/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/TxtDocumentConnector.cs b/src/Encamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/TxtDocumentConnector.cs
@@ -4,32 +4,27 @@
 
 using CommunityToolkit.Diagnostics;
 
-using Microsoft.SemanticKernel.Plugins.Document;
-
 namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors;
 
 /// <summary>
 /// Extract text from a text (<c>.txt</c>) file.
 /// </summary>
-public sealed class TxtDocumentConnector : IDocumentConnector
+public class TxtDocumentConnector : IEnmarchaDocumentConnector
 {
-    private readonly Encoding encoding;
+    /// <inheritdoc/>
+    public IReadOnlyList<string> CompatibleFileFormats => [".TXT", ".MD"];
 
     /// <summary>
-    /// Initializes a new instance of the <see cref="TxtDocumentConnector"/> class.
+    /// Gets the encoding used for reading the text from the stream.
     /// </summary>
-    /// <param name="encoding">The encoding to use when reading the text file.</param>
-    public TxtDocumentConnector(Encoding encoding)
-    {
-        this.encoding = encoding;
-    }
+    protected virtual Encoding Encoding => Encoding.UTF8;
 
     /// <inheritdoc/>
     public string ReadText(Stream stream)
     {
         Guard.IsNotNull(stream);
 
-        using var reader = new StreamReader(stream, encoding);
+        using var reader = new StreamReader(stream, Encoding);
         return reader.ReadToEnd();
     }