-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #129 from Encamina/@mramos/add-csv-document-connector
v8.1.8 preview-01: Implement new document connector interface and add Image, CSV and TSV document connectors
- Loading branch information
Showing
22 changed files
with
506 additions
and
94 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
112 changes: 112 additions & 0 deletions
112
...ncamina.Enmarcha.SemanticKernel.Connectors.Document/Connectors/CsvTsvDocumentConnector.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
using System.Text; | ||
|
||
using CommunityToolkit.Diagnostics; | ||
|
||
using Encamina.Enmarcha.AI.Abstractions; | ||
|
||
using Microsoft.Extensions.Options; | ||
|
||
namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors; | ||
|
||
/// <summary>
/// Extracts the text from CSV and TSV files, taking into account the size of the chunks, duplicating the headers for each one. In this way we avoid the loss of context when chunking.
/// </summary>
public class CsvTsvDocumentConnector : IEnmarchaDocumentConnector
{
    // Characters that can start a line break ("\r\n", "\n" or "\r").
    private static readonly char[] LineBreakCharacters = ['\r', '\n'];

    private readonly ITextSplitter textSplitter;
    private readonly Func<string, int> lengthFunction;

    // Replaced as a whole by the options-monitor callback registered in the constructor.
    private TextSplitterOptions textSplitterOptions;

    /// <summary>
    /// Initializes a new instance of the <see cref="CsvTsvDocumentConnector"/> class.
    /// </summary>
    /// <param name="textSplitter">A valid instance of <see cref="ITextSplitter"/> to use when extracting content from documents.</param>
    /// <param name="lengthFunction">Length function to use when extracting content from documents.</param>
    /// <param name="textSplitterOptions">Options for the text splitter.</param>
    /// <exception cref="ArgumentNullException">Thrown when any of the arguments is <see langword="null"/>.</exception>
    public CsvTsvDocumentConnector(ITextSplitter textSplitter, Func<string, int> lengthFunction, IOptionsMonitor<TextSplitterOptions> textSplitterOptions)
    {
        Guard.IsNotNull(textSplitter);
        Guard.IsNotNull(lengthFunction);
        Guard.IsNotNull(textSplitterOptions);

        this.textSplitter = textSplitter;
        this.lengthFunction = lengthFunction;

        this.textSplitterOptions = textSplitterOptions.CurrentValue;

        // The registration returned by OnChange is not kept, so this class never unregisters the callback.
        textSplitterOptions.OnChange(newOptions => this.textSplitterOptions = newOptions);
    }

    /// <inheritdoc/>
    public IReadOnlyList<string> CompatibleFileFormats => [".CSV", ".TSV"];

    /// <summary>
    /// Gets the encoding used for reading the text from the stream.
    /// </summary>
    protected virtual Encoding Encoding => Encoding.UTF8;

    /// <summary>
    /// Reads the whole stream, treats its first line as the header row, splits the remaining content into chunks
    /// and returns all the chunks concatenated, each one preceded by a copy of the header row and separated by a blank line.
    /// </summary>
    /// <param name="stream">The stream with the CSV or TSV content. It is read to its end using <see cref="Encoding"/>.</param>
    /// <returns>
    /// The trimmed text when it has a single line; otherwise, the rebuilt text with the headers duplicated for each chunk.
    /// </returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="stream"/> is <see langword="null"/>.</exception>
    public string ReadText(Stream stream)
    {
        Guard.IsNotNull(stream);

        using var streamReader = new StreamReader(stream, Encoding);
        var allText = streamReader.ReadToEnd().Trim();

        var firstEndOfLineIndex = GetFirstEndOfLineIndex(allText);

        if (firstEndOfLineIndex == -1)
        {
            return allText; // There is just one line. Nothing to do.
        }

        // Skip the complete line break (two characters for "\r\n") so the content does not start with a stray '\n'.
        var contentStartIndex = firstEndOfLineIndex + GetLineBreakLength(allText, firstEndOfLineIndex);

        var headers = allText[..firstEndOfLineIndex];
        var content = allText[contentStartIndex..];
        var headersLength = lengthFunction(headers);

        // Take a single snapshot of the options: the field may be replaced at any time by the options monitor,
        // and the adjusted options must not mix values coming from two different instances.
        var currentOptions = textSplitterOptions;

        // Split the content into chunks. Leaving room to duplicate the header on each one.
        // NOTE(review): if the headers are as long as (or longer than) the configured chunk size, the adjusted
        // chunk size is zero or negative; how the splitter handles that is not visible from here — confirm.
        var adjustedTextSplitterOptions = new TextSplitterOptions()
        {
            ChunkOverlap = currentOptions.ChunkOverlap,
            ChunkSize = currentOptions.ChunkSize - headersLength,
            Separators = currentOptions.Separators,
        };
        var contentChunks = textSplitter.Split(content, lengthFunction, adjustedTextSplitterOptions);

        // Rebuild the text, duplicating the headers for each chunk.
        var sbResult = new StringBuilder();
        foreach (var contentChunk in contentChunks)
        {
            sbResult.AppendLine(headers);
            sbResult.AppendLine(contentChunk);

            sbResult.AppendLine(); // Add a blank line between records.
        }

        return sbResult.ToString().Trim();
    }

    /// <inheritdoc/>
    public virtual void Initialize(Stream stream)
    {
        // Intentionally not implemented to comply with the Liskov Substitution Principle...
    }

    /// <inheritdoc/>
    public virtual void AppendText(Stream stream, string text)
    {
        // Intentionally not implemented to comply with the Liskov Substitution Principle...
    }

    /// <summary>
    /// Finds the position where the first line of the given text ends.
    /// </summary>
    /// <param name="text">The text to inspect.</param>
    /// <returns>
    /// The index of the first carriage return or line feed character, or <c>-1</c> if the text has a single line.
    /// </returns>
    private static int GetFirstEndOfLineIndex(string text)
    {
        // Look for the earliest line break of any kind (ordinal, character based). Searching for "\r\n" first
        // would skip over earlier lone '\n' or '\r' characters in texts with mixed line endings.
        return text.IndexOfAny(LineBreakCharacters);
    }

    /// <summary>
    /// Gets the number of characters of the line break that starts at the given index.
    /// </summary>
    /// <param name="text">The text that contains the line break.</param>
    /// <param name="lineBreakIndex">The index of the first character of the line break.</param>
    /// <returns><c>2</c> for a "\r\n" sequence; otherwise, <c>1</c>.</returns>
    private static int GetLineBreakLength(string text, int lineBreakIndex)
    {
        var nextIndex = lineBreakIndex + 1;
        var isCarriageReturnLineFeed = text[lineBreakIndex] == '\r' && nextIndex < text.Length && text[nextIndex] == '\n';

        return isCarriageReturnLineFeed ? 2 : 1;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
98 changes: 98 additions & 0 deletions
98
....Enmarcha.SemanticKernel.Connectors.Document/Connectors/SkVisionImageDocumentConnector.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
using CommunityToolkit.Diagnostics; | ||
|
||
using Encamina.Enmarcha.SemanticKernel.Connectors.Document.Utils; | ||
|
||
using Microsoft.SemanticKernel; | ||
using Microsoft.SemanticKernel.ChatCompletion; | ||
|
||
namespace Encamina.Enmarcha.SemanticKernel.Connectors.Document.Connectors; | ||
|
||
/// <summary>
/// Extracts text (OCR) and interprets information from images, diagrams and unstructured information. Uses Semantic Kernel.
/// </summary>
public class SkVisionImageDocumentConnector : IEnmarchaDocumentConnector
{
    // System message sent with every image. It is part of the runtime behavior: do not reword it casually.
    private const string SystemPrompt = """
        You are an expert OCR (Optical Character Recognition) Specialist
        You will receive a PDF page containing a mix of text, diagrams, images, tables, and possibly unstructured data.
        Your task is to generate a complete transcription of the PDF page in Markdown format, capturing all content in detail.
        Guidelines:
        - Ensure complete accuracy in transcription; no information should be lost or omitted.
        - Efficiently represent images, diagrams, and unstructured data for later processing by a Language Model.
        - Don't generate any "![]()" links for photos.
        - Extract the data from the graphs as a table in markdown.
        - Approach this task methodically, thinking through each step carefully.
        - This task is crucial for my career success; I rely on your expertise and precision.
        - Ignore any icon image.
        - Never add the following texts:
        - ```markdown
        - ```
        - Specific Instructions for Formatting:
        1. Represent diagrams or schemes using a dedicated section with discrete data in a table in Markdown format, formatted as follows:
        [IMAGE]
        2. Represent images or photos using a dedicated Markdown section with a full description of what you see, formatted as follows:
        [PHOTO]
        3. Restrain from adding any additional information or commentary. If the page is empty do not transcribe anything and just return an empty string.
        4. Transcribe only in Markdown format.
        Importance: The fidelity of this transcription is critical. It is essential that the content from the PDF is transcribed exactly as it appears, with no summarization or imprecise descriptions. Accuracy in representing the mixture of text, visual elements, and data is paramount for the success of my project.
        """;

    private readonly IChatCompletionService chatCompletionService;

    /// <summary>
    /// Initializes a new instance of the <see cref="SkVisionImageDocumentConnector"/> class.
    /// </summary>
    /// <param name="kernel">A valid <see cref="Kernel"/> instance.</param>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="kernel"/> is <see langword="null"/>.</exception>
    public SkVisionImageDocumentConnector(Kernel kernel)
    {
        Guard.IsNotNull(kernel);

        chatCompletionService = kernel.GetRequiredService<IChatCompletionService>();
    }

    /// <inheritdoc/>
    public IReadOnlyList<string> CompatibleFileFormats => [".JPEG", ".JPG", ".PNG"];

    /// <summary>
    /// Sends the image contained in the stream to the chat completion service, together with the system prompt,
    /// and returns the content of the response.
    /// </summary>
    /// <param name="stream">The stream with the image. Non-seekable streams are copied into memory first.</param>
    /// <returns>The content of the chat completion response, or an empty string if there is none.</returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="stream"/> is <see langword="null"/>.</exception>
    public virtual string ReadText(Stream stream)
    {
        Guard.IsNotNull(stream);

        if (stream.CanSeek)
        {
            return TranscribeImage(stream);
        }

        // The stream must be rewound after detecting its MIME type, which a non-seekable stream
        // does not support. Buffer it in memory so it can be read twice.
        using var bufferedStream = new MemoryStream();
        stream.CopyTo(bufferedStream);
        bufferedStream.Position = 0;

        return TranscribeImage(bufferedStream);
    }

    /// <inheritdoc/>
    public virtual void Initialize(Stream stream)
    {
        // Intentionally not implemented to comply with the Liskov Substitution Principle...
    }

    /// <inheritdoc/>
    public virtual void AppendText(Stream stream, string text)
    {
        // Intentionally not implemented to comply with the Liskov Substitution Principle...
    }

    /// <summary>
    /// Detects the MIME type of the image, rewinds the stream and asks the chat completion service to transcribe it.
    /// </summary>
    /// <param name="seekableStream">A stream that supports seeking and contains the image.</param>
    /// <returns>The content of the chat completion response, or an empty string if there is none.</returns>
    private string TranscribeImage(Stream seekableStream)
    {
        var mimeType = ImageHelper.GetMimeType(seekableStream);

        // Rewind so the whole image is sent, regardless of how much was read to detect the MIME type.
        seekableStream.Position = 0;

        var history = new ChatHistory(SystemPrompt);

        var message = new ChatMessageContentItemCollection()
        {
            new ImageContent(BinaryData.FromStream(seekableStream), mimeType),
        };

        history.AddUserMessage(message);

        // TODO: We can improve that making an async version of IEnmarchaDocumentConnector.
        var response = chatCompletionService.GetChatMessageContentAsync(history).GetAwaiter().GetResult();

        return response?.Content ?? string.Empty;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.