From a5951f4c68f1e270ffa9b3033df16231c1d07359 Mon Sep 17 00:00:00 2001
From: Tom Bruyneel
Date: Tue, 26 Mar 2024 09:58:22 +0100
Subject: [PATCH] add generic chunking algorithm

---
 .../Jobs/WebsitePageIndexingJob.cs            | 181 +++++++++++++++++-
 .../Services/Implementations/RAGService.cs    |  18 ++
 2 files changed, 197 insertions(+), 2 deletions(-)

diff --git a/src/backoffice/ConversationalSearchPlatform.BackOffice/Jobs/WebsitePageIndexingJob.cs b/src/backoffice/ConversationalSearchPlatform.BackOffice/Jobs/WebsitePageIndexingJob.cs
index 1f864f2..8303609 100644
--- a/src/backoffice/ConversationalSearchPlatform.BackOffice/Jobs/WebsitePageIndexingJob.cs
+++ b/src/backoffice/ConversationalSearchPlatform.BackOffice/Jobs/WebsitePageIndexingJob.cs
@@ -16,11 +16,39 @@
 using System.Net;
 using System.Text;
 using System.Text.RegularExpressions;
+using System.Web;
 
 namespace ConversationalSearchPlatform.BackOffice.Jobs;
 
 public class WebsitePageIndexingJob : ITenantAwareIndexingJob
 {
+    /// <summary>
+    /// A non-blank text node paired with the closest ancestor whose text differs from the node's own text.
+    /// </summary>
+    private sealed class Node
+    {
+        public HtmlNode Ancestor { get; }
+        public HtmlNode HtmlNode { get; }
+        public string InnerText { get; set; } = "";
+
+        public Node(HtmlNode ancestor, HtmlNode htmlNode)
+        {
+            Ancestor = ancestor;
+            HtmlNode = htmlNode;
+        }
+
+        /// <summary>
+        /// Heading weight of the wrapped node: 100 for h1, 90 for h2, 80 for h3 and 0 for anything else.
+        /// </summary>
+        public int TitleScore => HtmlNode?.Name.ToLowerInvariant() switch
+        {
+            "h1" => 100,
+            "h2" => 90,
+            "h3" => 80,
+            _ => 0,
+        };
+    }
+
     private readonly IDbContextFactory _dbContextFactory;
     private readonly IMultiTenantContextAccessor _tenantContextAccessor;
     private readonly IMultiTenantStore _multiTenantStore;
@@ -322,7 +350,7 @@ private async Task CreateEntry(ApplicationDbContext db, string tenantId, Website
                     await _vectorizationService.BulkCreateAsync(nameof(WebsitePage), websitePage.Id, scrapeResult.PageTitle, tenantId, UsageType.Indexing, chunkCollection);
                 }
             }
-            else
+            else if (websitePage.Url.Contains("tena.co.uk", StringComparison.OrdinalIgnoreCase))
             {
                 List chunks = new List();
 
@@ -350,7 +378,33 @@ private async Task CreateEntry(ApplicationDbContext db, string tenantId, Website
 
                     await _vectorizationService.BulkCreateAsync(nameof(WebsitePage), websitePage.Id, scrapeResult.PageTitle, tenantId, UsageType.Indexing, chunkCollection);
                 }
-            } 
+            }
+            else
+            {
+                // Generic chunking: the tenant's XPathForSite locates the content to chunk.
+                var tenantInfo = await _multiTenantStore.TryGetAsync(tenantId);
+
+                if (tenantInfo is null)
+                {
+                    throw new InvalidOperationException("Tenant not found");
+                }
+
+                var chunks = ChunkGenericHtmlPage(htmlDoc, tenantInfo.XPathForSite)
+                    .Select(chunk => new ChunkResult()
+                    {
+                        ArticleNumber = string.Empty,
+                        Text = chunk,
+                        Packaging = string.Empty,
+                    })
+                    .ToList();
+
+                if (chunks.Count > 0)
+                {
+                    ChunkCollection chunkCollection = new ChunkCollection(tenantId, websitePage.Id.ToString(), websitePage.Url, websitePage.ReferenceType.ToString(), websitePage.Language.ToString(), chunks);
+
+                    await _vectorizationService.BulkCreateAsync(nameof(WebsitePage), websitePage.Id, scrapeResult.PageTitle, tenantId, UsageType.Indexing, chunkCollection);
+                }
+            }
         }
         else
         {
@@ -417,6 +471,129 @@ private async Task CreateEntry(ApplicationDbContext db, string tenantId, Website
         await db.SaveChangesAsync();
     }
 
+    /// <summary>
+    /// Splits the text found under <paramref name="xpath"/> into chunks, grouping text nodes by their owning ancestor.
+    /// </summary>
+    /// <param name="htmlDocument">Parsed page; elements that sit next to real text are replaced in place by their text.</param>
+    /// <param name="xpath">XPath prefix that selects the content root(s); "//*" and "//text()" are appended to it.</param>
+    /// <returns>The text chunks in document order, or an empty list when no usable text is found.</returns>
+    private List<string> ChunkGenericHtmlPage(HtmlDocument htmlDocument, string xpath)
+    {
+        const int maxSize = 2048;
+        const double thresholdZScore = 3.0; // Customize this threshold based on your needs
+
+        // Replace elements that sit directly next to real text by their plain text, so that
+        // e.g. a link in the middle of a sentence does not split that sentence.
+        var textNodesToFlatten = htmlDocument.DocumentNode.SelectNodes($"{xpath}//*");
+
+        if (textNodesToFlatten != null)
+        {
+            foreach (var textNodeToFlatten in textNodesToFlatten)
+            {
+                var flattenedText = textNodeToFlatten.InnerText.Trim();
+
+                if (!string.IsNullOrWhiteSpace(flattenedText) && HasRealTextSibling(textNodeToFlatten))
+                {
+                    textNodeToFlatten.ParentNode.ReplaceChild(htmlDocument.CreateTextNode(flattenedText), textNodeToFlatten);
+                }
+            }
+        }
+
+        // Re-parse the flattened markup; presumably this merges adjacent text fragments into one text node.
+        var flattenedDocument = new HtmlDocument();
+        flattenedDocument.LoadHtml(htmlDocument.DocumentNode.InnerHtml);
+
+        // SelectNodes yields null rather than an empty collection when nothing matches.
+        var nodes = flattenedDocument.DocumentNode.SelectNodes($"{xpath}//text()");
+
+        if (nodes == null)
+        {
+            return new List<string>();
+        }
+
+        var textNodes = new List<Node>();
+
+        foreach (var node in nodes)
+        {
+            if ((node.ParentNode.Name is "option" or "label" or "script") || string.IsNullOrWhiteSpace(node.InnerText))
+            {
+                continue;
+            }
+
+            var innerText = HttpUtility.HtmlDecode(node.InnerText.Trim());
+
+            // The owning ancestor is the closest one whose non-blank text is more than just this node's text.
+            var nodeAncestor = node.Ancestors().FirstOrDefault(ancestor =>
+            {
+                var ancestorInnerText = HttpUtility.HtmlDecode(ancestor.InnerText.Trim());
+
+                return !string.IsNullOrWhiteSpace(ancestorInnerText) && ancestorInnerText != innerText;
+            });
+
+            if (nodeAncestor != null)
+            {
+                textNodes.Add(new Node(nodeAncestor, node) { InnerText = innerText });
+            }
+        }
+
+        // Average() throws on an empty sequence, so stop here when nothing qualified.
+        if (textNodes.Count == 0)
+        {
+            return new List<string>();
+        }
+
+        // Remove common strings: drop texts whose occurrence count is a z-score outlier.
+        var frequencyMap = textNodes.GroupBy(s => s.InnerText)
+            .ToDictionary(g => g.Key, g => g.Count());
+        double mean = frequencyMap.Values.Average();
+        double stdDev = Math.Sqrt(frequencyMap.Values.Sum(val => (val - mean) * (val - mean)) / frequencyMap.Count);
+
+        // A zero deviation means every text is equally frequent: no outliers, and no division by zero.
+        if (stdDev > 0)
+        {
+            var stringsToRemove = frequencyMap.Where(kvp => (kvp.Value - mean) / stdDev > thresholdZScore)
+                .Select(kvp => kvp.Key)
+                .ToHashSet();
+
+            textNodes.RemoveAll(candidate => stringsToRemove.Contains(candidate.InnerText));
+        }
+
+        // Not every count can exceed the mean, so at least one text survives and index 0 exists.
+        var chunks = new List<string>();
+        var builder = new StringBuilder();
+        HtmlNode previousAncestor = textNodes[0].Ancestor;
+
+        foreach (var textNode in textNodes)
+        {
+            // Start a new chunk when the ancestor changes, unless the new ancestor is nested inside
+            // the previous one and the current chunk is still below the size limit.
+            if (textNode.Ancestor != previousAncestor
+                && !(builder.Length < maxSize && previousAncestor.Descendants().Contains(textNode.Ancestor)))
+            {
+                previousAncestor = textNode.Ancestor;
+                chunks.Add(builder.ToString());
+                builder.Clear();
+            }
+
+            builder.AppendLine(textNode.InnerText);
+        }
+
+        chunks.Add(builder.ToString());
+
+        return chunks;
+    }
+
+    /// <summary>
+    /// Tells whether <paramref name="htmlNode"/> sits directly next to a text node that is not just whitespace.
+    /// </summary>
+    private static bool HasRealTextSibling(HtmlNode htmlNode)
+    {
+        return IsNonBlankText(htmlNode.NextSibling) || IsNonBlankText(htmlNode.PreviousSibling);
+
+        static bool IsNonBlankText(HtmlNode? sibling) =>
+            sibling?.NodeType == HtmlNodeType.Text && !string.IsNullOrWhiteSpace(sibling.InnerText);
+    }
+
     private static async Task CreateChildPages(
         IAzureBlobStorage azureBlobStorage,
 
diff --git a/src/backoffice/ConversationalSearchPlatform.BackOffice/Services/Implementations/RAGService.cs b/src/backoffice/ConversationalSearchPlatform.BackOffice/Services/Implementations/RAGService.cs
index 4c1386f..1bd5af4 100644
--- a/src/backoffice/ConversationalSearchPlatform.BackOffice/Services/Implementations/RAGService.cs
+++ b/src/backoffice/ConversationalSearchPlatform.BackOffice/Services/Implementations/RAGService.cs
@@ -6,6 +6,8 @@ public class RAGService : IRagService
 {
     private readonly Guid TENA_ID = Guid.Parse("CCFA9314-ABE6-403A-9E21-2B31D95A5258");
     private readonly Guid IODIGITAL_ID = Guid.Parse("4903E29F-D633-4A4C-9065-FE3DD8F27E40");
+    private readonly Guid AXA_ID = Guid.Parse("37f63c3e-e897-42a7-ab79-7052ef8d9866");
+    private readonly Guid HELAN_ID = Guid.Parse("3c9189de-87b2-47ac-ada1-25bcec668b89");
 
     public Task GetRAGDocumentAsync(Guid tenantId)
     {
@@ -33,6 +35,22 @@ public Task GetRAGDocumentAsync(Guid tenantId)
                 Name = "Site",
             });
         }
+        else if (tenantId == AXA_ID)
+        {
+            ragDocument.Classes.Add(new RAGClass()
+            {
+                Description = "The following text sources contain information that is available on the AXA Partners site. They form your knowledge base and thus extend and build upon the data you already have. Whenever a user asks a question about something that is contained within these documents, you can use the provided information to answer with certainty.",
+                Name = "Site",
+            });
+        }
+        else if (tenantId == HELAN_ID)
+        {
+            ragDocument.Classes.Add(new RAGClass()
+            {
+                Description = "The following text sources contain information that is available on the Helan site. They form your knowledge base and thus extend and build upon the data you already have. Whenever a user asks a question about something that is contained within these documents, you can use the provided information to answer with certainty.",
+                Name = "Site",
+            });
+        }
         else
         {
             ragDocument.Classes.Add(new RAGClass()