diff --git a/README.md b/README.md index 6f8d6b8..377d705 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,8 @@ **[.NET 7 Console Application - How-to](#net-7-code-samples---how-to)**
**[.NET Framework & Kepler Console Application - How-to](#keplerclient-code-samples---how-to)**
**[Powershell scripts - How-to](#powershell-script-samples---how-to)**
+### **[Performance Best Practices](#performance-best-practices)** +**[Import Job Settings](#import-job-settings)**
@@ -213,9 +215,9 @@ which may lead to errors during import process. .WithoutImages() .WithFieldsMapped(x => x .WithField(controlNumberColumnIndex, "Control Number") - .WithExtractedTextField(extractedTextPathColumnIndex, e => e - .WithExtractedTextInSeparateFiles(f => f - .WithEncoding("UTF-8")))) + .WithExtractedTextInSeparateFiles(f => f + .WithEncoding("UTF-16") + .WithFileSizeDefinedInColumn(fileSizeColumnIndex)))) .WithFolders(f => f .WithRootFolderID(rootFolderId, r => r .WithFolderPathDefinedInColumn(folderPathColumnIndex))); @@ -1129,4 +1131,49 @@ List of samples: - Invoke run-sample-import.ps1 +

+ +--- +# Performance Best Practices + +## Import Job Settings + +### Encoding +For improved performance when dealing with fileshare data on ADLS, we highly recommend using extracted text or other long text files encoded in UTF-16. Doing so avoids the need to convert the files to the correct encoding, leading to significant time savings in your document and image workflows. + +For the document workflow, set **FieldMapping.Encoding** to UTF-16. Similarly, for the image workflow, set **ImageSettings.ExtractedTextEncoding** to UTF-16. With these settings in place, the conversion overhead is eliminated and your files are copied directly in Unicode (UTF-16) encoding, resulting in faster processing times. + + ImportDocumentSettings importDocuments = ImportDocumentSettingsBuilder.Create() + .WithAppendMode() + .WithNatives(x => x + .WithFilePathDefinedInColumn(filePathColumnIndex) + .WithFileNameDefinedInColumn(fileNameColumnIndex)) + .WithoutImages() + .WithFieldsMapped(x => x + .WithField(controlNumberColumnIndex, "Control Number") + .WithExtractedTextField(extractedTextPathColumnIndex, e => e + .WithExtractedTextInSeparateFiles(f => f + .WithEncoding("UTF-16") + .WithFileSizeDefinedInColumn(fileSizeColumnIndex)))) + .WithoutFolders(); + + + ImportDocumentSettings importImages = ImportDocumentSettingsBuilder.Create() + .WithAppendMode() + .WithoutNatives() + .WithImages(i => i + .WithAutoNumberImages() + .WithoutProduction() + .WithExtractedText(e => e.WithEncoding("UTF-16")) + .WithFileTypeAutoDetection()) + .WithoutFieldsMapped() + .WithoutFolders(); + + +### FileSizeColumnIndex +Another valuable setting that can enhance performance is **FieldMapping.FileSizeColumnIndex**. When this setting is configured, no additional file size calculations are needed: the file sizes are read directly from the load file, streamlining the process and saving valuable processing time. 
+ +**Note:** The FileSizeColumnIndex setting takes effect only if FieldMapping.ContainsFilePath is set to true and FieldMapping.Encoding is set to UTF-16. This property applies only to long text fields stored in Data Grid, including Extracted Text. + + diff --git a/Samples/.Net7ClientConsole/Relativity.Import.Samples.Net7Client.csproj b/Samples/.Net7ClientConsole/Relativity.Import.Samples.Net7Client.csproj index 6b706da..b6f9f52 100644 --- a/Samples/.Net7ClientConsole/Relativity.Import.Samples.Net7Client.csproj +++ b/Samples/.Net7ClientConsole/Relativity.Import.Samples.Net7Client.csproj @@ -8,7 +8,7 @@ - + diff --git a/Samples/.Net7ClientConsole/SampleCollection/Sample05_ImportDocumentsWithExtractedText.cs b/Samples/.Net7ClientConsole/SampleCollection/Sample05_ImportDocumentsWithExtractedText.cs index 254f868..2fc4e82 100644 --- a/Samples/.Net7ClientConsole/SampleCollection/Sample05_ImportDocumentsWithExtractedText.cs +++ b/Samples/.Net7ClientConsole/SampleCollection/Sample05_ImportDocumentsWithExtractedText.cs @@ -41,6 +41,7 @@ public async Task Sample05_ImportDocumentsWithExtractedText() const int controlNumberColumnIndex = 0; const int extractedTextPathColumnIndex = 12; const int fileNameColumnIndex = 13; + const int fileSizeColumnIndex = 14; const int filePathColumnIndex = 22; // Path to the load file used in data source settings. @@ -63,7 +64,8 @@ public async Task Sample05_ImportDocumentsWithExtractedText() .WithField(controlNumberColumnIndex, "Control Number") .WithExtractedTextField(extractedTextPathColumnIndex, e => e .WithExtractedTextInSeparateFiles(f => f - .WithEncoding("UTF-8")))) + .WithEncoding("UTF-8") + .WithFileSizeDefinedInColumn(fileSizeColumnIndex)))) .WithoutFolders(); // Create payload for request. 
diff --git a/Samples/.Net7ClientConsole/SampleCollection/Sample07_DirectImportSettingsForDocuments.cs b/Samples/.Net7ClientConsole/SampleCollection/Sample07_DirectImportSettingsForDocuments.cs index 5d17471..8bc3a66 100644 --- a/Samples/.Net7ClientConsole/SampleCollection/Sample07_DirectImportSettingsForDocuments.cs +++ b/Samples/.Net7ClientConsole/SampleCollection/Sample07_DirectImportSettingsForDocuments.cs @@ -45,6 +45,7 @@ public async Task Sample07_DirectImportSettingsForDocuments() const int extractedTextFilePathColumnIndex = 12; const int emailToColumnIndex = 11; const int fileNameColumnIndex = 13; + const int fileSizeColumnIndex = 14; const int filePathColumnIndex = 22; // Create payload for request. @@ -99,6 +100,8 @@ public async Task Sample07_DirectImportSettingsForDocuments() ContainsID = false, ColumnIndex = extractedTextFilePathColumnIndex, ContainsFilePath = true, + Encoding = "UTF-8", + FileSizeColumnIndex = fileSizeColumnIndex }, }, }, @@ -106,15 +109,7 @@ public async Task Sample07_DirectImportSettingsForDocuments() { FolderPathColumnIndex = null, RootFolderID = rootFolderId, - }, - Other = new OtherSettings - { - ExtractedText = new ExtractedTextSettings - { - Encoding = null, - ValidateEncoding = true, - }, - }, + } }; // Create payload for request. 
diff --git a/Samples/.Net7ClientConsole/SampleCollection/Sample11_DirectImportSettingsForImages.cs b/Samples/.Net7ClientConsole/SampleCollection/Sample11_DirectImportSettingsForImages.cs index 89175fe..dfeb962 100644 --- a/Samples/.Net7ClientConsole/SampleCollection/Sample11_DirectImportSettingsForImages.cs +++ b/Samples/.Net7ClientConsole/SampleCollection/Sample11_DirectImportSettingsForImages.cs @@ -59,20 +59,14 @@ public async Task Sample11_DirectImportSettingsForImages() PageNumbering = PageNumbering.AutoNumberImages, ProductionID = null, LoadExtractedText = true, + ExtractedTextEncoding = "UTF-8" }, Fields = null, Folder = new FolderSettings { FolderPathColumnIndex = null, RootFolderID = rootFolderId, - }, - Other = new OtherSettings - { - ExtractedText = new ExtractedTextSettings - { - ValidateEncoding = true, - }, - }, + } }; // Create payload for request. diff --git a/Samples/KeplerClientConsole/Relativity.Import.Samples.NetFrameworkClient.csproj b/Samples/KeplerClientConsole/Relativity.Import.Samples.NetFrameworkClient.csproj index f177bc6..e59f225 100644 --- a/Samples/KeplerClientConsole/Relativity.Import.Samples.NetFrameworkClient.csproj +++ b/Samples/KeplerClientConsole/Relativity.Import.Samples.NetFrameworkClient.csproj @@ -6,7 +6,7 @@ - + diff --git a/Samples/KeplerClientConsole/SamplesCollection/Sample05_ImportDocumentsWithExtractedText.cs b/Samples/KeplerClientConsole/SamplesCollection/Sample05_ImportDocumentsWithExtractedText.cs index 9dff1c3..3a01072 100644 --- a/Samples/KeplerClientConsole/SamplesCollection/Sample05_ImportDocumentsWithExtractedText.cs +++ b/Samples/KeplerClientConsole/SamplesCollection/Sample05_ImportDocumentsWithExtractedText.cs @@ -37,6 +37,7 @@ public async Task Sample05_ImportDocumentsWithExtractedText() const int controlNumberColumnIndex = 0; const int extractedTextPathColumnIndex = 12; const int fileNameColumnIndex = 13; + const int fileSizeColumnIndex = 14; const int filePathColumnIndex = 22; // Path to the load files used 
in data source settings. @@ -53,7 +54,8 @@ public async Task Sample05_ImportDocumentsWithExtractedText() .WithField(controlNumberColumnIndex, "Control Number") .WithExtractedTextField(extractedTextPathColumnIndex, e => e .WithExtractedTextInSeparateFiles(f => f - .WithEncoding("UTF-8")))) + .WithEncoding("UTF-8") + .WithFileSizeDefinedInColumn(fileSizeColumnIndex)))) .WithoutFolders(); // Configuration settings for data source. Builder is used to create settings. diff --git a/Samples/KeplerClientConsole/SamplesCollection/Sample07_DirectImportSettingsForDocuments.cs b/Samples/KeplerClientConsole/SamplesCollection/Sample07_DirectImportSettingsForDocuments.cs index 052a06f..c88d247 100644 --- a/Samples/KeplerClientConsole/SamplesCollection/Sample07_DirectImportSettingsForDocuments.cs +++ b/Samples/KeplerClientConsole/SamplesCollection/Sample07_DirectImportSettingsForDocuments.cs @@ -40,6 +40,7 @@ public async Task Sample07_DirectImportSettingsForDocuments() const int extractedTextFilePathColumnIndex = 12; const int emailToColumnIndex = 11; const int fileNameColumnIndex = 13; + const int fileSizeColumnIndex = 14; const int filePathColumnIndex = 22; // Configuration settings for document import. Example of set without using ImportDocumentSettingsBuilder. @@ -87,6 +88,8 @@ public async Task Sample07_DirectImportSettingsForDocuments() ContainsID = false, ColumnIndex = extractedTextFilePathColumnIndex, ContainsFilePath = true, + Encoding = "UTF-8", + FileSizeColumnIndex = fileSizeColumnIndex }, }, }, @@ -94,15 +97,7 @@ public async Task Sample07_DirectImportSettingsForDocuments() { FolderPathColumnIndex = null, RootFolderID = rootFolderId, - }, - Other = new OtherSettings - { - ExtractedText = new ExtractedTextSettings - { - Encoding = null, - ValidateEncoding = true, - }, - }, + } }; // Example of data source configuration created without using DataSourceSettingsBuilder. 
diff --git a/Samples/KeplerClientConsole/SamplesCollection/Sample11_DirectImportSettingsForImages.cs b/Samples/KeplerClientConsole/SamplesCollection/Sample11_DirectImportSettingsForImages.cs index 00a67d3..ecbc97e 100644 --- a/Samples/KeplerClientConsole/SamplesCollection/Sample11_DirectImportSettingsForImages.cs +++ b/Samples/KeplerClientConsole/SamplesCollection/Sample11_DirectImportSettingsForImages.cs @@ -48,20 +48,14 @@ public async Task Sample11_DirectImportSettingsForImages() PageNumbering = PageNumbering.AutoNumberImages, ProductionID = null, LoadExtractedText = true, + ExtractedTextEncoding = "UTF-8" }, Fields = null, Folder = new FolderSettings { FolderPathColumnIndex = null, RootFolderID = rootFolderId, - }, - Other = new OtherSettings - { - ExtractedText = new ExtractedTextSettings - { - ValidateEncoding = true, - }, - }, + } }; // Configuration settings for data source created without DataSourceSettingsBuilder. diff --git a/Samples/KeplerClientConsole/SamplesCollection/Sample16_ReadImportDocumentSettings.cs b/Samples/KeplerClientConsole/SamplesCollection/Sample16_ReadImportDocumentSettings.cs index fe38c51..227f1f0 100644 --- a/Samples/KeplerClientConsole/SamplesCollection/Sample16_ReadImportDocumentSettings.cs +++ b/Samples/KeplerClientConsole/SamplesCollection/Sample16_ReadImportDocumentSettings.cs @@ -33,6 +33,7 @@ public async Task Sample16_ReadImportDocumentSettings() const int controlNumberColumnIndex = 0; const int emailToColumnIndex = 11; const int fileNameColumnIndex = 13; + const int fileSizeColumnIndex = 14; const int filePathColumnIndex = 22; // Configuration of document import. Builder is used to create the settings. 
@@ -47,7 +48,8 @@ public async Task Sample16_ReadImportDocumentSettings() .WithField(emailToColumnIndex, "Email To") .WithExtractedTextField(10, e => e.WithExtractedTextInSeparateFiles( - a => a.WithEncoding("UTF-8")))) + a => a.WithEncoding("UTF-8") + .WithFileSizeDefinedInColumn(fileSizeColumnIndex)))) .WithoutFolders(); using (Relativity.Import.V1.Services.IDocumentConfigurationController documentConfiguration = @@ -77,8 +79,9 @@ public async Task Sample16_ReadImportDocumentSettings() { // Reading of example fields. Console.WriteLine($"Native.FileNameColumnIndex: {documentSettings.Value.Native.FileNameColumnIndex}"); - Console.WriteLine($"ExtractedText.Encoding: {documentSettings.Value.Other?.ExtractedText?.Encoding}"); Console.WriteLine($"FieldMappings[0].ColumnIndex: {documentSettings.Value.Fields.FieldMappings[0].ColumnIndex}"); + Console.WriteLine($"FieldMappings[2].Encoding: {documentSettings.Value.Fields.FieldMappings[2].Encoding}"); + Console.WriteLine($"FieldMappings[2].FileSizeColumnIndex: {documentSettings.Value.Fields.FieldMappings[2].FileSizeColumnIndex}"); } } } @@ -87,6 +90,7 @@ public async Task Sample16_ReadImportDocumentSettings() /* Expected console result: Native.FileNameColumnIndex: 13 - ExtractedText.Encoding: UTF-8 FieldMappings[0].ColumnIndex: 0 + FieldMappings[2].Encoding: UTF-8 + FieldMappings[2].FileSizeColumnIndex: 14 */ \ No newline at end of file diff --git a/Samples/RestSamples/SamplesCollection/sample05-import-documents-with-extracted-text.ps1 b/Samples/RestSamples/SamplesCollection/sample05-import-documents-with-extracted-text.ps1 index 5cfcf02..531c239 100644 --- a/Samples/RestSamples/SamplesCollection/sample05-import-documents-with-extracted-text.ps1 +++ b/Samples/RestSamples/SamplesCollection/sample05-import-documents-with-extracted-text.ps1 @@ -16,7 +16,7 @@ Context "Sample05 Import documents with extracted text" { $body = @{ applicationName = "Import-service-sample-app" - correlationID = "Sample-job-0005" + correlationID = 
"Sample-job-0005" } | ConvertTo-Json -Depth 10 $response = $global:WebRequest.callPost($uri, $body) @@ -47,16 +47,12 @@ Context "Sample05 Import documents with extracted text" { "ColumnIndex": 12, "Field": "Extracted Text", "ContainsID": false, - "ContainsFilePath": true + "ContainsFilePath": true, + "Encoding": "UTF-8", + "FileSizeColumnIndex": 14 } ] }, - "Other":{ - "ExtractedText":{ - "Encoding": "UTF-8", - "ValidateEncoding": false - } - }, "Folder":null } }' @@ -69,18 +65,18 @@ Context "Sample05 Import documents with extracted text" { $uri = $global:Endpoints.importSourceAddUri($importId, $sourceId) $dataSourceConfigurationBody = @{ dataSourceSettings = @{ - path = $loadFilePath + path = $loadFilePath firstLineContainsColumnNames = $true - startLine = 0 - columnDelimiter = "|" - quoteDelimiter = "^" - newLineDelimiter = "#" - nestedValueDelimiter = "&" - multiValueDelimiter = "$" - endOfLine = 0 - encoding = $null - cultureInfo = "en-us" - type = 2 + startLine = 0 + columnDelimiter = "|" + quoteDelimiter = "^" + newLineDelimiter = "#" + nestedValueDelimiter = "&" + multiValueDelimiter = "$" + endOfLine = 0 + encoding = $null + cultureInfo = "en-us" + type = 2 } } | ConvertTo-Json -Depth 10 @@ -114,8 +110,7 @@ Context "Sample05 Import documents with extracted text" { [int]$sleepTime = 5 - while($isJobFinished -ne $true) - { + while ($isJobFinished -ne $true) { Start-Sleep -Seconds $sleepTime $jobDetailsResponse = $global:WebRequest.callGet($uri) $isJobFinished = $jobDetailsResponse."Value"."IsFinished" diff --git a/Samples/RestSamples/SamplesCollection/sample07-direct-import-settings-for-documents.ps1 b/Samples/RestSamples/SamplesCollection/sample07-direct-import-settings-for-documents.ps1 index 3c71875..5bf42d3 100644 --- a/Samples/RestSamples/SamplesCollection/sample07-direct-import-settings-for-documents.ps1 +++ b/Samples/RestSamples/SamplesCollection/sample07-direct-import-settings-for-documents.ps1 @@ -17,7 +17,7 @@ Context "Sample07 Direct import 
settings for documents" { $body = @{ applicationName = "Import-service-sample-app" - correlationID = "Sample-job-0007-doc-settings" + correlationID = "Sample-job-0007-doc-settings" } | ConvertTo-Json -Depth 10 $response = $global:WebRequest.callPost($uri, $body) @@ -28,56 +28,52 @@ Context "Sample07 Direct import settings for documents" { Describe "Create document configuration" { $uri = $global:Endpoints.documentConfigurationUri($importId) $field1 = @{ - ColumnIndex = 0 - Field = "Control Number" - ContainsID = $false + ColumnIndex = 0 + Field = "Control Number" + ContainsID = $false ContainsFilePath = $false } $field2 = @{ - ColumnIndex = 1 - Field = "Custodian - Single Choice" - ContainsID = $false + ColumnIndex = 1 + Field = "Custodian - Single Choice" + ContainsID = $false ContainsFilePath = $false } $field3 = @{ - ColumnIndex = 11 - Field = "Email To" - ContainsID = $false + ColumnIndex = 11 + Field = "Email To" + ContainsID = $false ContainsFilePath = $false } $field4 = @{ - ColumnIndex = 12 - Field = "Extracted Text" - ContainsID = $false - ContainsFilePath = $true + ColumnIndex = 12 + Field = "Extracted Text" + ContainsID = $false + ContainsFilePath = $true + Encoding = "UTF-8" + FileSizeColumnIndex = 14 } $fields = @($field1, $field2, $field3, $field4) $jobConfigurationBody = @{ importSettings = @{ Overlay = @{ - Mode = 3 - KeyField = "Control Number" + Mode = 3 + KeyField = "Control Number" MultiFieldOverlayBehaviour = 1 } - Native = @{ - FilePathColumnIndex = 22 + Native = @{ + FilePathColumnIndex = 22 FileNameColumnIndex = 13 } - Image = $null - Fields = @{ + Image = $null + Fields = @{ FieldMappings = $fields } - Folder = @{ - RootFolderID = $rootFolderId + Folder = @{ + RootFolderID = $rootFolderId FolderPathColumnIndex = $null } - Other = @{ - ExtractedText = @{ - Encoding = $null - ValidateEncoding = $true - } - } } } | ConvertTo-Json -Depth 10 $response = $global:WebRequest.callPost($uri, $jobConfigurationBody) @@ -89,18 +85,18 @@ Context 
"Sample07 Direct import settings for documents" { $uri = $global:Endpoints.importSourceAddUri($importId, $sourceId) $dataSourceConfigurationBody = @{ dataSourceSettings = @{ - path = $loadFilePath + path = $loadFilePath firstLineContainsColumnNames = $true - startLine = 0 - columnDelimiter = "|" - quoteDelimiter = "^" - newLineDelimiter = "#" - nestedValueDelimiter = "&" - multiValueDelimiter = "$" - endOfLine = 0 - encoding = $null - cultureInfo = "en-us" - type = 2 + startLine = 0 + columnDelimiter = "|" + quoteDelimiter = "^" + newLineDelimiter = "#" + nestedValueDelimiter = "&" + multiValueDelimiter = "$" + endOfLine = 0 + encoding = $null + cultureInfo = "en-us" + type = 2 } } | ConvertTo-Json -Depth 10 @@ -134,8 +130,7 @@ Context "Sample07 Direct import settings for documents" { [int]$sleepTime = 5 - while($isJobFinished -ne $true) - { + while ($isJobFinished -ne $true) { Start-Sleep -Seconds $sleepTime $jobDetailsResponse = $global:WebRequest.callGet($uri) $isJobFinished = $jobDetailsResponse."Value"."IsFinished" diff --git a/Samples/RestSamples/SamplesCollection/sample11-direct-import-settings-for-images.ps1 b/Samples/RestSamples/SamplesCollection/sample11-direct-import-settings-for-images.ps1 index 10e9125..ee6e8ea 100644 --- a/Samples/RestSamples/SamplesCollection/sample11-direct-import-settings-for-images.ps1 +++ b/Samples/RestSamples/SamplesCollection/sample11-direct-import-settings-for-images.ps1 @@ -38,17 +38,13 @@ Context "Sample11 Direct import settings for images" { PageNumbering = 1 ProductionID = $null LoadExtractedText = $true + ExtractedTextEncoding = "UTF-8" } Fields = $null Folder = @{ FolderPathColumnIndex = $null RootFolderID = $rootFolderId } - Other = @{ - ExtractedText = @{ - ValidateEncoding = $true - } - } } } | ConvertTo-Json -Depth 10 $response = $global:WebRequest.callPost($uri, $jobConfigurationBody)