From c20fefa44794573ef83f0aea034efca795a49cc5 Mon Sep 17 00:00:00 2001 From: Jamie Land Date: Mon, 11 Nov 2024 10:27:34 -0500 Subject: [PATCH] Updating weaviate content retreiever --- README.md | 10 ++- .../WeaviateContentRetrieverClient.java | 5 +- .../custom/WeaviateEmbeddingStoreCustom.java | 85 +++++++++---------- .../request/retriever/WeaviateRequest.java | 2 - 4 files changed, 53 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 0441c2d..6acdeaf 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,12 @@ https://drive.google.com/drive/folders/1jZe0cEw8p_E-fghd6IFPjwiabDNAhtp7?usp=dri We need to figure out a better way to handel this in the future (or put this on github) -## Tech Debt +## Code Standards -Figure out better secret management +### OWasp Security Scanning + +The [OWASP Dependency-Check Plugin](https://owasp.org/www-project-dependency-check/) can be run using the following command: + +```sh +mvn validate -P security-scanner +``` diff --git a/src/main/java/com/redhat/composer/config/retriever/contentRetriever/WeaviateContentRetrieverClient.java b/src/main/java/com/redhat/composer/config/retriever/contentRetriever/WeaviateContentRetrieverClient.java index 50c1612..9ba2797 100644 --- a/src/main/java/com/redhat/composer/config/retriever/contentRetriever/WeaviateContentRetrieverClient.java +++ b/src/main/java/com/redhat/composer/config/retriever/contentRetriever/WeaviateContentRetrieverClient.java @@ -50,11 +50,12 @@ public ContentRetriever getContentRetriever(RetrieverRequest request) { .scheme(scheme) .host(host) .apiKey(apiKey) - .metadataParentKey("") + .metadataFieldName("") + // .metadataFieldName(null) .metadataKeys(weaviateRequest.getMetadataFields()) .objectClass(index) .avoidDups(true) - .textKey(textKey) + .textFieldName(textKey) .build(); diff --git a/src/main/java/com/redhat/composer/config/retriever/contentRetriever/custom/WeaviateEmbeddingStoreCustom.java b/src/main/java/com/redhat/composer/config/retriever/contentRetriever/custom/WeaviateEmbeddingStoreCustom.java index d1e2ccb..0584bd5 100644 --- a/src/main/java/com/redhat/composer/config/retriever/contentRetriever/custom/WeaviateEmbeddingStoreCustom.java +++ b/src/main/java/com/redhat/composer/config/retriever/contentRetriever/custom/WeaviateEmbeddingStoreCustom.java @@ -40,16 +40,15 @@ public class WeaviateEmbeddingStoreCustom implements EmbeddingStore { private static final String ADDITIONALS = "_additional"; - private static final String METADATA = "_metadata"; private static final String NULL_VALUE = ""; private final WeaviateClient client; private final String objectClass; private final boolean avoidDups; private final String consistencyLevel; - private final String metadataParentKey; + private final String metadataFieldName; private final Collection metadataKeys; - private final String textKey; + private final String textFieldName; /** * Creates a new WeaviateEmbeddingStore instance. @@ -64,12 +63,12 @@ public class WeaviateEmbeddingStoreCustom implements EmbeddingStore * provided text segment, which avoids duplicated entries in DB. * If false, then random ID will be generated. * @param consistencyLevel Consistency level: ONE, QUORUM (default) or ALL. Find more details here. - * @param metadataParentKey The key in metadata that contains the metadata. Default is "_metadata". If set to empty string, then metadata will be stored in the root of the object. * @param metadataKeys Metadata keys that should be persisted (optional) * @param useGrpcForInserts Use GRPC instead of HTTP for batch inserts only. You still need HTTP configured for search * @param securedGrpc The GRPC connection is secured * @param grpcPort The port, e.g. 50051. This parameter is optional. - * @param textKey The key in metadata that contains the text. Default is "text". + * @param textFieldName The name of the field that contains the text of a {@link TextSegment}. Default is "text". + * @param metadataFieldName metadataFieldName The name of the field where {@link Metadata} entries are stored. Default is "_metadata". If set to empty string, {@link Metadata} entries will be stored in the root of the Weaviate object. */ @Builder public WeaviateEmbeddingStoreCustom( @@ -83,9 +82,9 @@ public WeaviateEmbeddingStoreCustom( String objectClass, Boolean avoidDups, String consistencyLevel, - String metadataParentKey, Collection metadataKeys, - String textKey + String textFieldName, + String metadataFieldName ) { try { @@ -108,9 +107,9 @@ public WeaviateEmbeddingStoreCustom( this.objectClass = getOrDefault(objectClass, "Default"); this.avoidDups = getOrDefault(avoidDups, true); this.consistencyLevel = getOrDefault(consistencyLevel, QUORUM); - this.metadataParentKey = getOrDefault(metadataParentKey, "_metadata"); + this.metadataFieldName = getOrDefault(metadataFieldName, "_metadata"); this.metadataKeys = getOrDefault(metadataKeys, Collections.emptyList()); - this.textKey = getOrDefault(textKey, "text"); + this.textFieldName = getOrDefault(textFieldName, "text"); } private static String concatenate(String host, Integer port) { @@ -187,7 +186,7 @@ public List> findRelevant( double minCertainty ) { List fields = new ArrayList<>(); - fields.add(Field.builder().name(textKey).build()); + fields.add(Field.builder().name(textFieldName).build()); fields.add(Field .builder() .name(ADDITIONALS) @@ -202,8 +201,8 @@ public List> findRelevant( for (String property : metadataKeys) { metadataFields.add(Field.builder().name(property).build()); } - if (metadataParentKey != null && !metadataParentKey.isEmpty()) { - fields.add(Field.builder().name(metadataParentKey).fields(metadataFields.toArray(new Field[0])).build()); + if (!metadataFieldName.isEmpty()) { + fields.add(Field.builder().name(metadataFieldName).fields(metadataFields.toArray(new Field[0])).build()); } else { fields.addAll(metadataFields); } @@ -222,7 +221,6 @@ public List> findRelevant( ) .withLimit(maxResults) .run(); - if (result.hasErrors()) { throw new IllegalArgumentException( result.getError().getMessages().stream().map(WeaviateErrorMessage::getMessage).collect(joining("\n")) @@ -275,7 +273,7 @@ private WeaviateObject buildObject(String id, Embedding embedding, TextSegment s Map props = new HashMap<>(); Map metadata = prefillMetadata(); if (segment != null) { - props.put(textKey, segment.text()); + props.put(textFieldName, segment.text()); if (!segment.metadata().toMap().isEmpty()) { for (String property : metadataKeys) { if (segment.metadata().containsKey(property)) { @@ -285,7 +283,7 @@ private WeaviateObject buildObject(String id, Embedding embedding, TextSegment s } setMetadata(props, metadata); } else { - props.put(textKey, ""); + props.put(textFieldName, ""); setMetadata(props, metadata); } props.put("indexFilterable", true); @@ -301,8 +299,8 @@ private WeaviateObject buildObject(String id, Embedding embedding, TextSegment s private void setMetadata(Map props, Map metadata) { if (metadata != null && !metadata.isEmpty()) { - if(metadataParentKey != null && !metadataParentKey.isEmpty()) { - props.put(metadataParentKey, metadata); + if(metadataFieldName != null && !metadataFieldName.isEmpty()) { + props.put(metadataFieldName, metadata); } else { props.putAll(metadata); } @@ -318,33 +316,34 @@ private Map prefillMetadata() { } private EmbeddingMatch toEmbeddingMatch(Map item) { - Map additional = (Map) item.get(ADDITIONALS); - final Metadata metadata = new Metadata(); - Map metadataMap = new HashMap(); - if (metadataParentKey == null || metadataParentKey.isEmpty()) { - metadataKeys.stream().forEach(key -> - metadata.add(key, item.get(key)) - ); - } - else if (item.get(metadataParentKey) != null && item.get(metadataParentKey) instanceof Map) { - metadataMap = (Map) item.get(metadataParentKey); - } - if(metadataMap != null) { - for (Map.Entry entry : metadataMap.entrySet()) { - if (entry.getValue() != null && !NULL_VALUE.equals(entry.getValue())) { - metadata.add(entry.getKey(), entry.getValue()); - } - } + Map additional = (Map) item.get(ADDITIONALS); + final Metadata metadata = new Metadata(); + Map metadataMap = new HashMap<>(); + if (metadataFieldName.isEmpty()) { + metadataMap = new HashMap<>(item); + // Remove text field from metadata if we store metadata in the root of the object + metadataMap.remove(textFieldName); + } else if (item.get(metadataFieldName) != null && item.get(metadataFieldName) instanceof Map) { + metadataMap = (Map) item.get(metadataFieldName); + } + if (metadataKeys != null && !metadataKeys.isEmpty()) { + metadataMap.keySet().retainAll(metadataKeys); + } + for (Map.Entry entry : metadataMap.entrySet()) { + if (entry.getValue() != null && !NULL_VALUE.equals(entry.getValue())) { + // TODO: Remove or replace use of deprecated method + metadata.add(entry.getKey(), entry.getValue()); } - String text = (String) item.get(textKey); + } + String text = (String) item.get(textFieldName); - return new EmbeddingMatch<>( - (Double) additional.get("certainty"), - (String) additional.get("id"), - Embedding.from( - ((List) additional.get("vector")).stream().map(Double::floatValue).collect(toList()) - ), - isNullOrBlank(text) ? null : TextSegment.from(text, metadata) - ); + return new EmbeddingMatch<>( + (Double) additional.get("certainty"), + (String) additional.get("id"), + Embedding.from( + ((List) additional.get("vector")).stream().map(Double::floatValue).collect(toList()) + ), + isNullOrBlank(text) ? null : TextSegment.from(text, metadata) + ); } } diff --git a/src/main/java/com/redhat/composer/model/request/retriever/WeaviateRequest.java b/src/main/java/com/redhat/composer/model/request/retriever/WeaviateRequest.java index a4238de..9752fdd 100644 --- a/src/main/java/com/redhat/composer/model/request/retriever/WeaviateRequest.java +++ b/src/main/java/com/redhat/composer/model/request/retriever/WeaviateRequest.java @@ -4,9 +4,7 @@ import java.util.Objects; import org.apache.commons.lang3.builder.EqualsBuilder; -import org.bson.codecs.pojo.annotations.BsonDiscriminator; -import com.fasterxml.jackson.annotation.JsonSubTypes; import com.redhat.composer.model.enums.ContentRetrieverType; public class WeaviateRequest extends BaseRetrieverRequest {