Skip to content

Commit

Permalink
DR2-2002 Validate metadata only
Browse files Browse the repository at this point in the history
This allows you to skip the file-existence, checksum and integrity checks,
which you may need to do if you have a draft metadata file without the
files themselves.

I've added a button in the UI, an argument in the CLI and an option in
the Java bridge.
  • Loading branch information
MancunianSam committed Nov 21, 2024
1 parent 57fbb70 commit a2c638e
Show file tree
Hide file tree
Showing 25 changed files with 154 additions and 87 deletions.
2 changes: 2 additions & 0 deletions csv-validator-cmd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ Usage: validate [options] <csv-path> <csv-schema-path>
The path to the CSV Schema file to use for validation
--disable-utf8-validation
Disable UTF-8 validation for CSV files
--skip-file-checks
Skip integrity, checksum and file existence checks
--show-progress
Show progress
--help
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ object CsvValidatorCmdApp extends App {
csvSchemaPath: Path = Paths.get("."),
csvSchemaEncoding: Charset = CsvValidator.DEFAULT_ENCODING,
disableUtf8Validation:Boolean = false,
progressCallback: Option[ProgressCallback] = None)
progressCallback: Option[ProgressCallback] = None,
skipFileChecks: Boolean = false)

def run(args: Array[String]): ExitStatus = {

Expand All @@ -69,6 +70,7 @@ object CsvValidatorCmdApp extends App {
opt[Charset]('x', "csv-encoding").optional().action { (x,c) => c.copy(csvEncoding = x) } text("Defines the charset encoding used in the CSV file")
opt[Charset]('y', "csv-schema-encoding").optional().action { (x,c) => c.copy(csvSchemaEncoding = x) }.text("Defines the charset encoding used in the CSV Schema file")
opt[Unit]("disable-utf8-validation").optional().action {(_, c) => c.copy(disableUtf8Validation = true)}.text("Disable UTF-8 validation for CSV files.")
// --skip-file-checks must set the skipFileChecks flag on the config (it is what gets passed on to validation); it must not touch progressCallback, which belongs to --show-progress.
opt[Unit]("skip-file-checks").optional().action {(_, c) => c.copy(skipFileChecks = true)}.text("Skip integrity, checksum and file existence checks")
opt[Unit]("show-progress").optional().action {(_, c) => c.copy(progressCallback = Some(commandLineProgressCallback()))}.text("Show progress")
arg[Path]("<csv-path>").validate { x => if(Files.exists(x) && Files.isReadable(x)) success else failure(s"Cannot access CSV file: ${x.toString}") }.action { (x,c) => c.copy(csvPath = x) }.text("The path to the CSV file to validate")
arg[Path]("<csv-schema-path>").validate { x => if(Files.exists(x) && Files.isReadable(x)) success else failure(s"Cannot access CSV Schema file: ${x.toString}") }.action { (x,c) => c.copy(csvSchemaPath = x) }.text("The path to the CSV Schema file to use for validation")
Expand All @@ -84,7 +86,8 @@ object CsvValidatorCmdApp extends App {
config.substitutePaths,
config.caseSensitivePaths,
config.traceParser,
config.progressCallback
config.progressCallback,
config.skipFileChecks
)
} getOrElse {
//arguments are bad, usage message will have been displayed
Expand Down Expand Up @@ -141,7 +144,7 @@ object CsvValidatorCmdApp extends App {
}

def getColumnFromCsv(csvFile: TextFile, csvSchemaFile: TextFile, columnName: String): List[String] = Try {
val validator = createValidator(true, Nil, false, false)
val validator = createValidator(true, Nil, false, false, false)
val csv = validator.loadCsvFile(csvFile, csvSchemaFile)
csv.headOption.map(_.indexOf("identifier")).map { identifierIdx =>
csv.tail.map(arr => arr(identifierIdx))
Expand All @@ -157,9 +160,10 @@ object CsvValidatorCmdApp extends App {
enforceCaseSensitivePathChecks: Boolean,
trace: Boolean,
progress: Option[ProgressCallback],
skipFileChecks: Boolean,
onRow: ValidatedNel[FailMessage, Any] => Unit = rowCallback
): ExitStatus = {
val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace)
val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace, skipFileChecks)
validator.parseSchema(schemaFile) match {
case Validated.Invalid(errors) => (prettyPrint(errors), SystemExitCodes.InvalidSchema)
case Validated.Valid(schema) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ object CsvValidator {
type PathTo = String
type SubstitutePath = (PathFrom, PathTo)

def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean) = {
def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean, skipFileChecksSwitch: Boolean) = {
if(failFast) {
new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch }
new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch}
} else {
new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch }
new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch }
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ with TraceableParsers {
*/
val enforceCaseSensitivePathChecks: Boolean


val skipFileChecks: Boolean

lazy val versionHeader: PackratParser[String] = "VersionDecl" ::= ("version" ~> versionLiteral )

Expand Down Expand Up @@ -146,13 +146,15 @@ with TraceableParsers {
val ecspc = enforceCaseSensitivePathChecks
val ps = pathSubstitutions
val t = trace
val sfc = skipFileChecks

SchemaValidator.versionValid(version).map(Failure(_, next)).getOrElse {
version match {
case "1.2" =>
val parser1_2 = new SchemaParser1_2 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
override val pathSubstitutions: List[(String, String)] = ps
override val trace: Boolean = t
override val skipFileChecks: Boolean = sfc
}

parser1_2.parseVersionAware(reader) match {
Expand All @@ -165,6 +167,7 @@ with TraceableParsers {
val parser1_1 = new SchemaParser1_1 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
override val pathSubstitutions: List[(String, String)] = ps
override val trace: Boolean = t
override val skipFileChecks: Boolean = sfc
}

parser1_1.parseVersionAware(reader) match {
Expand All @@ -177,6 +180,7 @@ with TraceableParsers {
val parser1_0 = new SchemaParser1_0 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
override val pathSubstitutions: List[(String, String)] = ps
override val trace: Boolean = t
override val skipFileChecks: Boolean = sfc
}

parser1_0.parseVersionAware(reader) match {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,17 +106,21 @@ case class RegExpRule(regex: String) extends Rule("regex") {
}

//TODO note the use of `Seq(rootPath): _*` when extending Rule, this is to workaround this bug https://issues.scala-lang.org/browse/SI-7436. This pattern is repeated below!
case class FileExistsRule(pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean, rootPath: ArgProvider = Literal(None)) extends Rule("fileExists", Seq(rootPath): _*) {
case class FileExistsRule(pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean, rootPath: ArgProvider = Literal(None), skipFileChecks: Boolean = false) extends Rule("fileExists", Seq(rootPath): _*) {

override def valid(filePath: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None) = {
if(skipFileChecks) {
true
} else {
val ruleValue = rootPath.referenceValue(columnIndex, row, schema)

val ruleValue = rootPath.referenceValue(columnIndex, row, schema)

val fileExists = ruleValue match {
case Some(rp) => new FileSystem(rp, filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks)
case None => new FileSystem(filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks)
ruleValue match {
case Some(rp) => new FileSystem(rp, filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks)
case None => new FileSystem(filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks)
}
}
fileExists


}

override def toError = s"""$ruleName""" + (if (rootPath == Literal(None)) "" else s"""(${rootPath.toError})""")
Expand Down Expand Up @@ -317,18 +321,21 @@ case class UniqueMultiRule(columns: List[ColumnReference]) extends Rule("unique(
}
}

case class ChecksumRule(rootPath: ArgProvider, file: ArgProvider, algorithm: String, pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean = false) extends Rule("checksum", Seq(rootPath, file): _*) with FileWildcardSearch[String] {
case class ChecksumRule(rootPath: ArgProvider, file: ArgProvider, algorithm: String, pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean = false, skipFileChecks: Boolean = false) extends Rule("checksum", Seq(rootPath, file): _*) with FileWildcardSearch[String] {

def this(file: ArgProvider, algorithm: String, pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean) = this(Literal(None), file, algorithm, pathSubstitutions, enforceCaseSensitivePathChecks)
def this(file: ArgProvider, algorithm: String, enforceCaseSensitivePathChecks: Boolean) = this(Literal(None), file, algorithm, List.empty[(String,String)], enforceCaseSensitivePathChecks)
def this(file: ArgProvider, algorithm: String, pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean) = this(Literal(None), file, algorithm, pathSubstitutions, enforceCaseSensitivePathChecks, false)
// Auxiliary constructor with no root path (Literal(None)) and no path substitutions; forwards the caller's skipFileChecks instead of discarding it.
def this(file: ArgProvider, algorithm: String, enforceCaseSensitivePathChecks: Boolean, skipFileChecks: Boolean) = this(Literal(None), file, algorithm, List.empty[(String,String)], enforceCaseSensitivePathChecks, skipFileChecks)

override def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): RuleValidation[Any] = {
val columnDefinition = schema.columnDefinitions(columnIndex)

search(filename(columnIndex, row, schema)) match {
case Validated.Valid(hexValue: String) if hexValue == cellValue(columnIndex,row,schema) => true.validNel[String]
case Validated.Valid(hexValue: String) => s"""$toError file "${TypedPath(filename(columnIndex, row, schema)._1 + filename(columnIndex, row, schema)._2).toPlatform}" checksum match fails for line: ${row.lineNumber}, column: ${columnDefinition.id}, ${toValueError(row,columnIndex)}. Computed checksum value:"${hexValue}"""".invalidNel[Any]
case Validated.Invalid(errMsg) => s"$toError ${errMsg.head} for line: ${row.lineNumber}, column: ${columnDefinition.id}, ${toValueError(row,columnIndex)}".invalidNel[Any]
if(skipFileChecks) {
Validated.Valid("")
} else {
search(filename(columnIndex, row, schema)) match {
case Validated.Valid(hexValue: String) if hexValue == cellValue(columnIndex, row, schema) => true.validNel[String]
case Validated.Valid(hexValue: String) => s"""$toError file "${TypedPath(filename(columnIndex, row, schema)._1 + filename(columnIndex, row, schema)._2).toPlatform}" checksum match fails for line: ${row.lineNumber}, column: ${columnDefinition.id}, ${toValueError(row, columnIndex)}. Computed checksum value:"${hexValue}"""".invalidNel[Any]
case Validated.Invalid(errMsg) => s"$toError ${errMsg.head} for line: ${row.lineNumber}, column: ${columnDefinition.id}, ${toValueError(row, columnIndex)}".invalidNel[Any]
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -481,9 +481,9 @@ trait SchemaParser extends BaseSchemaParser {
*/
lazy val fileExistsExpr: PackratParser[FileExistsRule] = "FileExistsExpr" ::= ("fileExists" ~> opt("(" ~> stringProvider <~ ")")).withFailureMessage("Invalid fileExists rule") ^^ {
case None =>
FileExistsRule(pathSubstitutions, enforceCaseSensitivePathChecks)
FileExistsRule(pathSubstitutions, enforceCaseSensitivePathChecks, skipFileChecks = skipFileChecks)
case Some(s) =>
FileExistsRule(pathSubstitutions, enforceCaseSensitivePathChecks, s)
FileExistsRule(pathSubstitutions, enforceCaseSensitivePathChecks, s, skipFileChecks = skipFileChecks)
}


Expand All @@ -492,7 +492,7 @@ trait SchemaParser extends BaseSchemaParser {
*/
lazy val checksumExpr = "ChecksumExpr" ::= ("checksum(" ~> fileExpr <~ ",") ~ stringLiteral <~ ")" ^^ {
case files ~ algorithm =>
ChecksumRule(files._1.getOrElse(Literal(None)), files._2, algorithm, pathSubstitutions, enforceCaseSensitivePathChecks)
ChecksumRule(files._1.getOrElse(Literal(None)), files._2, algorithm, pathSubstitutions, enforceCaseSensitivePathChecks, skipFileChecks)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ case class SwitchRule(elseRules: Option[List[Rule]], cases:(Rule, List[Rule])*)



case class IntegrityCheckRule(pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean, rootPath: ArgProvider = Literal(None), topLevelFolder: String = "content", includeFolder: Boolean = false) extends Rule("integrityCheck", Seq(rootPath): _*) {
case class IntegrityCheckRule(pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean, rootPath: ArgProvider = Literal(None), topLevelFolder: String = "content", includeFolder: Boolean = false, skipFileChecks: Boolean = false) extends Rule("integrityCheck", Seq(rootPath): _*) {

//TODO introduce state, not very functional
var filesMap = Map[String, Set[Path]]()
Expand All @@ -83,8 +83,10 @@ case class IntegrityCheckRule(pathSubstitutions: List[(String,String)], enforceC
}

override def valid(filePath: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean]): Boolean = {

if (!filePath.isEmpty){
if (skipFileChecks) {
true
}
else if (!filePath.isEmpty){

val ruleValue = rootPath.referenceValue(columnIndex, row, schema)
val filePathS = if (FILE_SEPARATOR == WINDOWS_FILE_SEPARATOR)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,9 @@ trait SchemaParser extends SchemaParser1_0 {

lazy val integrityCheckExpr: PackratParser[IntegrityCheckRule] = "IntegrityCheckExpr" ::= ("integrityCheck" ~> "(" ~> opt(stringProvider <~ ",") ~ opt(stringLiteral <~ ",") ~ stringLiteral <~ ")" ).withFailureMessage("Invalid integrityCheck rule") ^^ {
case rp ~ topLevelFolder ~ includeFolder if (includeFolder == "includeFolder") =>
IntegrityCheckRule(pathSubstitutions, enforceCaseSensitivePathChecks, rp.getOrElse(Literal(None)), topLevelFolder.getOrElse("content"), true)
IntegrityCheckRule(pathSubstitutions, enforceCaseSensitivePathChecks, rp.getOrElse(Literal(None)), topLevelFolder.getOrElse("content"), true, skipFileChecks)
case rp ~ topLevelFolder ~ includeFolder if (includeFolder == "excludeFolder") =>
IntegrityCheckRule(pathSubstitutions, enforceCaseSensitivePathChecks, rp.getOrElse(Literal(None)), topLevelFolder.getOrElse("content"), false)
IntegrityCheckRule(pathSubstitutions, enforceCaseSensitivePathChecks, rp.getOrElse(Literal(None)), topLevelFolder.getOrElse("content"), false, skipFileChecks)
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
val pathSubstitutions = List[(String,String)]()
val enforceCaseSensitivePathChecks = false
val trace = false
val skipFileChecks = false

def validateR(csv: io.Reader, schema: Schema): this.type#MetaDataValidation[Any] = validate(csv, schema, None)
}
Expand All @@ -39,6 +40,7 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
val pathSubstitutions = List[(String,String)]()
val enforceCaseSensitivePathChecks = true
val trace = false
val skipFileChecks = false
}

import v.{validate, validateR, parseSchema}
Expand Down Expand Up @@ -406,7 +408,7 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
}

"Validate fail fast" should {
val app = new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = List[(String,String)](); val enforceCaseSensitivePathChecks = false; val trace = false }
val app = new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = List[(String,String)](); val enforceCaseSensitivePathChecks = false; val trace = false; val skipFileChecks = false }

"only report first error for invalid @TotalColumns" in {
app.validate(TextFile(Paths.get(base).resolve("totalColumnsFailMetaData.csv")), parse(base + "/totalColumnsSchema.csvs"), None) must beLike {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@ class MetaDataValidatorBigFileSpec extends Specification with TestResources {
"Big file" should {

"succeed with no stack overflow for all errors" in {
val v = new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = List[SubstitutePath](); val enforceCaseSensitivePathChecks = false; val trace = false }
val v = new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = List[SubstitutePath](); val enforceCaseSensitivePathChecks = false; val trace = false; val skipFileChecks = false }
def parse(filePath: String): Schema = v.parseSchema(TextFile(Paths.get(filePath))) fold (f => throw new IllegalArgumentException(f.toString()), s => s)

v.validate(TextFile(Paths.get(base).resolve("bigMetaData.csv")), parse(base + "/bigSchema.csvs"), None) must beLike { case Validated.Valid(_) => ok }
}

"succeed with no stack overflow for fail fast" in {
val v = new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = List[SubstitutePath](); val enforceCaseSensitivePathChecks = false; val trace = false }
val v = new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = List[SubstitutePath](); val enforceCaseSensitivePathChecks = false; val trace = false; val skipFileChecks = false }
def parse(filePath: String): Schema = v.parseSchema(TextFile(Paths.get(filePath))) fold (f => throw new IllegalArgumentException(f.toString()), s => s)

v.validate(TextFile(Paths.get(base).resolve("bigMetaData.csv")), parse(base + "/bigSchema.csvs"), None) must beLike { case Validated.Valid(_) => ok }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class MetaDataValidatorBusinessAcceptanceSpec extends Specification with TestRes

val base = resourcePath("acceptance/dp")

val v: CsvValidator = new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = List[(String,String)](); val enforceCaseSensitivePathChecks = false; val trace = false }
val v: CsvValidator = new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = List[(String,String)](); val enforceCaseSensitivePathChecks = false; val trace = false; val skipFileChecks = false }
import v.{validate, parseSchema}

def parse(filePath: String): Schema = parseSchema(TextFile(Paths.get(filePath))) fold (f => throw new IllegalArgumentException(f.toString()), s => s)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class MetaDataValidatorChecksumSpec extends Specification with TestResources {
val pathSubstitutions = List[(String,String)]()
val enforceCaseSensitivePathChecks = false
val trace = false
val skipFileChecks = false
override def parse(reader: Reader): ParseResult[Schema] = super.parse(reader) match {
case s @ Success(schema: Schema, _) =>
s
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class MetaDataValidatorFileCountSpec extends Specification with TestResources {
val pathSubstitutions = List[(String,String)]()
val enforceCaseSensitivePathChecks = false
val trace = false
val skipFileChecks = false
override def parse(reader: Reader): ParseResult[Schema] = super.parse(reader) match {
case s@Success(schema: Schema, _) => s
case NoSuccess(message, next) => throw new RuntimeException(message)
Expand Down
Loading

0 comments on commit a2c638e

Please sign in to comment.