Skip to content

Commit

Permalink
Merge pull request #515 from digital-preservation/DR2-2002-validate-m…
Browse files Browse the repository at this point in the history
…etadata-only

DR2-2002 Validate metadata only
  • Loading branch information
MancunianSam authored Dec 3, 2024
2 parents 57fbb70 + c4faf4c commit 3f3faf4
Show file tree
Hide file tree
Showing 25 changed files with 157 additions and 87 deletions.
2 changes: 2 additions & 0 deletions csv-validator-cmd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ Usage: validate [options] <csv-path> <csv-schema-path>
The path to the CSV Schema file to use for validation
--disable-utf8-validation
Disable UTF-8 validation for CSV files
--skip-file-checks
Skip integrity, checksum and file existence checks
--show-progress
Show progress
--help
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ object CsvValidatorCmdApp extends App {
csvSchemaPath: Path = Paths.get("."),
csvSchemaEncoding: Charset = CsvValidator.DEFAULT_ENCODING,
disableUtf8Validation:Boolean = false,
progressCallback: Option[ProgressCallback] = None)
progressCallback: Option[ProgressCallback] = None,
skipFileChecks: Boolean = false)

def run(args: Array[String]): ExitStatus = {

Expand All @@ -69,6 +70,7 @@ object CsvValidatorCmdApp extends App {
opt[Charset]('x', "csv-encoding").optional().action { (x,c) => c.copy(csvEncoding = x) } text("Defines the charset encoding used in the CSV file")
opt[Charset]('y', "csv-schema-encoding").optional().action { (x,c) => c.copy(csvSchemaEncoding = x) }.text("Defines the charset encoding used in the CSV Schema file")
opt[Unit]("disable-utf8-validation").optional().action {(_, c) => c.copy(disableUtf8Validation = true)}.text("Disable UTF-8 validation for CSV files.")
// --skip-file-checks: record the flag on the config so it is forwarded to the validator as config.skipFileChecks.
opt[Unit]("skip-file-checks").optional().action {(_, c) => c.copy(skipFileChecks = true)}.text("Skip integrity, checksum and file existence checks")
opt[Unit]("show-progress").optional().action {(_, c) => c.copy(progressCallback = Some(commandLineProgressCallback()))}.text("Show progress")
arg[Path]("<csv-path>").validate { x => if(Files.exists(x) && Files.isReadable(x)) success else failure(s"Cannot access CSV file: ${x.toString}") }.action { (x,c) => c.copy(csvPath = x) }.text("The path to the CSV file to validate")
arg[Path]("<csv-schema-path>").validate { x => if(Files.exists(x) && Files.isReadable(x)) success else failure(s"Cannot access CSV Schema file: ${x.toString}") }.action { (x,c) => c.copy(csvSchemaPath = x) }.text("The path to the CSV Schema file to use for validation")
Expand All @@ -84,7 +86,8 @@ object CsvValidatorCmdApp extends App {
config.substitutePaths,
config.caseSensitivePaths,
config.traceParser,
config.progressCallback
config.progressCallback,
config.skipFileChecks
)
} getOrElse {
//arguments are bad, usage message will have been displayed
Expand Down Expand Up @@ -141,7 +144,7 @@ object CsvValidatorCmdApp extends App {
}

def getColumnFromCsv(csvFile: TextFile, csvSchemaFile: TextFile, columnName: String): List[String] = Try {
val validator = createValidator(true, Nil, false, false)
val validator = createValidator(true, Nil, false, false, false)
val csv = validator.loadCsvFile(csvFile, csvSchemaFile)
csv.headOption.map(_.indexOf("identifier")).map { identifierIdx =>
csv.tail.map(arr => arr(identifierIdx))
Expand All @@ -157,9 +160,10 @@ object CsvValidatorCmdApp extends App {
enforceCaseSensitivePathChecks: Boolean,
trace: Boolean,
progress: Option[ProgressCallback],
skipFileChecks: Boolean,
onRow: ValidatedNel[FailMessage, Any] => Unit = rowCallback
): ExitStatus = {
val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace)
val validator = createValidator(failFast, pathSubstitutionsList, enforceCaseSensitivePathChecks, trace, skipFileChecks)
validator.parseSchema(schemaFile) match {
case Validated.Invalid(errors) => (prettyPrint(errors), SystemExitCodes.InvalidSchema)
case Validated.Valid(schema) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ object CsvValidator {
type PathTo = String
type SubstitutePath = (PathFrom, PathTo)

def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean) = {
def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean, skipFileChecksSwitch: Boolean) = {
if(failFast) {
new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch }
new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch}
} else {
new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch }
new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; val skipFileChecks = skipFileChecksSwitch }
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ with TraceableParsers {
*/
val enforceCaseSensitivePathChecks: Boolean


/**
 * Passed by the schema parsers to the file-based rules they build
 * (fileExists, checksum and integrityCheck). When true, those rules
 * report success without performing their file checks.
 */
val skipFileChecks: Boolean

lazy val versionHeader: PackratParser[String] = "VersionDecl" ::= ("version" ~> versionLiteral )

Expand Down Expand Up @@ -146,13 +146,15 @@ with TraceableParsers {
val ecspc = enforceCaseSensitivePathChecks
val ps = pathSubstitutions
val t = trace
val sfc = skipFileChecks

SchemaValidator.versionValid(version).map(Failure(_, next)).getOrElse {
version match {
case "1.2" =>
val parser1_2 = new SchemaParser1_2 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
override val pathSubstitutions: List[(String, String)] = ps
override val trace: Boolean = t
override val skipFileChecks: Boolean = sfc
}

parser1_2.parseVersionAware(reader) match {
Expand All @@ -165,6 +167,7 @@ with TraceableParsers {
val parser1_1 = new SchemaParser1_1 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
override val pathSubstitutions: List[(String, String)] = ps
override val trace: Boolean = t
override val skipFileChecks: Boolean = sfc
}

parser1_1.parseVersionAware(reader) match {
Expand All @@ -177,6 +180,7 @@ with TraceableParsers {
val parser1_0 = new SchemaParser1_0 {override val enforceCaseSensitivePathChecks: Boolean = ecspc
override val pathSubstitutions: List[(String, String)] = ps
override val trace: Boolean = t
override val skipFileChecks: Boolean = sfc
}

parser1_0.parseVersionAware(reader) match {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,17 +106,21 @@ case class RegExpRule(regex: String) extends Rule("regex") {
}

//TODO note the use of `Seq(rootPath): _*` when extending Rule, this is to workaround this bug https://issues.scala-lang.org/browse/SI-7436. This pattern is repeated below!
case class FileExistsRule(pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean, rootPath: ArgProvider = Literal(None)) extends Rule("fileExists", Seq(rootPath): _*) {
case class FileExistsRule(pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean, rootPath: ArgProvider = Literal(None), skipFileChecks: Boolean = false) extends Rule("fileExists", Seq(rootPath): _*) {

override def valid(filePath: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None) = {
if(skipFileChecks) {
true
} else {
val ruleValue = rootPath.referenceValue(columnIndex, row, schema)

val ruleValue = rootPath.referenceValue(columnIndex, row, schema)

val fileExists = ruleValue match {
case Some(rp) => new FileSystem(rp, filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks)
case None => new FileSystem(filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks)
ruleValue match {
case Some(rp) => new FileSystem(rp, filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks)
case None => new FileSystem(filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks)
}
}
fileExists


}

override def toError = s"""$ruleName""" + (if (rootPath == Literal(None)) "" else s"""(${rootPath.toError})""")
Expand Down Expand Up @@ -317,18 +321,21 @@ case class UniqueMultiRule(columns: List[ColumnReference]) extends Rule("unique(
}
}

case class ChecksumRule(rootPath: ArgProvider, file: ArgProvider, algorithm: String, pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean = false) extends Rule("checksum", Seq(rootPath, file): _*) with FileWildcardSearch[String] {
case class ChecksumRule(rootPath: ArgProvider, file: ArgProvider, algorithm: String, pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean = false, skipFileChecks: Boolean = false) extends Rule("checksum", Seq(rootPath, file): _*) with FileWildcardSearch[String] {

def this(file: ArgProvider, algorithm: String, pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean) = this(Literal(None), file, algorithm, pathSubstitutions, enforceCaseSensitivePathChecks)
def this(file: ArgProvider, algorithm: String, enforceCaseSensitivePathChecks: Boolean) = this(Literal(None), file, algorithm, List.empty[(String,String)], enforceCaseSensitivePathChecks)
def this(file: ArgProvider, algorithm: String, pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean) = this(Literal(None), file, algorithm, pathSubstitutions, enforceCaseSensitivePathChecks, false)
// Convenience constructor with no root path and no path substitutions; forwards the caller's skipFileChecks flag rather than discarding it.
def this(file: ArgProvider, algorithm: String, enforceCaseSensitivePathChecks: Boolean, skipFileChecks: Boolean) = this(Literal(None), file, algorithm, List.empty[(String,String)], enforceCaseSensitivePathChecks, skipFileChecks)

override def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): RuleValidation[Any] = {
val columnDefinition = schema.columnDefinitions(columnIndex)

search(filename(columnIndex, row, schema)) match {
case Validated.Valid(hexValue: String) if hexValue == cellValue(columnIndex,row,schema) => true.validNel[String]
case Validated.Valid(hexValue: String) => s"""$toError file "${TypedPath(filename(columnIndex, row, schema)._1 + filename(columnIndex, row, schema)._2).toPlatform}" checksum match fails for line: ${row.lineNumber}, column: ${columnDefinition.id}, ${toValueError(row,columnIndex)}. Computed checksum value:"${hexValue}"""".invalidNel[Any]
case Validated.Invalid(errMsg) => s"$toError ${errMsg.head} for line: ${row.lineNumber}, column: ${columnDefinition.id}, ${toValueError(row,columnIndex)}".invalidNel[Any]
if(skipFileChecks) {
// File checks are skipped: report success with the same payload (`true`) as the matching-checksum branch below.
true.validNel[String]
} else {
search(filename(columnIndex, row, schema)) match {
case Validated.Valid(hexValue: String) if hexValue == cellValue(columnIndex, row, schema) => true.validNel[String]
case Validated.Valid(hexValue: String) => s"""$toError file "${TypedPath(filename(columnIndex, row, schema)._1 + filename(columnIndex, row, schema)._2).toPlatform}" checksum match fails for line: ${row.lineNumber}, column: ${columnDefinition.id}, ${toValueError(row, columnIndex)}. Computed checksum value:"${hexValue}"""".invalidNel[Any]
case Validated.Invalid(errMsg) => s"$toError ${errMsg.head} for line: ${row.lineNumber}, column: ${columnDefinition.id}, ${toValueError(row, columnIndex)}".invalidNel[Any]
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -481,9 +481,9 @@ trait SchemaParser extends BaseSchemaParser {
*/
lazy val fileExistsExpr: PackratParser[FileExistsRule] = "FileExistsExpr" ::= ("fileExists" ~> opt("(" ~> stringProvider <~ ")")).withFailureMessage("Invalid fileExists rule") ^^ {
case None =>
FileExistsRule(pathSubstitutions, enforceCaseSensitivePathChecks)
FileExistsRule(pathSubstitutions, enforceCaseSensitivePathChecks, skipFileChecks = skipFileChecks)
case Some(s) =>
FileExistsRule(pathSubstitutions, enforceCaseSensitivePathChecks, s)
FileExistsRule(pathSubstitutions, enforceCaseSensitivePathChecks, s, skipFileChecks = skipFileChecks)
}


Expand All @@ -492,7 +492,7 @@ trait SchemaParser extends BaseSchemaParser {
*/
lazy val checksumExpr = "ChecksumExpr" ::= ("checksum(" ~> fileExpr <~ ",") ~ stringLiteral <~ ")" ^^ {
case files ~ algorithm =>
ChecksumRule(files._1.getOrElse(Literal(None)), files._2, algorithm, pathSubstitutions, enforceCaseSensitivePathChecks)
ChecksumRule(files._1.getOrElse(Literal(None)), files._2, algorithm, pathSubstitutions, enforceCaseSensitivePathChecks, skipFileChecks)
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ case class SwitchRule(elseRules: Option[List[Rule]], cases:(Rule, List[Rule])*)



case class IntegrityCheckRule(pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean, rootPath: ArgProvider = Literal(None), topLevelFolder: String = "content", includeFolder: Boolean = false) extends Rule("integrityCheck", Seq(rootPath): _*) {
case class IntegrityCheckRule(pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean, rootPath: ArgProvider = Literal(None), topLevelFolder: String = "content", includeFolder: Boolean = false, skipFileChecks: Boolean = false) extends Rule("integrityCheck", Seq(rootPath): _*) {

//TODO introduce state, not very functional
var filesMap = Map[String, Set[Path]]()
Expand All @@ -83,8 +83,10 @@ case class IntegrityCheckRule(pathSubstitutions: List[(String,String)], enforceC
}

override def valid(filePath: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean]): Boolean = {

if (!filePath.isEmpty){
if (skipFileChecks) {
true
}
else if (!filePath.isEmpty){

val ruleValue = rootPath.referenceValue(columnIndex, row, schema)
val filePathS = if (FILE_SEPARATOR == WINDOWS_FILE_SEPARATOR)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,9 @@ trait SchemaParser extends SchemaParser1_0 {

lazy val integrityCheckExpr: PackratParser[IntegrityCheckRule] = "IntegrityCheckExpr" ::= ("integrityCheck" ~> "(" ~> opt(stringProvider <~ ",") ~ opt(stringLiteral <~ ",") ~ stringLiteral <~ ")" ).withFailureMessage("Invalid integrityCheck rule") ^^ {
case rp ~ topLevelFolder ~ includeFolder if (includeFolder == "includeFolder") =>
IntegrityCheckRule(pathSubstitutions, enforceCaseSensitivePathChecks, rp.getOrElse(Literal(None)), topLevelFolder.getOrElse("content"), true)
IntegrityCheckRule(pathSubstitutions, enforceCaseSensitivePathChecks, rp.getOrElse(Literal(None)), topLevelFolder.getOrElse("content"), true, skipFileChecks)
case rp ~ topLevelFolder ~ includeFolder if (includeFolder == "excludeFolder") =>
IntegrityCheckRule(pathSubstitutions, enforceCaseSensitivePathChecks, rp.getOrElse(Literal(None)), topLevelFolder.getOrElse("content"), false)
IntegrityCheckRule(pathSubstitutions, enforceCaseSensitivePathChecks, rp.getOrElse(Literal(None)), topLevelFolder.getOrElse("content"), false, skipFileChecks)
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
val pathSubstitutions = List[(String,String)]()
val enforceCaseSensitivePathChecks = false
val trace = false
val skipFileChecks = false

def validateR(csv: io.Reader, schema: Schema): this.type#MetaDataValidation[Any] = validate(csv, schema, None)
}
Expand All @@ -39,6 +40,7 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
val pathSubstitutions = List[(String,String)]()
val enforceCaseSensitivePathChecks = true
val trace = false
val skipFileChecks = false
}

import v.{validate, validateR, parseSchema}
Expand Down Expand Up @@ -406,7 +408,7 @@ class MetaDataValidatorAcceptanceSpec extends Specification with TestResources {
}

"Validate fail fast" should {
val app = new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = List[(String,String)](); val enforceCaseSensitivePathChecks = false; val trace = false }
val app = new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = List[(String,String)](); val enforceCaseSensitivePathChecks = false; val trace = false; val skipFileChecks = false }

"only report first error for invalid @TotalColumns" in {
app.validate(TextFile(Paths.get(base).resolve("totalColumnsFailMetaData.csv")), parse(base + "/totalColumnsSchema.csvs"), None) must beLike {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@ class MetaDataValidatorBigFileSpec extends Specification with TestResources {
"Big file" should {

"succeed with no stack overflow for all errors" in {
val v = new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = List[SubstitutePath](); val enforceCaseSensitivePathChecks = false; val trace = false }
val v = new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = List[SubstitutePath](); val enforceCaseSensitivePathChecks = false; val trace = false; val skipFileChecks = false }
def parse(filePath: String): Schema = v.parseSchema(TextFile(Paths.get(filePath))) fold (f => throw new IllegalArgumentException(f.toString()), s => s)

v.validate(TextFile(Paths.get(base).resolve("bigMetaData.csv")), parse(base + "/bigSchema.csvs"), None) must beLike { case Validated.Valid(_) => ok }
}

"succeed with no stack overflow for fail fast" in {
val v = new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = List[SubstitutePath](); val enforceCaseSensitivePathChecks = false; val trace = false }
val v = new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = List[SubstitutePath](); val enforceCaseSensitivePathChecks = false; val trace = false; val skipFileChecks = false }
def parse(filePath: String): Schema = v.parseSchema(TextFile(Paths.get(filePath))) fold (f => throw new IllegalArgumentException(f.toString()), s => s)

v.validate(TextFile(Paths.get(base).resolve("bigMetaData.csv")), parse(base + "/bigSchema.csvs"), None) must beLike { case Validated.Valid(_) => ok }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class MetaDataValidatorBusinessAcceptanceSpec extends Specification with TestRes

val base = resourcePath("acceptance/dp")

val v: CsvValidator = new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = List[(String,String)](); val enforceCaseSensitivePathChecks = false; val trace = false }
val v: CsvValidator = new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = List[(String,String)](); val enforceCaseSensitivePathChecks = false; val trace = false; val skipFileChecks = false }
import v.{validate, parseSchema}

def parse(filePath: String): Schema = parseSchema(TextFile(Paths.get(filePath))) fold (f => throw new IllegalArgumentException(f.toString()), s => s)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class MetaDataValidatorChecksumSpec extends Specification with TestResources {
val pathSubstitutions = List[(String,String)]()
val enforceCaseSensitivePathChecks = false
val trace = false
val skipFileChecks = false
override def parse(reader: Reader): ParseResult[Schema] = super.parse(reader) match {
case s @ Success(schema: Schema, _) =>
s
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class MetaDataValidatorFileCountSpec extends Specification with TestResources {
val pathSubstitutions = List[(String,String)]()
val enforceCaseSensitivePathChecks = false
val trace = false
val skipFileChecks = false
override def parse(reader: Reader): ParseResult[Schema] = super.parse(reader) match {
case s@Success(schema: Schema, _) => s
case NoSuccess(message, next) => throw new RuntimeException(message)
Expand Down
Loading

0 comments on commit 3f3faf4

Please sign in to comment.