Skip to content

Commit

Permalink
Add Word2Vec Benchmark Harness (dotnet#17350)
Browse files Browse the repository at this point in the history
* Add Word2Vec Benchmark Harness

This change adds an additional scenario benchmark, the Word2Vec
benchmark. The harness pulls down Word2Vec.Net from eabdullin, applies a
patch of changes that we made so that it works with netcoreapp2.1, harnesses
the word training and search, and then runs the benchmark. It also updates
the timeout for running benchmarks, since the training scenario on a
100M file takes about 7 minutes locally.
  • Loading branch information
michellemcdaniel authored Apr 5, 2018
1 parent b8b30cf commit 34561ef
Show file tree
Hide file tree
Showing 9 changed files with 922 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,5 @@ tests/src/JIT/Performance/CodeQuality/BenchmarksGame/reverse-complement/revcomp-
tests/src/JIT/Performance/CodeQuality/BenchmarksGame/reverse-complement/revcomp-input25000.txt text eol=lf
tests/src/JIT/Performance/CodeQuality/BenchmarksGame/k-nucleotide/knucleotide-input.txt text eol=lf
tests/src/JIT/Performance/CodeQuality/BenchmarksGame/k-nucleotide/knucleotide-input-big.txt text eol=lf
tests/src/performance/Scenario/JitBench/Resources/word2vecnet.patch text eol=lf

4 changes: 2 additions & 2 deletions tests/scripts/run-xunit-perf.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,9 @@ def run_benchmark(benchname, benchdir, env, sandboxDir, benchmarkOutputDir, test
myEnv = dict(env)
benchnameWithExt = benchname + '.' + testFileExt
fullPath = os.path.join(benchdir, benchnameWithExt)
shutil.copy2(fullPath, sandboxDir)

files = glob.iglob(os.path.join(benchdir, "*.txt"))
# Copy all files in the benchmark directory to the sandbox
files = glob.iglob(os.path.join(benchdir, "*.*"))
for filename in files:
if os.path.isfile(filename):
shutil.copy2(filename, sandboxDir)
Expand Down
256 changes: 256 additions & 0 deletions tests/src/performance/Scenario/JitBench/Benchmarks/MLBenchmark.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Reflection;
using Microsoft.Xunit.Performance.Api;

namespace JitBench
{
/// <summary>
/// Word2Vec flavor of <see cref="MLBenchmark"/>: registers the benchmark under the name "Word2Vec"
/// and supplies the executable name and source directory of the Word2VecScenario project.
/// </summary>
class Word2VecBenchmark : MLBenchmark
{
    public Word2VecBenchmark()
        : base("Word2Vec")
    {
    }

    /// <summary>File name of the scenario assembly.</summary>
    protected override string ExecutableName
    {
        get { return "Word2VecScenario.dll"; }
    }

    /// <summary>
    /// Returns the "Word2VecScenario" folder located under the repository root derived from
    /// <paramref name="outputDir"/>.
    /// </summary>
    protected override string GetWord2VecNetSrcDirectory(string outputDir)
    {
        string repoRoot = GetWord2VecNetRepoRootDir(outputDir);
        return Path.Combine(repoRoot, "Word2VecScenario");
    }
}

abstract class MLBenchmark : Benchmark
{
// Exit codes that ExecuteGitCommand treats as success; 0 is the only accepted value.
private static readonly HashSet<int> DefaultExitCodes = new HashSet<int>(new[] { 0 });

/// <summary>
/// Creates the benchmark and uses <see cref="ExecutableName"/> as the executable path.
/// </summary>
/// <param name="name">Benchmark name, forwarded unchanged to the base class.</param>
public MLBenchmark(string name)
    : base(name)
{
    // ExecutableName is abstract, so this reads the derived class's override during construction.
    string executable = ExecutableName;
    ExePath = executable;
}

/// <summary>
/// File name of the scenario executable. The constructor copies this value into ExePath,
/// so implementations are read while the base class is still being constructed.
/// </summary>
protected abstract string ExecutableName { get; }

/// <summary>
/// Prepares the benchmark and points <c>WorkingDirPath</c> at the publish directory.
/// </summary>
/// <remarks>
/// Unless <paramref name="useExistingSetup"/> is set, the Word2Vec.Net repository is cloned and patched,
/// the scenario is published, and the text corpus is downloaded, in that order. The log output of those
/// steps is grouped under a "Setup &lt;Name&gt;" section.
/// </remarks>
/// <param name="dotNetInstall">The .NET installation used to publish and to resolve the target framework.</param>
/// <param name="outputDir">Root directory under which the repository is placed.</param>
/// <param name="useExistingSetup">When true, all setup work is skipped and only the working directory is resolved.</param>
/// <param name="output">Destination for log output.</param>
public override async Task Setup(DotNetInstallation dotNetInstall, string outputDir, bool useExistingSetup, ITestOutputHelper output)
{
    bool needsFreshSetup = !useExistingSetup;
    if (needsFreshSetup)
    {
        string sectionTitle = "Setup " + Name;
        using (var section = new IndentedTestOutputHelper(sectionTitle, output))
        {
            await CloneWord2VecNetRepo(outputDir, section);
            await Publish(dotNetInstall, outputDir, section);
            await DownloadAndExtractTextCorpus(dotNetInstall, outputDir, section);
        }
    }

    // The publish directory lookup yields null when no publish output exists on disk.
    string frameworkMoniker = DotNetSetup.GetTargetFrameworkMonikerForFrameworkVersion(dotNetInstall.FrameworkVersion);
    string publishDirectory = GetWord2VecNetPublishDirectory(dotNetInstall, outputDir, frameworkMoniker);
    WorkingDirPath = publishDirectory;
}

/// <summary>
/// Produces a fresh, patched checkout of Word2Vec.Net under the repository root derived from
/// <paramref name="outputDir"/>.
/// </summary>
/// <remarks>
/// Any existing checkout is deleted first. The repository is then cloned, the pinned commit is checked
/// out, and the patch file located next to the entry assembly is applied with <c>git apply</c>.
/// </remarks>
/// <param name="outputDir">Root directory under which the repository is placed.</param>
/// <param name="output">Destination for log output.</param>
/// <exception cref="Exception">Thrown by ExecuteGitCommand when a git command exits with a non-zero code.</exception>
async Task CloneWord2VecNetRepo(string outputDir, ITestOutputHelper output)
{
    // If the repo already exists, we delete it and clone it again.
    string word2VecNetRepoRootDir = GetWord2VecNetRepoRootDir(outputDir);
    FileTasks.DeleteDirectory(word2VecNetRepoRootDir, output);

    string entryAssemblyDir = Path.GetDirectoryName(Assembly.GetEntryAssembly().Location);
    string word2VecPatchFullPath = Path.Combine(entryAssemblyDir, Word2VecNetPatch);

    // File-system paths are quoted so that a directory containing spaces stays a single git argument.
    // NOTE(review): assumes ProcessRunner forwards the argument string to the process unchanged - confirm.
    await ExecuteGitCommand($"clone {Word2VecNetRepoUrl} \"{word2VecNetRepoRootDir}\"", output);
    await ExecuteGitCommand($"checkout {Word2VecNetCommitSha1Id}", output, workingDirectory: word2VecNetRepoRootDir);
    await ExecuteGitCommand($"apply \"{word2VecPatchFullPath}\"", output, workingDirectory: word2VecNetRepoRootDir);
}

/// <summary>
/// Runs <c>git</c> with the given arguments and fails if the exit code is not an accepted one.
/// </summary>
/// <param name="arguments">Command-line arguments passed to git.</param>
/// <param name="output">Destination for the process log.</param>
/// <param name="workingDirectory">Working directory handed to the process runner; null by default.</param>
/// <exception cref="Exception">The exit code is not contained in DefaultExitCodes.</exception>
async Task ExecuteGitCommand(string arguments, ITestOutputHelper output, string workingDirectory = null)
{
    var git = new ProcessRunner("git", arguments)
        .WithLog(output)
        .WithWorkingDirectory(workingDirectory);

    int exitCode = await git.Run();
    if (DefaultExitCodes.Contains(exitCode))
    {
        return;
    }

    throw new Exception($"git {arguments} has failed, the exit code was {exitCode}");
}

/// <summary>
/// Downloads the text8 corpus archive, unzips it into a temporary directory next to the repository
/// root, and moves the extracted "text8" file into the publish directory as "Corpus.txt".
/// </summary>
/// <remarks>
/// The corpus is downloaded on every call; there is no check for an existing copy.
/// </remarks>
/// <param name="dotNetInstall">The .NET installation used to resolve the publish directory.</param>
/// <param name="outputDir">Root directory under which the repository is placed.</param>
/// <param name="output">Destination for log output.</param>
/// <exception cref="DirectoryNotFoundException">No publish directory exists to receive the corpus.</exception>
async Task DownloadAndExtractTextCorpus(DotNetInstallation dotNetInstall, string outputDir, ITestOutputHelper output)
{
    string word2VecNetRepoRootDir = GetWord2VecNetRepoRootDir(outputDir);
    string tfm = DotNetSetup.GetTargetFrameworkMonikerForFrameworkVersion(dotNetInstall.FrameworkVersion);
    string word2VecNetPublishDir = GetWord2VecNetPublishDirectory(dotNetInstall, outputDir, tfm);

    // The lookup returns null when nothing has been published. Fail before downloading instead of
    // letting Path.Combine reject the null path after the download has already completed.
    if (word2VecNetPublishDir == null)
    {
        throw new DirectoryNotFoundException("Could not find 'publish' directory");
    }

    string tempDir = word2VecNetRepoRootDir + "_temp";

    // Download the corpus of text. This is a zip file that contains a text file of 100M of text from Wikipedia
    var url = "http://mattmahoney.net/dc/text8.zip";
    await FileTasks.DownloadAndUnzip(url, tempDir, output);

    FileTasks.MoveFile(Path.Combine(tempDir, "text8"),
                       Path.Combine(word2VecNetPublishDir, "Corpus.txt"), output);
}

/// <summary>
/// Publishes the scenario project in Release configuration for the target framework of
/// <paramref name="dotNetInstall"/> and returns the resulting publish directory.
/// </summary>
/// <remarks>
/// An existing publish directory is deleted before publishing so that stale output cannot be mistaken
/// for a successful publish.
/// </remarks>
/// <param name="dotNetInstall">The .NET installation whose dotnet executable performs the publish.</param>
/// <param name="outputDir">Root directory under which the repository is placed.</param>
/// <param name="output">Destination for log output.</param>
/// <returns>The path of the publish directory.</returns>
/// <exception cref="InvalidOperationException">dotnet publish exited with a non-zero code.</exception>
/// <exception cref="DirectoryNotFoundException">No publish directory exists after publishing.</exception>
private async Task<string> Publish(DotNetInstallation dotNetInstall, string outputDir, ITestOutputHelper output)
{
    string tfm = DotNetSetup.GetTargetFrameworkMonikerForFrameworkVersion(dotNetInstall.FrameworkVersion);
    string publishDir = GetWord2VecNetPublishDirectory(dotNetInstall, outputDir, tfm);
    if (publishDir != null)
    {
        FileTasks.DeleteDirectory(publishDir, output);
    }

    string dotNetExePath = dotNetInstall.DotNetExe;
    string publishArguments = $"publish -c Release -f {tfm}";
    int exitCode = await new ProcessRunner(dotNetExePath, publishArguments)
        .WithWorkingDirectory(GetWord2VecNetSrcDirectory(outputDir))
        .WithEnvironmentVariable("DOTNET_MULTILEVEL_LOOKUP", "0")
        .WithEnvironmentVariable("WORD2VEC_FRAMEWORK_VERSION", dotNetInstall.FrameworkVersion)
        .WithEnvironmentVariable("UseSharedCompilation", "false")
        .WithLog(output)
        .Run();

    // Check the exit code the same way the git commands do; a failed publish can still leave a
    // partially populated publish directory behind.
    if (!DefaultExitCodes.Contains(exitCode))
    {
        throw new InvalidOperationException($"dotnet {publishArguments} has failed, the exit code was {exitCode}");
    }

    publishDir = GetWord2VecNetPublishDirectory(dotNetInstall, outputDir, tfm);
    if (publishDir == null)
    {
        throw new DirectoryNotFoundException("Could not find 'publish' directory");
    }

    return publishDir;
}

/// <summary>
/// Lists the metrics shown by default: training, first search, and median search, in that order.
/// </summary>
public override Metric[] GetDefaultDisplayMetrics() =>
    new Metric[] { TrainingMetric, FirstSearchMetric, MedianSearchMetric };

/// <summary>
/// Records the base-class metrics for an iteration and then adds the timings parsed from the
/// scenario's standard output.
/// </summary>
/// <returns>The iteration result produced by the base class, with the console metrics added.</returns>
protected override IterationResult RecordIterationMetrics(ScenarioExecutionResult scenarioIteration, string stdout, string stderr, ITestOutputHelper output)
{
    IterationResult iteration = base.RecordIterationMetrics(scenarioIteration, stdout, stderr, output);

    // stderr is only consumed by the base implementation; the timings come from stdout.
    AddConsoleMetrics(iteration, stdout, output);

    return iteration;
}

/// <summary>
/// Extracts the training, first-search and steady-state median timings from the scenario's console
/// output and adds them to <paramref name="result"/>.
/// </summary>
/// <remarks>
/// Each line of <paramref name="stdout"/> is matched against three patterns:
/// "Training took &lt;n&gt;ms", "Search took &lt;n&gt;ms" and "Steadystate median search time: &lt;n&gt;ms".
/// If a line kind appears more than once, the last occurrence wins.
/// </remarks>
/// <param name="result">Iteration result that receives the three measurements.</param>
/// <param name="stdout">Captured standard output of the scenario process.</param>
/// <param name="output">Destination for log output.</param>
/// <exception cref="FormatException">One of the three timings is missing from the output.</exception>
void AddConsoleMetrics(IterationResult result, string stdout, ITestOutputHelper output)
{
    output.WriteLine("Processing iteration results.");

    double? trainingTime = null;
    double? firstSearchTime = null;
    double? steadyStateMedianTime = null;

    // The patterns only accept '.' as the decimal separator, so the captured text must be parsed
    // with the invariant culture; the current culture may treat '.' as a group separator.
    IFormatProvider invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    using (var reader = new StringReader(stdout))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            Match match = Regex.Match(line, @"^Training took \s*(\d+)ms$");
            if (match.Success)
            {
                trainingTime = Convert.ToDouble(match.Groups[1].Value, invariantCulture);
                continue;
            }

            match = Regex.Match(line, @"^Search took \s*(\d+)ms$");
            if (match.Success)
            {
                firstSearchTime = Convert.ToDouble(match.Groups[1].Value, invariantCulture);
                continue;
            }

            // The fractional part is optional so that a whole-number median is still recognized.
            match = Regex.Match(line, @"^Steadystate median search time: \s*(\d+(?:\.\d+)?)ms$");
            if (match.Success)
            {
                steadyStateMedianTime = Convert.ToDouble(match.Groups[1].Value, invariantCulture);
                continue;
            }
        }
    }

    if (!trainingTime.HasValue)
    {
        throw new FormatException("Training time was not found.");
    }
    if (!firstSearchTime.HasValue)
    {
        throw new FormatException("First Search time was not found.");
    }
    if (!steadyStateMedianTime.HasValue)
    {
        throw new FormatException("Steady state median response time not found.");
    }

    result.Measurements.Add(TrainingMetric, trainingTime.Value);
    result.Measurements.Add(FirstSearchMetric, firstSearchTime.Value);
    result.Measurements.Add(MedianSearchMetric, steadyStateMedianTime.Value);

    output.WriteLine($"Training took {trainingTime.Value}ms");
    output.WriteLine($"Search took {firstSearchTime.Value}ms");
    output.WriteLine($"Median steady state search {steadyStateMedianTime.Value}ms");
}

/// <summary>
/// Decides whether a collected metric should be reported under a different metric and scenario name.
/// The training, first-search and median-search metrics are each reported as
/// <c>Metric.ElapsedTimeMilliseconds</c> under the scenario names "Training", "First Search" and
/// "Median Search"; every other metric is delegated to the base implementation.
/// </summary>
/// <param name="originalMetric">The metric as it was collected.</param>
/// <param name="newMetric">Receives the metric to report instead.</param>
/// <param name="newScenarioModelName">Receives the scenario name to report under.</param>
/// <returns>True for the three Word2Vec metrics; otherwise whatever the base implementation returns.</returns>
public override bool TryGetBenchviewCustomMetricReporting(Metric originalMetric, out Metric newMetric, out string newScenarioModelName)
{
    // Evaluated left to right, so the comparisons happen in the same order as an if/else-if chain.
    string scenarioName =
        originalMetric.Equals(TrainingMetric) ? "Training" :
        originalMetric.Equals(FirstSearchMetric) ? "First Search" :
        originalMetric.Equals(MedianSearchMetric) ? "Median Search" :
        null;

    if (scenarioName == null)
    {
        return base.TryGetBenchviewCustomMetricReporting(originalMetric, out newMetric, out newScenarioModelName);
    }

    newScenarioModelName = scenarioName;
    newMetric = Metric.ElapsedTimeMilliseconds;
    return true;
}

/// <summary>
/// Returns the directory named "W" directly under <paramref name="outputDir"/>; the repository is
/// cloned into this directory.
/// </summary>
protected static string GetWord2VecNetRepoRootDir(string outputDir) => Path.Combine(outputDir, "W");

/// <summary>
/// Returns the directory of the project to publish. It is used as the working directory for
/// <c>dotnet publish</c> and as the base of the "bin/.../publish" lookup.
/// </summary>
/// <param name="outputDir">Root directory under which the repository is placed.</param>
protected abstract string GetWord2VecNetSrcDirectory(string outputDir);

/// <summary>
/// Locates the publish directory of the scenario project on disk.
/// </summary>
/// <remarks>
/// Two layouts are probed in order: "bin/&lt;architecture&gt;/Release/&lt;tfm&gt;/publish" and then
/// "bin/Release/&lt;tfm&gt;/publish". The first one that exists is returned.
/// </remarks>
/// <returns>The existing publish directory, or null when neither layout exists.</returns>
string GetWord2VecNetPublishDirectory(DotNetInstallation dotNetInstall, string outputDir, string tfm)
{
    string binDir = Path.Combine(GetWord2VecNetSrcDirectory(outputDir), "bin");

    string architectureSpecific = Path.Combine(binDir, dotNetInstall.Architecture, "Release", tfm, "publish");
    if (Directory.Exists(architectureSpecific))
    {
        return architectureSpecific;
    }

    string architectureNeutral = Path.Combine(binDir, "Release", tfm, "publish");
    return Directory.Exists(architectureNeutral) ? architectureNeutral : null;
}

/// <summary>
/// Returns the value of the CORECLR_REPO environment variable, or the current directory when the
/// variable is not set.
/// </summary>
string GetCoreClrRoot()
{
    // The current directory is read up front, before the environment variable is consulted.
    string fallback = Directory.GetCurrentDirectory();
    return Environment.GetEnvironmentVariable("CORECLR_REPO") ?? fallback;
}

// Repository that CloneWord2VecNetRepo clones.
private const string Word2VecNetRepoUrl = "https://github.com/eabdullin/Word2Vec.Net";
// Commit checked out after cloning, so the patch is always applied to the same sources.
private const string Word2VecNetCommitSha1Id = "6012a2b5b886926918d51b1b56387d785115f448";
// File name of the patch; it is looked up in the directory of the entry assembly.
private const string Word2VecNetPatch = "word2vecnet.patch";
// NOTE(review): the next two constants are not referenced by any member of this class - confirm they can be removed.
private const string EnvironmentFileName = "Word2VecNetEnvironment.txt";
private const string StoreDirName = ".store";
// Metrics filled in by AddConsoleMetrics; the second constructor argument is presumably the unit - confirm.
private readonly Metric TrainingMetric = new Metric("Training", "ms");
private readonly Metric FirstSearchMetric = new Metric("First Search", "ms");
private readonly Metric MedianSearchMetric = new Metric("Median Search", "ms");
// NOTE(review): not referenced by any member of this class - confirm it can be removed.
private readonly Metric MeanSearchMetric = new Metric("Mean Search", "ms");
}
}

3 changes: 3 additions & 0 deletions tests/src/performance/Scenario/JitBench/JitBench.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,8 @@
Overwrite="true"
Encoding="Unicode"/>
</Target>
<!-- Copies the Word2Vec.Net patch into the build output directory ($(OutDir)). -->
<Target Name="AfterBuild">
<Copy SourceFiles="Resources\word2vecnet.patch" DestinationFolder="$(OutDir)" />
</Target>

</Project>
Loading

0 comments on commit 34561ef

Please sign in to comment.