Skip to content

Commit

Permalink
Fix regex that parses git diff (dotnet#305)
Browse files Browse the repository at this point in the history
* fix warnings related to parsing of git diff

* Fix the regexe to parse the git diff properly

* Revert "fix warnings related to parsing of git diff"

This reverts commit f0dd45d.

* Handle one last error condition

* Use -z option for git diff

* Trim the last char

* review feedback
  • Loading branch information
kunalspathak authored Dec 1, 2020
1 parent 4e9ef57 commit a9a332b
Showing 1 changed file with 47 additions and 82 deletions.
129 changes: 47 additions & 82 deletions src/jit-analyze/jit-analyze.cs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public Config(string[] args)
syntax.DefineOption("b|base", ref _basePath, "Base file or directory.");
syntax.DefineOption("d|diff", ref _diffPath, "Diff file or directory.");
syntax.DefineOption("r|recursive", ref _recursive, "Search directories recursively.");
syntax.DefineOption("ext|fileExtension", ref _fileExtension, "File extension to look for.");
syntax.DefineOption("ext|fileExtension", ref _fileExtension, "File extension to look for. By default, .dasm");
syntax.DefineOption("c|count", ref _count,
"Count of files and methods (at most) to output in the summary."
+ " (count) improvements and (count) regressions of each will be included."
Expand Down Expand Up @@ -948,127 +948,92 @@ public static void GenerateTSV(IEnumerable<FileDelta> compareList, string path)
// There are files with diffs. Build up a dictionary mapping base file name to net text diff count.
// Use "git diff" to do the analysis for us, then parse that output.
//
// "git diff --no-index --exit-code --numstat" output shows added/deleted lines:
// "git diff --diff-filter=M --no-index --exit-code --numstat -z" output shows added/deleted lines:
//
// <added> <removed> <base-path> => <diff-path>
// With -diff-filter=M, "git diff" will only compare modified files i.e. the ones that exists in both base and diff
// and ignore files that are present in one but not in other.
//
// For example:
// 6 6 "dasmset_8/diff/Vector3Interop_ro/Vector3Interop_ro.dasm" => "dasmset_8/base/Vector3Interop_ro/Vector3Interop_ro.dasm"
// With -z, the output uses field terminators of NULs (\0).
// added\tremoved\t\0base-path-1\0diff-path-1\0added\tremoved\t....
//
// Note, however, that it can also use a smaller output format, for example:
// For example:
// 6\t6\t\0d:\root\dasmset_8\base\Vector3Interop_ro.dasm\0d:\root\dasmset_8\diff\Vector3Interop_ro.dasm\0<next diff information>
//
// 6 6 dasmset_8/{diff => base}/Vector3Interop_ro/Vector3Interop_ro.dasm
public static Dictionary<string, int> DiffInText(string diffPath, string basePath)
{
// run get diff command to see if we have textual diffs.
// (use git diff since it's already a dependency and cross platform)
List<string> commandArgs = new List<string>();
commandArgs.Add("diff");
commandArgs.Add("--no-index");
// only diff files that are present in both base and diff.
commandArgs.Add("--diff-filter=M");
commandArgs.Add("--exit-code");
commandArgs.Add("--numstat");
commandArgs.Add(diffPath);
commandArgs.Add("-z");
commandArgs.Add(basePath);
commandArgs.Add(diffPath);

ProcessResult result = Utility.ExecuteProcess("git", commandArgs, true);
Dictionary<string, int> fileToTextDiffCount = new Dictionary<string, int>();
Dictionary<string, int> fileToTextDiffCount = null;

if (result.ExitCode != 0)
{
// There are files with diffs. Build up a dictionary mapping base file name to net text diff count.
//
// diff --numstat output shows added/deleted lines
// added removed base-path => diff-path
var rawLines = result.StdOut.Split(new[] { Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
string fullDiffPath = Path.GetFullPath(diffPath);
fileToTextDiffCount = new Dictionary<string, int>();

foreach (var line in rawLines)
var rawLines = result.StdOut.Split(new[] { "\0", Environment.NewLine }, StringSplitOptions.RemoveEmptyEntries);
if (rawLines.Length % 3 != 0)
{
string manipulatedLine = line;
string[] fields = null;

int numFields = 5;

// Example output:
// 32\t2\t/coreclr/bin/asm/asm/{diff => base}/101301.dasm
//
// This should be split into:
//
// 32\t2\t/coreclr/bin/asm/asm/base/101301.dasm\t/coreclr/bin/asm/asm/diff/101301.dasm
Regex gitMergedOutputRegex = new Regex(@"\{(\w+)\s=>\s(\w+)\}");
Regex whitespaceRegex = new Regex(@"(\w+)\s+(\w+)\s+(.*)");
if (gitMergedOutputRegex.Matches(line).Count > 0)
Console.WriteLine($"Error parsing output: {result.StdOut}");
return fileToTextDiffCount;
}

string fullBasePath = Path.GetFullPath(basePath).TrimEnd(Path.DirectorySeparatorChar);
string fullDiffPath = Path.GetFullPath(diffPath).TrimEnd(Path.DirectorySeparatorChar);

for (int i = 0; i < rawLines.Length; i += 3)
{
string rawStats = rawLines[i];
string rawBasePath = rawLines[i + 1];
string rawDiffPath = rawLines[i + 2];

string[] fields = rawStats.Split(new char[] { ' ', '\t', '"' }, StringSplitOptions.RemoveEmptyEntries);

string parsedFullDiffFilePath = Path.GetFullPath(rawDiffPath);
string parsedFullBaseFilePath = Path.GetFullPath(rawBasePath);

string parsedDiffDir = Path.GetDirectoryName(parsedFullDiffFilePath);
string parsedBaseDir = Path.GetDirectoryName(parsedFullBaseFilePath);


if (!File.Exists(parsedFullBaseFilePath))
{
// Do the first split to remove the integers from the file path.
// Then reconstruct both the diff and the base paths.
var groups = whitespaceRegex.Match(line).Groups;
string[] modifiedLine = new string[] {
groups[whitespaceRegex.GroupNameFromNumber(1)].ToString(),
groups[whitespaceRegex.GroupNameFromNumber(2)].ToString(),
groups[whitespaceRegex.GroupNameFromNumber(3)].ToString()
};

string[] splitLine = gitMergedOutputRegex.Split(modifiedLine[2]);

// Split will output:
//
// 32\t2\t/coreclr/bin/asm/asm/
// {diffFolder}
// {baseFolder}
// 101301.dasm

// Create the base path from the second group
// {diff => base}
string manipulatedBasePath = String.Join(splitLine[2], new string[] {
splitLine[0],
splitLine[3]
});

// Create the diff path from the first
// {diff => base}
string manipulatedDiffPath = String.Join(splitLine[1], new string[] {
splitLine[0],
splitLine[3]
});

fields = new string[4] {
modifiedLine[0],
modifiedLine[1],
manipulatedBasePath,
manipulatedDiffPath
};

numFields = 4;
Console.WriteLine($"Error parsing path '{rawBasePath}'. `{parsedFullBaseFilePath}` doesn't exist.");
continue;
}
else

if (parsedBaseDir != fullBasePath)
{
fields = line.Split(new char[] { ' ', '\t', '"' }, StringSplitOptions.RemoveEmptyEntries);
Console.WriteLine($"Error parsing path '{rawBasePath}'. `{parsedBaseDir}` and `{fullBasePath}` don't match.");
}

if (fields.Length != numFields)
if (!File.Exists(parsedFullDiffFilePath))
{
Console.WriteLine($"Couldn't parse output '{line}`.");
Console.WriteLine($"Error parsing path '{rawDiffPath}'. `{parsedFullDiffFilePath}` doesn't exist.");
continue;
}

// store diff-relative path
string fullBaseFilePath = Path.GetFullPath(fields[2]);
if (!File.Exists(fullBaseFilePath))
if (parsedDiffDir != fullDiffPath)
{
Console.WriteLine($"Couldn't parse output '{line}'.");
continue;
Console.WriteLine($"Error parsing path '{rawDiffPath}'. `{parsedDiffDir}` and `{fullDiffPath}` don't match.");
}

string baseFilePath = fullBaseFilePath.Substring(fullDiffPath.Length + 1);

// Sometimes .dasm is parsed as binary and we don't get numbers, just dashes
int addCount = 0;
int delCount = 0;
Int32.TryParse(fields[0], out addCount);
Int32.TryParse(fields[1], out delCount);
fileToTextDiffCount[baseFilePath] = addCount + delCount;
fileToTextDiffCount[parsedFullBaseFilePath] = addCount + delCount;
}

Console.WriteLine($"Found {fileToTextDiffCount.Count()} files with textual diffs.");
Expand Down

0 comments on commit a9a332b

Please sign in to comment.