Skip to content

Commit

Permalink
Rewrite codebase to be able to un-remove vendor matches
Browse files Browse the repository at this point in the history
Fixes files that are by default marked vendored being removed even when explicitly marked as not vendored in gitattributes.

See #26.
  • Loading branch information
Nixinova committed Mar 1, 2024
1 parent df3f475 commit 52e2f29
Show file tree
Hide file tree
Showing 4 changed files with 151 additions and 91 deletions.
3 changes: 3 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## Next
- Added CLI option `--listFiles` to list each matching file under each language result.
- TODO
- Fixed files that are by default marked vendored being removed even when marked as not vendored in gitattributes.
- Fixed gitattributes rules from parent folders being erroneously applied.

## 2.6.1
*2023-07-24*
Expand Down
50 changes: 50 additions & 0 deletions src/helpers/parse-gitattributes.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import path from 'path';

import * as T from '../types';

/**
 * Attribute flags collected for one gitattributes rule.
 *
 * Each boolean flag is `true` when the rule sets it, `false` when the rule
 * unsets it, and `null` when the rule does not mention it.
 */
export type FlagAttributes = {
	/** State of the `linguist-vendored` attribute. */
	vendored: boolean | null;
	/** State of the `linguist-generated` attribute. */
	generated: boolean | null;
	/** State of the `linguist-documentation` attribute. */
	documentation: boolean | null;
	/** Binary state derived from the `binary` and `text` attributes. */
	binary: boolean | null;
	/** Language taken from the `linguist-language` attribute, if any. */
	language: T.LanguageResult;
};

/**
 * Parsed form of a gitattributes file: one entry per rule, pairing the rule's
 * file glob with the attribute flags found on that rule.
 */
export type ParsedGitattributes = {
	/** File glob the rule applies to. */
	glob: string;
	/** Attribute flags declared for the glob. */
	attrs: FlagAttributes;
}[];

/**
 * Parses the content of a gitattributes file into a list of glob rules.
 *
 * Every non-blank, non-comment line is split on whitespace: the first token is
 * the file glob and the remaining tokens are attributes. Attribute tokens are
 * matched by their exact name (`name`, `-name`, `!name` or `name=value`) rather
 * than by substring, so unrelated tokens cannot toggle a flag.
 *
 * @param content - Raw text of the gitattributes file. Both LF and CRLF line endings are accepted.
 * @param folderRoot - Folder the file lives in; each glob is joined onto it. Defaults to `'.'`.
 * @returns One entry per rule line, in file order, with forward-slash globs. Each boolean flag is
 * `true` when set, `false` when unset and `null` when the line does not specify it.
 */
export default function parseAttributes(content: string, folderRoot: string = '.'): ParsedGitattributes {
	const output: ParsedGitattributes = [];

	for (const rawLine of content.split(/\r?\n/)) {
		// Blank lines and `#` comment lines carry no rule
		const line = rawLine.trim();
		if (!line || line.startsWith('#')) continue;

		const [fileGlob = '', ...attrParts] = line.split(/\s+/);
		const relFileGlob = path.join(folderRoot, fileGlob).replace(/\\/g, '/');

		// `text=auto` leaves the text/binary decision to content detection, so it is not a "text" mark
		const parsedAttrs = attrParts
			.map(parseAttrToken)
			.filter(attr => !(attr.name === 'text' && attr.value === 'auto'));

		const has = (name: string, state: boolean): boolean =>
			parsedAttrs.some(attr => attr.name === name && attr.state === state);
		// A set mark takes precedence over an unset mark for the same attribute on one line
		const flag = (name: string): boolean | null =>
			has(name, true) ? true : has(name, false) ? false : null;

		// The last `linguist-language=<name>` on the line wins
		let language: string | null = null;
		for (const attr of parsedAttrs) {
			if (attr.name === 'linguist-language' && attr.state === true && attr.value) {
				language = attr.value;
			}
		}

		const attrs = {
			'generated': flag('linguist-generated'),
			'vendored': flag('linguist-vendored'),
			'documentation': flag('linguist-documentation'),
			// `binary` and `-text` both mark a file as binary; `-binary` and `text` mark it as text
			'binary': has('binary', true) || has('text', false) ? true : has('binary', false) || has('text', true) ? false : null,
			'language': language,
		};

		output.push({ glob: relFileGlob, attrs });
	}

	return output;
}

/**
 * Splits a single attribute token into its name, state and raw value.
 *
 * State is `false` for `-name` and `name=false`, `null` (unspecified) for
 * `!name`, and `true` for a bare `name` or any other `name=value`.
 */
function parseAttrToken(token: string): { name: string, state: boolean | null, value: string | null } {
	const prefix = token.charAt(0);
	const hasPrefix = prefix === '-' || prefix === '!';
	const body = hasPrefix ? token.slice(1) : token;

	const eqIndex = body.indexOf('=');
	const name = eqIndex < 0 ? body : body.slice(0, eqIndex);
	const value = eqIndex < 0 ? null : body.slice(eqIndex + 1);

	if (prefix === '!') return { name, state: null, value };
	if (prefix === '-' || value === 'false') return { name, state: false, value };
	return { name, state: true, value };
}
24 changes: 14 additions & 10 deletions src/helpers/walk-tree.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import fs from 'fs';
import paths from 'path';
import { Ignore } from 'ignore';
import ignore, { Ignore } from 'ignore';

let allFiles: Set<string>;
let allFolders: Set<string>;
Expand All @@ -14,17 +14,18 @@ interface WalkInput {
folderRoots: string[],
/** The absolute path of folders being checked */
folders: string[],
gitignores: Ignore,
regexIgnores: RegExp[],
/** An instantiated Ignore object listing ignored files */
ignored: Ignore,
};

/** Result of a directory walk: the absolute file and folder paths that were collected. */
interface WalkOutput {
	/** Files found by the walk. */
	files: string[];
	/** Folders found by the walk. */
	folders: string[];
}

/** Generate list of files in a directory. */
export default function walk(data: WalkInput): WalkOutput {
const { init, commonRoot, folderRoots, folders, gitignores, regexIgnores } = data;
const { init, commonRoot, folderRoots, folders, ignored } = data;

// Initialise files and folders lists
if (init) {
Expand All @@ -49,18 +50,21 @@ export default function walk(data: WalkInput): WalkOutput {
// Create absolute path for disc operations
const path = paths.resolve(commonRoot, file).replace(/\\/g, '/');
const localPath = localRoot ? file.replace(`./${localRoot}/`, '') : file.replace('./', '');
// Skip if nonexistant or ignored

// Skip if nonexistent
const nonExistant = !fs.existsSync(path);
const isGitIgnored = gitignores.test(localPath).ignored;
const isRegexIgnored = regexIgnores.find(match => localPath.match(match));
if (nonExistant || isGitIgnored || isRegexIgnored) continue;
if (nonExistant) continue;
// Skip if marked as ignored
const isIgnored = ignored.test(localPath).ignored;
if (isIgnored) continue;

// Add absolute folder path to list
allFolders.add(paths.resolve(folder).replace(/\\/g, '/'));
// Check if this is a folder or file
if (file.endsWith('/')) {
// Recurse into subfolders
allFolders.add(path);
walk({ init: false, commonRoot: commonRoot, folderRoots, folders: [path], gitignores, regexIgnores });
walk({ init: false, commonRoot, folderRoots, folders: [path], ignored });
}
else {
// Add file path to list
Expand All @@ -71,7 +75,7 @@ export default function walk(data: WalkInput): WalkOutput {
// Recurse into all folders
else {
for (const i in folders) {
walk({ init: false, commonRoot: commonRoot, folderRoots: [folderRoots[i]], folders: [folders[i]], gitignores, regexIgnores });
walk({ init: false, commonRoot, folderRoots: [folderRoots[i]], folders: [folders[i]], ignored });
}
}
// Return absolute files and folders lists
Expand Down
165 changes: 84 additions & 81 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import { isBinaryFile } from 'isbinaryfile';
import walk from './helpers/walk-tree';
import loadFile, { parseGeneratedDataFile } from './helpers/load-data';
import readFile from './helpers/read-file';
import parseAttributes, { FlagAttributes } from './helpers/parse-gitattributes';
import pcre from './helpers/convert-pcre';
import * as T from './types';
import * as S from './schema';
Expand Down Expand Up @@ -38,12 +39,7 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
unknown: { count: 0, bytes: 0, extensions: {}, filenames: {} },
};

// Prepare list of ignored files
const gitignores = ignore();
const regexIgnores: RegExp[] = [];
gitignores.add('.git');
if (!opts.keepVendored) regexIgnores.push(...vendorPaths.map(path => RegExp(path, 'i')));
if (opts.ignoredFiles) gitignores.add(opts.ignoredFiles);
//*PREPARE FILES AND DATA*//

// Set a common root path so that vendor paths do not incorrectly match parent folders
const normPath = (file: string) => file.replace(/\\/g, '/');
Expand All @@ -54,28 +50,93 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
const unRelPath = (file: string) => normPath(paths.resolve(commonRoot, file));
const localPath = (file: string) => localRoot(unRelPath(file));

// Prepare list of ignored files
const ignored = ignore();
ignored.add('.git/');
ignored.add(opts.ignoredFiles ?? []);
const regexIgnores: RegExp[] = [];
if (!opts.keepVendored) regexIgnores.push(...vendorPaths.map(path => RegExp(path, 'i')));

// Load file paths and folders
let files, folders;
let files: string[];
let folders: string[];
if (useRawContent) {
// Uses raw file content
files = input;
folders = [''];
}
else {
// Uses directory on disc
const data = walk({ init: true, commonRoot, folderRoots: resolvedInput, folders: resolvedInput, gitignores, regexIgnores });
const data = walk({ init: true, commonRoot, folderRoots: resolvedInput, folders: resolvedInput, ignored });
files = data.files;
folders = data.folders;
}

// Load gitignore data and apply ignore rules
if (!useRawContent) {
for (const folder of folders) {
// Parse gitignores
const ignoresFile = paths.join(folder, '.gitignore');
if (opts.checkIgnored && fs.existsSync(ignoresFile)) {
const ignoresData = await readFile(ignoresFile);
const localIgnoresData = ignoresData.replace(/^[\/\\]/g, localRoot(folder) + '/');
ignored.add(localIgnoresData);
files = ignored.filter(files.map(relPath)).map(unRelPath);
}
}
}

// Fetch and normalise gitattributes data of all subfolders and save to metadata
const manualAttributes: Record<T.FilePath, FlagAttributes> = {}; // Maps file globs to gitattribute boolean flags
const getFlaggedGlobs = (attr: keyof FlagAttributes, val: boolean) => {
return Object.entries(manualAttributes).filter(([, attrs]) => attrs[attr] === val).map(([glob,]) => glob)
};
if (!useRawContent && opts.checkAttributes) {
const nestedAttrFiles = files.filter(file => file.endsWith('.gitattributes'));
for (const attrFile of nestedAttrFiles) {
const relAttrFile = relPath(attrFile);
const relAttrFolder = paths.dirname(relAttrFile);
const contents = await readFile(attrFile);
const parsed = parseAttributes(contents, relAttrFolder);
for (const { glob, attrs } of parsed) {
manualAttributes[glob] = attrs;
}
}
}

// Apply vendor file path matches and filter out vendored files
if (!opts.keepVendored) {
// Get data of files that have been manually marked with metadata
const vendorTrueGlobs = [...getFlaggedGlobs('vendored', true), ...getFlaggedGlobs('generated', true), ...getFlaggedGlobs('documentation', true)];
const vendorFalseGlobs = [...getFlaggedGlobs('vendored', false), ...getFlaggedGlobs('generated', false), ...getFlaggedGlobs('documentation', false)];
// Set up glob ignore object to use for expanding globs to match files
const vendorOverrides = ignore();
vendorOverrides.add(vendorFalseGlobs);
// Remove all files marked as vendored by default
const excludedFiles = files.filter(file => vendorPaths.some(vPath => RegExp(vPath).test(relPath(file))));
files = files.filter(file => !excludedFiles.includes(file));
// Re-add removed files that are overridden manually in gitattributes
const overriddenExcludedFiles = excludedFiles.filter(file => vendorOverrides.ignores(relPath(file)));
files.push(...overriddenExcludedFiles);
// Remove files explicitly marked as vendored in gitattributes
files = files.filter(file => !vendorTrueGlobs.includes(relPath(file)));
}

// Filter out binary files
if (!opts.keepBinary) {
const binaryIgnored = ignore();
binaryIgnored.add(getFlaggedGlobs('binary', true));
files = binaryIgnored.filter(files.map(relPath)).map(unRelPath);
}

// Apply aliases
opts = {
checkIgnored: !opts.quick,
checkAttributes: !opts.quick,
checkHeuristics: !opts.quick,
checkShebang: !opts.quick,
checkModeline: !opts.quick,
...opts
...opts,
};

// Ignore specific languages
Expand All @@ -88,75 +149,26 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
}
}

// Load gitignores and gitattributes
const customBinary = ignore();
const customText = ignore();
if (!useRawContent && opts.checkAttributes) {
for (const folder of folders) {
// TODO FIX: this is absolute when only 1 path given
const localFilePath = (path: string) => localRoot(folder) ? localRoot(folder) + '/' + localPath(path) : path;
// TODO: FIX linguist-language=

// Skip if folder is marked in gitattributes
if (relPath(folder) && gitignores.ignores(relPath(folder))) {
continue;
}
// Establish language overrides taken from gitattributes
const forcedLangs = Object.entries(manualAttributes).filter(([, attrs]) => attrs.language);
for (const [path, attrs] of forcedLangs) {
let forcedLang = attrs.language;
if (!forcedLang) continue;

// Parse gitignores
const ignoresFile = paths.join(folder, '.gitignore');
if (opts.checkIgnored && fs.existsSync(ignoresFile)) {
const ignoresData = await readFile(ignoresFile);
const localIgnoresData = ignoresData.replace(/^[\/\\]/g, localRoot(folder) + '/');
gitignores.add(localIgnoresData);
// If specified language is an alias, associate it with its full name
if (!langData[forcedLang]) {
const overrideLang = Object.entries(langData).find(entry => entry[1].aliases?.includes(forcedLang!.toLowerCase()));
if (overrideLang) {
forcedLang = overrideLang[0];
}

// Parse gitattributes
const attributesFile = paths.join(folder, '.gitattributes');
if (opts.checkAttributes && fs.existsSync(attributesFile)) {
const attributesData = await readFile(attributesFile);
// Explicit text/binary associations
const contentTypeMatches = attributesData.matchAll(/^(\S+).*?(-?binary|-?text)(?!=auto)/gm);
for (const [_line, path, type] of contentTypeMatches) {
if (['text', '-binary'].includes(type)) {
customText.add(localFilePath(path));
}
if (['-text', 'binary'].includes(type)) {
customBinary.add(localFilePath(path));
}
}
// Custom vendor options
const vendorMatches = attributesData.matchAll(/^(\S+).*[^-]linguist-(vendored|generated|documentation)(?!=false)/gm);
for (const [_line, path] of vendorMatches) {
gitignores.add(localFilePath(path));
}
// Custom file associations
const customLangMatches = attributesData.matchAll(/^(\S+).*[^-]linguist-language=(\S+)/gm);
for (let [_line, path, forcedLang] of customLangMatches) {
// If specified language is an alias, associate it with its full name
if (!langData[forcedLang]) {
const overrideLang = Object.entries(langData).find(entry => entry[1].aliases?.includes(forcedLang.toLowerCase()));
if (overrideLang) {
forcedLang = overrideLang[0];
}
}
const fullPath = paths.join(relPath(folder), path);
overrides[fullPath] = forcedLang;
}
}

}
}
// Check vendored files
if (!opts.keepVendored) {
// Filter out any files that match a vendor file path
if (useRawContent) {
files = gitignores.filter(files);
files = files.filter(file => !regexIgnores.find(match => match.test(file)));
}
else {
files = gitignores.filter(files.map(localPath)).map(unRelPath);
}
overrides[unRelPath(path)] = forcedLang;
}

//*PARSE LANGUAGES*//

// Load all files and parse languages
const addResult = (file: string, result: T.LanguageResult) => {
if (!fileAssociations[file]) {
Expand Down Expand Up @@ -267,15 +279,6 @@ async function analyse(input?: string | string[], opts: T.Options = {}): Promise
results.files.results[file] = fileAssociations[file][0];
continue;
}
// Skip binary files
if (!useRawContent && !opts.keepBinary) {
const isCustomText = customText.ignores(relPath(file));
const isCustomBinary = customBinary.ignores(relPath(file));
const isBinaryExt = binaryData.some(ext => file.endsWith('.' + ext));
if (!isCustomText && (isCustomBinary || isBinaryExt || await isBinaryFile(file))) {
continue;
}
}

// Parse heuristics if applicable
if (opts.checkHeuristics) for (const heuristics of heuristicsData.disambiguations) {
Expand Down

0 comments on commit 52e2f29

Please sign in to comment.