Skip to content

Commit

Permalink
feat(arrow): Experimental module for Apache Arrow attribute data extr…
Browse files Browse the repository at this point in the history
…action (#2278)
  • Loading branch information
ibgreen authored Oct 15, 2024
1 parent bd27f0b commit 87449fc
Show file tree
Hide file tree
Showing 22 changed files with 827 additions and 51 deletions.
5 changes: 5 additions & 0 deletions modules/arrow/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# @luma.gl/arrow

This module provides Apache Arrow utilities for luma.gl.

See [luma.gl](http://luma.gl) for documentation.
49 changes: 49 additions & 0 deletions modules/arrow/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"private": true,
"name": "@luma.gl/arrow",
"description": "luma.gl Apache Arrow bindings",
"version": "9.2.0-alpha.0",
"license": "MIT",
"type": "module",
"publishConfig": {
"access": "public"
},
"repository": {
"type": "git",
"url": "https://github.com/visgl/luma.gl"
},
"keywords": [
"webgl",
"visualization",
"animation",
"3d"
],
"types": "dist/index.d.ts",
"main": "dist/index.cjs",
"module": "dist/index.js",
"exports": {
".": {
"import": "./dist/index.js",
"require": "./dist/index.cjs",
"types": "./dist/index.d.ts"
}
},
"files": [
"src",
"dist",
"dist.min.js",
"README.md"
],
"sideEffects": false,
"scripts": {
"build-minified-bundle": "ocular-bundle ./bundle.ts --output=dist/dist.min.js",
"build-dev-bundle": "ocular-bundle ./bundle.ts --output=dist/dist.dev.js --env=dev",
"prepublishOnly": "npm run build-minified-bundle && npm run build-dev-bundle"
},
"dependencies": {
"@luma.gl/core": "9.2.0-alpha.0",
"@math.gl/polygon": "^4.1.0",
"apache-arrow": "^17.0.0"
},
"gitHead": "c636c34b8f1581eed163e94543a8eb1f4382ba8e"
}
22 changes: 22 additions & 0 deletions modules/arrow/src/arrow/analyze-arrow-table.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// luma.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import * as arrow from 'apache-arrow';
import {getArrowPaths} from './arrow-paths';
import {ArrowColumnInfo} from './arrow-types';
import {getArrowColumnInfo} from './arrow-column-info';

/**
 * Builds a map from column path to extracted GPU attribute info for an Arrow table.
 * Paths for which `getArrowColumnInfo` returns no info are left out of the result.
 * @param arrowTable The table whose columns are inspected
 * @returns Column infos keyed by dot-separated column path
 */
export function analyzeArrowTable(arrowTable: arrow.Table): Record<string, ArrowColumnInfo> {
  const infoByPath: Record<string, ArrowColumnInfo> = {};

  getArrowPaths(arrowTable).forEach(columnPath => {
    const info = getArrowColumnInfo(arrowTable, columnPath);
    if (!info) {
      return;
    }
    infoByPath[columnPath] = info;
  });

  return infoByPath;
}
88 changes: 88 additions & 0 deletions modules/arrow/src/arrow/arrow-column-info.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// luma.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors
import * as arrow from 'apache-arrow';
import {
AttributeArrowType,
NumericArrowType,
ArrowColumnInfo,
isNumericArrowType,
isInstanceArrowType,
// isVertexArrowType,
getSignedShaderType
} from './arrow-types';
import {getArrowVectorByPath} from './arrow-paths';

/**
 * Looks up the column at `path` and extracts the info needed to use it as a GPU data source.
 * @param arrowTable Table containing the column
 * @param path Dot-separated path of the column
 * @returns The column info when the column type is instance-attribute compatible, otherwise null
 */
export function getArrowColumnInfo(arrowTable: arrow.Table, path: string): ArrowColumnInfo | null {
  const column = getArrowVectorByPath(arrowTable, path);

  // Vertex (list) columns are not handled yet:
  // if (isVertexArrowType(column.type)) {
  //   return getVertexColumnInfo(column);
  // }
  return isInstanceArrowType(column.type) ? getInstanceColumnInfo(column) : null;
}

/**
 * Extracts info from columns that can be used with GPU instanced attributes.
 *
 * A plain numeric column yields one component per row; a fixed-size list of
 * numeric values yields `listSize` components per row.
 *
 * @param vector A numeric column, or a fixed-size list column with numeric children
 * @returns Column info with `stepMode: 'instance'` and one `values` array per data chunk
 * @throws Error if the fixed list size is outside 1-4, if the list has no child
 *   values, or if the underlying values are not numeric
 */
export function getInstanceColumnInfo(vector: arrow.Vector<AttributeArrowType>): ArrowColumnInfo {
  let components: 1 | 2 | 3 | 4 = 1;
  let dataVector = vector as arrow.Vector<NumericArrowType>;

  if (arrow.DataType.isFixedSizeList(vector.type)) {
    const {listSize} = vector.type;
    if (listSize < 1 || listSize > 4) {
      throw new Error('Attribute column fixed list size must be between 1 and 4');
    }
    components = listSize as 1 | 2 | 3 | 4;

    // `getChild()` resolves children by field *name*, so `getChild(0)` finds nothing
    // for a list whose child field is named e.g. 'item'. Resolve the child by index.
    const childVector = vector.getChildAt(0);
    if (!childVector) {
      throw new Error('Attribute column fixed list has no child values');
    }
    dataVector = childVector;
  }

  if (!isNumericArrowType(dataVector.type)) {
    throw new Error('Attribute column must be numeric or fixed list of numeric');
  }

  const signedDataType = getSignedShaderType(dataVector.type, components);

  return {
    signedDataType,
    components,
    stepMode: 'instance',
    // One typed array per chunk of the column
    values: dataVector.data.map(data => data.values),
    offsets: []
  };
}

/** Extracts info from columns that can be used with GPU vertex attributes *
export function getVertexColumnInfo(vector: arrow.Vector<MeshArrowType>): MeshData[] {
if (!arrow.DataType.isList(vector.type)) {
throw new Error('mesh data must be an Arrow list');
}
for (const data of vector.data) {
const offsets = data.valueOffsets;
if (arrow.DataType.isFixedSizeList(vector.type)) {
const dataVector = vector.getChild(0)!;
const getArrowColumnInfo
const dataVectorType = dataVector.type;
if (isNumericArrowType(dataVectorType)) {
return {
data: dataVector.data,
values: dataVector.data.values,
size: vector.type.listSize,
type: getAttributeShaderType(dataVectorType)
};
}
const size = dataVector;
return vector.getChild(0)!.data;
}
return vector.data;
}
*/
115 changes: 115 additions & 0 deletions modules/arrow/src/arrow/arrow-paths.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// luma.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import * as arrow from 'apache-arrow';

/**
 * Lists the dot-separated paths of all leaf (non-struct) columns in an Arrow object.
 * Only the first data chunk of the object is inspected.
 * @param arrowObject Arrow data, table, record batch or vector to inspect
 * @returns The column paths, or an empty array when the object holds no data chunks
 */
export function getArrowPaths(
  arrowObject: arrow.Data | arrow.Table | arrow.RecordBatch | arrow.Vector
): string[] {
  const [firstChunk] = getArrowDataArray(arrowObject);
  // An object without any data chunk has no columns to report
  // (previously this crashed while reading `type` of undefined).
  if (!firstChunk) {
    return [];
  }
  return getArrowPathsRecursive(firstChunk, []);
}

/**
 * Recursively collects the paths of all non-struct columns below `arrowData`.
 * @param arrowData Data node to descend into
 * @param currentPath Field names leading from the root to `arrowData`
 * @returns `currentPath` joined with '.' when `arrowData` is not a struct, otherwise
 *   the concatenated paths of all its children, in field order
 */
export function getArrowPathsRecursive(arrowData: arrow.Data, currentPath: string[]): string[] {
  if (!arrow.DataType.isStruct(arrowData.type)) {
    return [currentPath.join('.')];
  }

  // Struct fields and child data are parallel arrays: field i describes child i
  const fields = arrowData.type.children;
  return fields.flatMap((field, fieldIndex): string[] =>
    getArrowPathsRecursive(arrowData.children[fieldIndex], [...currentPath, field.name])
  );
}

/**
 * Resolves a dot-separated column path to the `arrow.Data` of that nested column.
 * Only the first data chunk of `arrowObject` is traversed.
 * @param arrowObject Arrow data, table, record batch or vector to search
 * @param columnPath Struct field names separated by '.'
 * @returns The data of the non-struct column at the path
 * @throws Error if the object holds no data, if a path segment is missing or is
 *   looked up in a non-struct, or if the path ends on a struct
 */
export function getArrowDataByPath(
  arrowObject: arrow.Data | arrow.Table | arrow.RecordBatch | arrow.Vector,
  columnPath: string
): arrow.Data {
  const data = getArrowDataArray(arrowObject)[0];
  if (!data) {
    throw new Error(`Arrow object contains no data, cannot resolve column '${columnPath}'`);
  }

  const path = decomposePath(columnPath);
  let nestedData = data;
  for (const key of path) {
    if (!arrow.DataType.isStruct(nestedData.type)) {
      throw new Error(
        `Arrow table nested column is not a struct: '${key}' in '${path.join('.')}'`
      );
    }
    const fields = nestedData.type.children;
    const indexByField = fields.findIndex(field => field.name === key);
    if (indexByField === -1) {
      throw new Error(
        `Arrow table schema does not contain nested column '${key}' in '${path.join('.')}'`
      );
    }

    nestedData = nestedData.children[indexByField];
  }

  // Check that we resolved all the intermediate structs
  if (arrow.DataType.isStruct(nestedData.type)) {
    throw new Error(`Arrow table nested column '${path.join('.')}' is a struct`);
  }

  return nestedData;
}

/**
 * Resolves a dot-separated column path to the `arrow.Vector` of that nested column.
 * @param arrowTable Table to search
 * @param columnPath Struct field names separated by '.'
 * @returns The vector of the non-struct column at the path
 * @throws Error if a path segment is missing or is looked up in a non-struct,
 *   or if the path ends on a struct
 */
export function getArrowVectorByPath(arrowTable: arrow.Table, columnPath: string): arrow.Vector {
  // Make a temporary vector from the top level struct data.
  const vector = arrow.makeVector(arrowTable.data);

  const path = decomposePath(columnPath);
  let nestedVector = vector;
  for (const key of path) {
    if (!arrow.DataType.isStruct(nestedVector.type)) {
      throw new Error(
        `Arrow table nested column is not a struct: '${key}' in '${path.join('.')}'`
      );
    }
    const fields = nestedVector.type.children;
    const indexByField = fields.findIndex(field => field.name === key);
    if (indexByField === -1) {
      throw new Error(
        `Arrow table schema does not contain nested column '${key}' in '${path.join('.')}'`
      );
    }

    // The index was just found among the struct's fields, so the child exists
    nestedVector = nestedVector.getChildAt(indexByField)!;
  }

  // Check that we resolved all the intermediate structs
  if (arrow.DataType.isStruct(nestedVector.type)) {
    throw new Error(`Arrow table nested column '${path.join('.')}' is a struct`);
  }

  return nestedVector;
}

/**
 * Get the array of data chunks backing an arrow object.
 * @param arrowObject Arrow data, table, record batch or vector
 * @returns The table's or vector's data chunks, or a one-element array wrapping
 *   the record batch's data or the given data object
 */
export function getArrowDataArray(
  arrowObject: arrow.Data | arrow.Table | arrow.RecordBatch | arrow.Vector
): arrow.Data[] {
  if (arrowObject instanceof arrow.Table) {
    return arrowObject.data;
  }
  if (arrowObject instanceof arrow.RecordBatch) {
    return [arrowObject.data];
  }
  if (arrowObject instanceof arrow.Vector) {
    // The vector exposes its chunks as a read-only array; hand out a shallow copy
    // instead of suppressing the type error and leaking the array as mutable.
    return [...arrowObject.data];
  }
  return [arrowObject];
}

// HELPER FUNCTIONS

/** Splits a dot-separated column path into its individual field names */
function decomposePath(path: string): string[] {
  const separator = '.';
  const segments = path.split(separator);
  return segments;
}
73 changes: 73 additions & 0 deletions modules/arrow/src/arrow/arrow-types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// luma.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import type {SignedDataType, BigTypedArray} from '@luma.gl/core';
import * as arrow from 'apache-arrow';

/** An Arrow integer or floating point type */
export type NumericArrowType = arrow.Int | arrow.Float;

/** An instance attribute-compatible column - has 1-4 (fixed) numeric values per row */
export type AttributeArrowType = NumericArrowType | arrow.FixedSizeList<NumericArrowType>;

/** A non-instance attribute compatible column - has a list of 1-4 (fixed) numeric values per row */
export type MeshArrowType = arrow.List<NumericArrowType | arrow.FixedSizeList<NumericArrowType>>;

/** Extracted information required to populate a mesh */
export type ArrowColumnInfo = {
/** Step mode of the attribute; `getInstanceColumnInfo` always sets 'instance' */
stepMode: 'instance' | 'vertex';
/** Data type derived from the column's Arrow numeric type (see `getSignedShaderType`) */
signedDataType: SignedDataType;
/** Number of numeric values per row: 1 for a plain numeric column, the list size for a fixed-size list */
components: 1 | 2 | 3 | 4;
/** One typed array per Arrow data chunk of the column */
values: BigTypedArray[];
/** Left empty for instance columns; presumably list offsets for vertex columns - TODO confirm */
offsets: Uint32Array[][];
};

/** Type guard: true when the Arrow type is a float or an integer type */
export function isNumericArrowType(type: arrow.DataType): type is arrow.Int | arrow.Float {
  if (arrow.DataType.isFloat(type)) {
    return true;
  }
  return arrow.DataType.isInt(type);
}

/**
 * Instance = One "vec1-vec4 value" per step.
 * Accepts a numeric type, or a fixed-size list holding 1 to 4 numeric values.
 * @param type Arrow type of the column
 * @returns true when the column can back an instanced attribute
 */
export function isInstanceArrowType(type: arrow.DataType): type is AttributeArrowType {
  if (isNumericArrowType(type)) {
    return true;
  }
  if (!arrow.DataType.isFixedSizeList(type)) {
    return false;
  }
  // An attribute holds at most a vec4, so larger (or empty) lists do not qualify
  const {listSize} = type;
  return listSize >= 1 && listSize <= 4 && isNumericArrowType(type.children[0].type);
}

/**
 * Vertex = Multiple "vec1-vec4 values" per step.
 * True for a list whose element type is itself instance-attribute compatible.
 */
export function isVertexArrowType(type: arrow.DataType): type is MeshArrowType {
  if (!arrow.DataType.isList(type)) {
    return false;
  }
  const [elementField] = type.children;
  return isInstanceArrowType(elementField.type);
}

/**
 * Get the luma.gl signed shader type corresponding to an Apache Arrow type.
 * @param arrowType Arrow integer or float type
 * @param size Number of components (currently not used by the mapping)
 * @returns The matching signed data type name
 * @throws Error for 64-bit integers, double precision floats and any other unsupported type
 */
export function getSignedShaderType(
  arrowType: NumericArrowType,
  size: 1 | 2 | 3 | 4
): SignedDataType {
  if (arrow.DataType.isInt(arrowType)) {
    const {bitWidth} = arrowType;
    if (bitWidth === 8) {
      return arrowType.isSigned ? 'sint8' : 'uint8';
    }
    if (bitWidth === 16) {
      return arrowType.isSigned ? 'sint16' : 'uint16';
    }
    if (bitWidth === 32) {
      return arrowType.isSigned ? 'sint32' : 'uint32';
    }
    if (bitWidth === 64) {
      throw new Error('64-bit integers are not supported in shaders');
    }
  }

  if (arrow.DataType.isFloat(arrowType)) {
    const {precision} = arrowType;
    if (precision === arrow.Precision.HALF) {
      return 'float16';
    }
    if (precision === arrow.Precision.SINGLE) {
      return 'float32';
    }
    if (precision === arrow.Precision.DOUBLE) {
      throw new Error('Double precision floats are not supported in shaders');
    }
  }

  // Reached for any integer width or float precision not matched above
  throw new Error(`Unsupported arrow type ${arrowType}`);
}
16 changes: 16 additions & 0 deletions modules/arrow/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// luma.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

export type {NumericArrowType, ArrowColumnInfo} from './arrow/arrow-types';
export {
isNumericArrowType
// isInstanceArrowType,
// isVertexArrowType,
} from './arrow/arrow-types';

export {getArrowPaths, getArrowDataByPath, getArrowVectorByPath} from './arrow/arrow-paths';

export {getArrowColumnInfo} from './arrow/arrow-column-info';

export {analyzeArrowTable} from './arrow/analyze-arrow-table';
21 changes: 21 additions & 0 deletions modules/arrow/test/arrow/analyze-arrow-table.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// luma.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import test from 'tape-promise/tape';
import {ARROW_TABLES} from '@luma.gl/arrow/test/data/arrow/make-arrow-tables';
import {analyzeArrowTable} from '@luma.gl/arrow';

// Smoke test: analyzeArrowTable should return column infos for both a flat and a nested table.
// (The test was previously mislabeled 'getArrowDataByPath'.)
test('analyzeArrowTable', async t => {
  const {simpleTable} = ARROW_TABLES;
  let tableColumns = analyzeArrowTable(simpleTable);
  t.ok(tableColumns, 'extracted info from simple table');
  t.comment(JSON.stringify(tableColumns));

  const {nestedTable} = ARROW_TABLES;
  tableColumns = analyzeArrowTable(nestedTable);
  t.ok(tableColumns, 'extracted info from nested table');
  t.comment(JSON.stringify(tableColumns));

  t.end();
});
Loading

0 comments on commit 87449fc

Please sign in to comment.