Merge pull request apache#12 from pskevin/row-coder

Implementing RowCoder
robertwb · Jan 9, 2022 · 3e0f9ab · 3e0f9ab
2 parents 1a7ee9d + ba549bd
commit 3e0f9ab
Show file tree

Hide file tree

Showing 2 changed files with 522 additions and 0 deletions.
diff --git a/sdks/node-ts/src/apache_beam/coders/row_coder.ts b/sdks/node-ts/src/apache_beam/coders/row_coder.ts
@@ -0,0 +1,260 @@
+import * as runnerApi from '../proto/beam_runner_api';
+import * as translations from '../internal/translations'
+
+import { Writer, Reader } from 'protobufjs';
+import { Coder, Context, CODER_REGISTRY } from "./coders";
+
+import { Schema, Field, FieldType, AtomicType } from '../proto/schema';
+import { PipelineContext } from '..';
+import { BoolCoder, BytesCoder, IterableCoder, StrUtf8Coder, VarIntCoder } from './standard_coders';
+
+const argsort = x => x.map((v, i) => [v, i]).sort().map(y => y[1]);
+
+export class RowCoder implements Coder<any> {
+    public static URN: string = "beam:coder:row:v1";
+
+    private schema: Schema;
+    private nFields: number;
+    private fieldNames: string[];
+    private fieldNullable: (boolean | undefined)[];
+    private encodingPositionsAreTrivial: boolean = true;
+    private encodingPositions: number[];
+    private encodingPositionsArgsSorted: number[];
+
+    private hasNullableFields: boolean;
+    private components: Coder<any>[];
+
+    addFieldOfType(obj:any, f: Field, value:any): any {
+        if(f.type !== undefined)
+        {
+            let typeInfo = f.type?.typeInfo;
+            switch (typeInfo.oneofKind) {
+                case "atomicType":
+                    obj[f.name] = value;
+                    break;
+                case "arrayType":
+                    obj[f.name] = Array.from(value);
+                    break;
+                // case "iterableType":
+                // case "mapType":
+                case "rowType":
+                    if (typeInfo.rowType.schema !== undefined) {
+                        obj[f.name] = value;
+                    } else {
+                        throw new Error("Schema missing on RowType");
+                    }
+                    break;
+                // case "logicalType":
+                default:
+                    throw new Error(`Encountered a type that is not currently supported by RowCoder: ${f.type}`);
+            }
+            return obj;
+        }
+    }
+
+    getNonNullCoderFromType(t: FieldType): any {
+        let typeInfo = t.typeInfo;
+
+        switch (typeInfo.oneofKind) {
+            case "atomicType":
+                let atomicType: AtomicType = typeInfo.atomicType;
+                switch (atomicType) {
+                    case AtomicType.INT16:
+                    case AtomicType.INT32:
+                    case AtomicType.INT64:
+                        return new VarIntCoder();
+                    // case AtomicType.BYTE:
+                    case AtomicType.BYTES:
+                        return new BytesCoder();
+                    // case AtomicType.FLOAT:
+                    // case AtomicType.DOUBLE:
+                    case AtomicType.STRING:
+                        return new StrUtf8Coder();
+                    case AtomicType.BOOLEAN:
+                        return new BoolCoder();
+                    default:
+                        throw new Error(`Encountered an Atomic type that is not currently supported by RowCoder: ${atomicType}`);
+                }
+                break;
+            case "arrayType":
+                if (typeInfo.arrayType.elementType !== undefined) {
+                    return new IterableCoder(
+                        this.getNonNullCoderFromType(
+                            typeInfo.arrayType.elementType
+                        )
+                    )
+                } else {
+                    throw new Error("ElementType missing on ArrayType");
+                }
+            // case "iterableType":
+            // case "mapType":
+            case "rowType":
+                if (typeInfo.rowType.schema !== undefined) {
+                    return RowCoder.OfSchema(typeInfo.rowType.schema);
+                } else {
+                    throw new Error("Schema missing on RowType");
+                }
+                break;
+            // case "logicalType":
+            default:
+                throw new Error(`Encountered a type that is not currently supported by RowCoder: ${t}`);
+        }
+    }
+
+    static OfSchema(schema: Schema): RowCoder {
+        return new RowCoder(schema);
+    }
+
+    constructor(schema: Schema) {
+        this.schema = schema;
+        this.nFields = this.schema.fields.length;
+        this.fieldNames = this.schema.fields.map((f: Field) => f.name);
+        this.fieldNullable = this.schema.fields.map((f: Field) => f.type?.nullable);
+        // self.constructor = named_tuple_fromschema(schema)
+        this.encodingPositions = this.schema.fields.map((_, i) => i);
+
+        if (this.schema.encodingPositionsSet) {
+            // Should never be duplicate encoding positions.
+            let encPosx = schema.fields.map((f: Field) => f.encodingPosition);
+            if (encPosx.length != this.encodingPositions.length) {
+                throw new Error(`Schema with id ${this.schema.id} has encoding_positions_set=True, but not all fields have encoding_position set`);
+            }
+            // Checking if positions are in {0, ..., length-1}
+            this.encodingPositionsAreTrivial = encPosx === this.encodingPositions;
+            this.encodingPositions = encPosx;
+            this.encodingPositionsArgsSorted = argsort(encPosx);
+        }
+
+        this.hasNullableFields = this.schema.fields.some((f: Field) => f.type?.nullable);
+        this.components = this.encodingPositions
+            .map(i => this.schema.fields[i])
+            .map((f: Field) => {
+                if(f.type !== undefined) {
+                    return this.getNonNullCoderFromType(f.type);
+                }
+            });
+    }
+
+    encode(element: any, writer: Writer, context: Context) {
+        // let bytesIntCoder = new BytesCoder();
+
+        // The number of attributes in the schema, encoded with
+        // beam:coder:varint:v1. This makes it possible to detect certain
+        // allowed schema changes (appending or removing columns) in
+        // long-running streaming pipelines.
+        writer.int32(this.nFields);
+
+
+        // A byte array representing a packed bitset indicating null fields (a
+        //     1 indicating a null) encoded with beam:coder:bytes:v1. The unused
+        //     bits in the last byte must be set to 0. If there are no nulls an
+        //     empty byte array is encoded.
+        //     The two-byte bitset (not including the lenghth-prefix) for the row
+        //     [NULL, 0, 0, 0, NULL, 0, 0, NULL, 0, NULL] would be
+        //     [0b10010001, 0b00000010]
+        let attrs = this.fieldNames.map(name => element[name]);
+
+        let bytesCoder = new BytesCoder();
+
+        let nullFields: number[] = [];
+
+
+        if (this.hasNullableFields) {
+            if (attrs.some(attr => attr == undefined)) {
+                let running = 0;
+                attrs.forEach(
+                    (attr, i) => {
+                        if (i && i % 8 == 0) {
+                            nullFields.push(running);
+                            running = 0;
+                        }
+                        running |= (attr == undefined ? 1 : 0) << (i % 8);
+                    })
+                nullFields.push(running);
+            } 
+        } 
+
+        writer.bytes(new Uint8Array(nullFields));
+
+        // An encoding for each non-null field, concatenated together.
+        let positions = this.encodingPositionsAreTrivial ? this.encodingPositions : this.encodingPositionsArgsSorted;
+
+        positions.forEach(
+            (i) => {
+                let attr = attrs[i];
+                if (attr == undefined) {
+                    if (!this.fieldNullable[i]) {
+                        throw new Error(`Attempted to encode null for non-nullable field \"${this.schema.fields[i].name}\".`);
+                    }
+                } else {
+                    this.components[i].encode(attr, writer, Context.needsDelimiters);
+                }
+            })
+    }
+
+    decode(reader: Reader, context: Context): any {
+        let nFields = reader.int32();
+
+        // Addressing Null values
+        let nulls: any[],
+            hasNulls = false,
+            nullMaskBytes = reader.int32(),
+            nullMask = reader.buf.slice(reader.pos, reader.pos + nullMaskBytes);
+
+        reader.pos += nullMaskBytes;
+
+        if (nullMask.length > 0) {
+            hasNulls = true;
+
+            let running = 0;
+            nulls = Array(nFields).fill(0).map(
+                (_, i) => {
+                    if (i % 8 == 0) {
+                        let chunk = Math.floor(i / 8);
+                        running = chunk >= nullMask.length ? 0 : nullMask[chunk];
+                    }
+                    return (running >> (i % 8)) & 0x01;
+                })
+        }
+
+        // Note that if this coder's schema has *fewer* attributes than the encoded
+        // value, we just need to ignore the additional values, which will occur
+        // here because we only decode as many values as we have coders for.
+        let positions = this.encodingPositionsAreTrivial ? this.encodingPositions : this.encodingPositionsArgsSorted,
+            sortedComponents = positions.slice(0, Math.min(this.nFields, nFields)).map(
+                (i) => {
+                    if (hasNulls && nulls[i]) {
+                        return undefined;
+                    } else {
+                        return this.components[i].decode(reader, Context.needsDelimiters);
+                    }
+                })
+
+        // If this coder's schema has more attributes than the encoded value, then
+        // the schema must have changed. Populate the unencoded fields with nulls.
+        while (sortedComponents.length < this.nFields) {
+            sortedComponents.push(undefined);
+        }
+
+        let obj:any = {};
+        positions.forEach(
+            (i) => {
+                obj = this.addFieldOfType(obj, this.schema.fields[i], sortedComponents[i])
+            }
+        );
+
+        return obj;
+    }
+
+
+    toProto(pipelineContext: PipelineContext): runnerApi.Coder {
+        return {
+            spec: {
+                urn: RowCoder.URN,
+                payload: new Uint8Array(),
+            },
+            componentCoderIds: [],
+        };
+    }
+}
+CODER_REGISTRY.register(RowCoder.URN, RowCoder);