Skip to content

Commit

Permalink
feat(outputs.parquet): Introduce Parquet output
Browse files Browse the repository at this point in the history
  • Loading branch information
powersj committed Jul 16, 2024
1 parent 90fdcfc commit b0c805e
Show file tree
Hide file tree
Showing 8 changed files with 690 additions and 18 deletions.
12 changes: 6 additions & 6 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ require (
github.com/antchfx/jsonquery v1.3.3
github.com/antchfx/xmlquery v1.4.0
github.com/antchfx/xpath v1.3.0
github.com/apache/arrow/go/v13 v13.0.0
github.com/apache/arrow/go/v16 v16.0.0-20240319161736-1ee3da0064a0
github.com/apache/arrow/go/v16 v16.1.0
github.com/apache/arrow/go/v17 v17.0.0
github.com/apache/iotdb-client-go v1.2.0-tsbs
github.com/apache/thrift v0.19.0
github.com/apache/thrift v0.20.0
github.com/aristanetworks/goarista v0.0.0-20190325233358-a123909ec740
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5
github.com/awnumar/memguard v0.22.5
Expand Down Expand Up @@ -344,7 +344,7 @@ require (
github.com/golang-sql/sqlexp v0.1.0 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/flatbuffers v24.3.7+incompatible // indirect
github.com/google/flatbuffers v24.3.25+incompatible // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-querystring v1.1.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
Expand All @@ -367,7 +367,7 @@ require (
github.com/hashicorp/golang-lru v1.0.2 // indirect
github.com/hashicorp/packer-plugin-sdk v0.3.2 // indirect
github.com/hashicorp/serf v0.10.1 // indirect
github.com/huandu/xstrings v1.3.3 // indirect
github.com/huandu/xstrings v1.4.0 // indirect
github.com/imdario/mergo v0.3.16 // indirect
github.com/jackc/chunkreader/v2 v2.0.1 // indirect
github.com/jackc/pgpassfile v1.0.0 // indirect
Expand All @@ -388,7 +388,7 @@ require (
github.com/json-iterator/go v1.1.12 // indirect
github.com/jzelinskie/whirlpool v0.0.0-20201016144138-0675e54bb004 // indirect
github.com/klauspost/asmfmt v1.3.2 // indirect
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
github.com/klauspost/cpuid/v2 v2.2.8 // indirect
github.com/kr/fs v0.1.0 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/leodido/ragel-machinery v0.0.0-20190525184631-5f46317e436b // indirect
Expand Down
23 changes: 12 additions & 11 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -827,18 +827,18 @@ github.com/antlr4-go/antlr/v4 v4.13.0 h1:lxCg3LAv+EUK6t1i0y1V6/SLeUi0eKEKdhQAlS8
github.com/antlr4-go/antlr/v4 v4.13.0/go.mod h1:pfChB/xh/Unjila75QW7+VU4TSnWnnk9UTnmpPaOR2g=
github.com/apache/arrow/go/v10 v10.0.1/go.mod h1:YvhnlEePVnBS4+0z3fhPfUy7W1Ikj0Ih0vcRo/gZ1M0=
github.com/apache/arrow/go/v11 v11.0.0/go.mod h1:Eg5OsL5H+e299f7u5ssuXsuHQVEGC4xei5aX110hRiI=
github.com/apache/arrow/go/v13 v13.0.0 h1:kELrvDQuKZo8csdWYqBQfyi431x6Zs/YJTEgUuSVcWk=
github.com/apache/arrow/go/v13 v13.0.0/go.mod h1:W69eByFNO0ZR30q1/7Sr9d83zcVZmF2MiP3fFYAWJOc=
github.com/apache/arrow/go/v15 v15.0.2 h1:60IliRbiyTWCWjERBCkO1W4Qun9svcYoZrSLcyOsMLE=
github.com/apache/arrow/go/v15 v15.0.2/go.mod h1:DGXsR3ajT524njufqf95822i+KTh+yea1jass9YXgjA=
github.com/apache/arrow/go/v16 v16.0.0-20240319161736-1ee3da0064a0 h1:XbC214lVvnAnDzowGV7dYiv4f4Aa6jhtIby08OgbcUg=
github.com/apache/arrow/go/v16 v16.0.0-20240319161736-1ee3da0064a0/go.mod h1:VVbdJivCXZAJ6IhOSCSzk/RVQ/PlcitjskAWEST3Sc0=
github.com/apache/arrow/go/v16 v16.1.0 h1:dwgfOya6s03CzH9JrjCBx6bkVb4yPD4ma3haj9p7FXI=
github.com/apache/arrow/go/v16 v16.1.0/go.mod h1:9wnc9mn6vEDTRIm4+27pEjQpRKuTvBaessPoEXQzxWA=
github.com/apache/arrow/go/v17 v17.0.0 h1:RRR2bdqKcdbss9Gxy2NS/hK8i4LDMh23L6BbkN5+F54=
github.com/apache/arrow/go/v17 v17.0.0/go.mod h1:jR7QHkODl15PfYyjM2nU+yTLScZ/qfj7OSUZmJ8putc=
github.com/apache/iotdb-client-go v1.2.0-tsbs h1:hezGUydAkDSceCvsetYorI87S2e8HZ4hTQHmGZgOGDY=
github.com/apache/iotdb-client-go v1.2.0-tsbs/go.mod h1:3D6QYkqRmASS/4HsjU+U/3fscyc5M9xKRfywZsKuoZY=
github.com/apache/thrift v0.15.0/go.mod h1:PHK3hniurgQaNMZYaCLEqXKsYK8upmhPbmdP2FXSqgU=
github.com/apache/thrift v0.16.0/go.mod h1:PHK3hniurgQaNMZYaCLEqXKsYK8upmhPbmdP2FXSqgU=
github.com/apache/thrift v0.19.0 h1:sOqkWPzMj7w6XaYbJQG7m4sGqVolaW/0D28Ln7yPzMk=
github.com/apache/thrift v0.19.0/go.mod h1:SUALL216IiaOw2Oy+5Vs9lboJ/t9g40C+G07Dc0QC1I=
github.com/apache/thrift v0.20.0 h1:631+KvYbsBZxmuJjYwhezVsrfc/TbqtZV4QcxOX1fOI=
github.com/apache/thrift v0.20.0/go.mod h1:hOk1BQqcp2OLzGsyVXdfMk7YFlMxK3aoEVhjD06QhB8=
github.com/apex/log v1.6.0/go.mod h1:x7s+P9VtvFBXge9Vbn+8TrqKmuzmD35TTkeBHul8UtY=
github.com/apex/logs v1.0.0/go.mod h1:XzxuLZ5myVHDy9SAmYpamKKRNApGj54PfYLcFrXqDwo=
github.com/aphistic/golf v0.0.0-20180712155816-02c07f170c5a/go.mod h1:3NqKYiepwy8kCu4PNA+aP7WUV72eXWJeP9/r3/K9aLE=
Expand Down Expand Up @@ -1382,8 +1382,8 @@ github.com/google/cel-go v0.20.1 h1:nDx9r8S3L4pE61eDdt8igGj8rf5kjYR3ILxWIpWNi84=
github.com/google/cel-go v0.20.1/go.mod h1:kWcIzTsPX0zmQ+H3TirHstLLf9ep5QTsZBN9u4dOYLg=
github.com/google/flatbuffers v1.12.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/flatbuffers v2.0.8+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/flatbuffers v24.3.7+incompatible h1:BxGUkIQnOciBu33bd5BdvqY8Qvo0O/GR4SPhh7x9Ed0=
github.com/google/flatbuffers v24.3.7+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI=
github.com/google/flatbuffers v24.3.25+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I=
github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U=
github.com/google/gnxi v0.0.0-20231026134436-d82d9936af15 h1:EETGSLGKBReUUYZdztSp45EzTE6CHw2qMKIfyPrgp6c=
Expand Down Expand Up @@ -1593,8 +1593,9 @@ github.com/henrybear327/go-proton-api v1.0.0/go.mod h1:w63MZuzufKcIZ93pwRgiOtxMX
github.com/hetznercloud/hcloud-go/v2 v2.4.0 h1:MqlAE+w125PLvJRCpAJmEwrIxoVdUdOyuFUhE/Ukbok=
github.com/hetznercloud/hcloud-go/v2 v2.4.0/go.mod h1:l7fA5xsncFBzQTyw29/dw5Yr88yEGKKdc6BHf24ONS0=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/huandu/xstrings v1.3.3 h1:/Gcsuc1x8JVbJ9/rlye4xZnVAbEkGauT8lbebqcQws4=
github.com/huandu/xstrings v1.3.3/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
github.com/huandu/xstrings v1.4.0 h1:D17IlohoQq4UcpqD7fDk80P7l+lwAmlFaBHgOipl2FU=
github.com/huandu/xstrings v1.4.0/go.mod h1:y5/lhBue+AyNmUVz9RLU9xbLR0o4KIIExikq4ovT0aE=
github.com/iancoleman/strcase v0.2.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
Expand Down Expand Up @@ -1766,8 +1767,8 @@ github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2
github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
github.com/klauspost/cpuid/v2 v2.2.8 h1:+StwCXwm9PdpiEkPyzBXIy+M9KUb4ODm0Zarf1kS5BM=
github.com/klauspost/cpuid/v2 v2.2.8/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
github.com/klauspost/pgzip v1.2.4/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
github.com/klauspost/pgzip v1.2.5/go.mod h1:Ch1tH69qFZu15pkjo5kYi6mth2Zzwzt50oCQKQE9RUs=
github.com/klauspost/pgzip v1.2.6 h1:8RXeL5crjEUFnR2/Sn6GJNWtSQ3Dk8pq4CL3jvdDyjU=
Expand Down
2 changes: 1 addition & 1 deletion plugins/inputs/sql/drivers.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
_ "github.com/ClickHouse/clickhouse-go"
_ "github.com/IBM/nzgo/v12"
_ "github.com/SAP/go-hdb/driver"
_ "github.com/apache/arrow/go/v13/arrow/flight/flightsql/driver"
_ "github.com/apache/arrow/go/v17/arrow/flight/flightsql/driver"
_ "github.com/go-sql-driver/mysql"
_ "github.com/jackc/pgx/v4/stdlib"
_ "github.com/microsoft/go-mssqldb"
Expand Down
5 changes: 5 additions & 0 deletions plugins/outputs/all/parquet.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
//go:build !custom || outputs || outputs.parquet

package all

import _ "github.com/influxdata/telegraf/plugins/outputs/parquet" // register plugin
106 changes: 106 additions & 0 deletions plugins/outputs/parquet/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Parquet Output Plugin

This plugin sends writes metrics to parquet files. By default, the parquet
output will groups metrics by metric name and write those metrics all to the
same file. If a metric schema does not match then metrics are dropped.

To lean more about Parquet check out the [Parquet docs][] as well as a blog
post on [Querying Parquet][].

[Parquet docs]: https://parquet.apache.org/docs/
[Querying Parquet]: https://www.influxdata.com/blog/querying-parquet-millisecond-latency/

## Global configuration options <!-- @/docs/includes/plugin_config.md -->

In addition to the plugin-specific configuration settings, plugins support
additional global and plugin configuration settings. These settings are used to
modify metrics, tags, and field or create aliases and configure ordering, etc.
See the [CONFIGURATION.md][CONFIGURATION.md] for more details.

[CONFIGURATION.md]: ../../../docs/CONFIGURATION.md#plugins

## Configuration

```toml @sample.conf
# A plugin that writes metrics to parquet files
[[outputs.parquet]]
## Directory to write parquet files in. If a file already exists the output
## will attempt to continue using the existing file.
# directory = "."

## Files are rotated after the time interval specified. When set to 0 no time
## based rotation is performed.
# rotation_interval = "0h"

## Timestamp field name
## Field name to use to store the timestamp. If set to an empty string, then
## the timestamp is omitted.
# timestamp_field_name = "timestamp"
```

## Building Parquet Files

### Schema

Parquet files require a schema when writing files. To generate a schema,
Telegraf will go through all grouped metrics and generate an Apache Arrow schema
based on the union of all fields and tags. If a field and tag have the same name
then the field takes precedence.

The consequence of schema generation is that the very first flush sequence a
metric is seen takes much longer due to the additional looping through the
metrics to generate the schema. Subsequent flush intervals are significantly
faster.

When writing to a file, the schema is used to look for each value and if it is
not present a null value is added. The result is that if additional fields are
present after the first metric flush those fields are omitted.

### Write

The plugin makes use of the buffered writer. This may buffer some metrics into
memory before writing it to disk. This method is used as it can more compactly
write multiple flushes of metrics into a single Parquet row group.

Additionally, the Parquet format requires a proper footer, so close must be
called on the file to ensure it is properly formatted.

## File Rotation

If a file with the same target name exists at start, the existing file is
rotated to avoid over-writing it or conflicting schema.

File rotation is available via a time based interval that a user can optionally
set. Due to the usage of a buffered writer, a size based rotation is not
possible as the file may not actually get data at each interval.

## Explore Parquet Files

If a user wishes to explore a schema or data in a Parquet file quickly, then
look at the

### CLI

The Arrow repo contains a Go CLI tool to read and parse Parquet files:

```s
go install github.com/apache/arrow/go/v16/parquet/cmd/parquet_reader@latest
parquet_reader <file>
```

### Python

Users can also use the [pyarrow][] library to quick open and explore Parquet
files:

```python
import pyarrow.parquet as pq

table = pq.read_table('example.parquet')
```

Once created, a user can look the various [pyarrow.Table][] functions to further
explore the data.

[pyarrow]: https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html
[pyarrow.Table]: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html#pyarrow.Table
Loading

0 comments on commit b0c805e

Please sign in to comment.