Apache Arrow 示例
Apache Arrow 内存列式格式示例,用于高性能数据处理和分析
📋 基础 Arrow 模式定义 json
🟢 simple
基本 Arrow 模式示例,包含用于列式数据存储的原始数据类型
🏷️ arrow, schema, primitive, data
{
  "schema": {
    "fields": [
      {
        "name": "id",
        "type": "int64",
        "nullable": false,
        "metadata": {}
      },
      {
        "name": "name",
        "type": "string",
        "nullable": false,
        "metadata": {
          "description": "Person name"
        }
      },
      {
        "name": "age",
        "type": "int32",
        "nullable": true,
        "metadata": {}
      },
      {
        "name": "email",
        "type": "string",
        "nullable": true,
        "metadata": {}
      },
      {
        "name": "is_active",
        "type": "bool",
        "nullable": false,
        "metadata": {}
      },
      {
        "name": "salary",
        "type": "float64",
        "nullable": true,
        "metadata": {
          "unit": "USD"
        }
      },
      {
        "name": "created_at",
        "type": "timestamp[us]",
        "nullable": false,
        "metadata": {}
      }
    ],
    "metadata": {
      "description": "Employee data schema"
    }
  }
}
📋 Arrow 记录批次示例 json
🟢 simple
完整的 Arrow 记录批次示例,以列式内存布局展示模式及其对应的示例数据
🏷️ arrow, batch, columnar, memory
{
  "schema": {
    "fields": [
      {
        "name": "product_id",
        "type": "int64",
        "nullable": false
      },
      {
        "name": "product_name",
        "type": "string",
        "nullable": false
      },
      {
        "name": "price",
        "type": "float64",
        "nullable": false
      },
      {
        "name": "in_stock",
        "type": "bool",
        "nullable": false
      }
    ]
  },
  "batches": [
    {
      "count": 4,
      "columns": [
        {
          "name": "product_id",
          "type": "int64",
          "data": [101, 102, 103, 104]
        },
        {
          "name": "product_name",
          "type": "string",
          "data": ["Laptop", "Mouse", "Keyboard", "Monitor"]
        },
        {
          "name": "price",
          "type": "float64",
          "data": [1299.99, 29.99, 79.99, 399.99]
        },
        {
          "name": "in_stock",
          "type": "bool",
          "data": [true, true, false, true]
        }
      ]
    }
  ]
}
📋 复杂 Arrow 数据类型 json
🟡 intermediate
高级 Arrow 模式,包含嵌套结构体、列表和映射类型,用于表示复杂数据结构
🏷️ arrow, complex, nested, struct, list, map
{
  "schema": {
    "fields": [
      {
        "name": "user_id",
        "type": "int64",
        "nullable": false
      },
      {
        "name": "profile",
        "type": {
          "type": "struct",
          "fields": [
            {
              "name": "first_name",
              "type": "string",
              "nullable": false
            },
            {
              "name": "last_name",
              "type": "string",
              "nullable": false
            },
            {
              "name": "age",
              "type": "int32",
              "nullable": true
            }
          ]
        },
        "nullable": true
      },
      {
        "name": "tags",
        "type": {
          "type": "list",
          "item_type": "string",
          "nullable": false
        },
        "nullable": true
      },
      {
        "name": "metadata",
        "type": {
          "type": "map",
          "key_type": "string",
          "value_type": "string",
          "value_nullable": true
        },
        "nullable": true
      },
      {
        "name": "purchase_history",
        "type": {
          "type": "list",
          "item_type": {
            "type": "struct",
            "fields": [
              {
                "name": "product_id",
                "type": "int64",
                "nullable": false
              },
              {
                "name": "timestamp",
                "type": "timestamp[us]",
                "nullable": false
              },
              {
                "name": "amount",
                "type": "decimal(10,2)",
                "nullable": false
              }
            ]
          },
          "nullable": false
        },
        "nullable": true
      }
    ]
  }
}
📋 Arrow 字典编码 json
🟡 intermediate
用于高效字符串存储和压缩的 Arrow 字典编码
🏷️ arrow, dictionary, encoding, compression
{
  "schema": {
    "fields": [
      {
        "name": "id",
        "type": "int64",
        "nullable": false
      },
      {
        "name": "category",
        "type": {
          "type": "dictionary",
          "index_type": "int32",
          "value_type": "string",
          "ordered": false
        },
        "nullable": false,
        "dictionary": {
          "type": "string",
          "values": ["Electronics", "Clothing", "Books", "Home", "Sports"]
        }
      },
      {
        "name": "status",
        "type": {
          "type": "dictionary",
          "index_type": "int8",
          "value_type": "string",
          "ordered": true
        },
        "nullable": false,
        "dictionary": {
          "type": "string",
          "values": ["Active", "Inactive", "Pending", "Deleted"]
        }
      }
    ]
  },
  "batches": [
    {
      "count": 5,
      "columns": [
        {
          "name": "id",
          "type": "int64",
          "data": [1, 2, 3, 4, 5]
        },
        {
          "name": "category",
          "type": "dictionary[int32]",
          "data": [0, 1, 0, 3, 2]
        },
        {
          "name": "status",
          "type": "dictionary[int8]",
          "data": [0, 0, 2, 0, 1]
        }
      ]
    }
  ]
}
📋 Arrow 时间序列数据 json
🔴 complex
为时间序列数据优化的 Arrow 模式,包含时间戳类型、窗口起止时间字段和聚合类型字典
🏷️ arrow, time-series, temporal, metrics
{
  "schema": {
    "fields": [
      {
        "name": "timestamp",
        "type": "timestamp[ns]",
        "nullable": false,
        "metadata": {
          "timezone": "UTC",
          "unit": "nanoseconds"
        }
      },
      {
        "name": "metric_name",
        "type": "string",
        "nullable": false
      },
      {
        "name": "value",
        "type": "float64",
        "nullable": true,
        "metadata": {
          "precision": "double"
        }
      },
      {
        "name": "tags",
        "type": {
          "type": "map",
          "key_type": "string",
          "value_type": "string",
          "value_nullable": true
        },
        "nullable": true
      },
      {
        "name": "window_start",
        "type": "timestamp[us]",
        "nullable": true
      },
      {
        "name": "window_end",
        "type": "timestamp[us]",
        "nullable": true
      },
      {
        "name": "aggregation_type",
        "type": {
          "type": "dictionary",
          "index_type": "int8",
          "value_type": "string",
          "ordered": false
        },
        "nullable": true,
        "dictionary": {
          "type": "string",
          "values": ["sum", "avg", "min", "max", "count"]
        }
      }
    ],
    "metadata": {
      "description": "Time series metrics schema",
      "compression": "lz4",
      "partitioning": "by_hour"
    }
  },
  "metadata": {
    "name": "timeseries_metrics",
    "created_at": "2025-11-30T12:00:00Z",
    "version": "1.0",
    "creator": "Arrow Time Series Pipeline"
  }
}
📋 Arrow 性能优化 json
🔴 complex
面向高性能分析和内存效率的高级 Arrow 优化技术示例
🏷️ arrow, performance, optimization, memory, simd
{
  "schema": {
    "fields": [
      {
        "name": "partition_key",
        "type": "int64",
        "nullable": false,
        "metadata": {
          "partition_key": true
        }
      },
      {
        "name": "compressed_strings",
        "type": {
          "type": "dictionary",
          "index_type": "int32",
          "value_type": "string",
          "ordered": false
        },
        "nullable": false,
        "metadata": {
          "compression": "dictionary"
        }
      },
      {
        "name": "dense_numeric",
        "type": "float32",
        "nullable": false,
        "metadata": {
          "precision": "single",
          "compression": "bitshuffle"
        }
      },
      {
        "name": "sparse_boolean",
        "type": "bool",
        "nullable": true,
        "metadata": {
          "encoding": "run_length"
        }
      },
      {
        "name": "large_array",
        "type": {
          "type": "large_list",
          "item_type": "int64",
          "nullable": false
        },
        "nullable": true,
        "metadata": {
          "large_list": true,
          "offset_size": "64bit"
        }
      },
      {
        "name": "union_field",
        "type": {
          "type": "union",
          "mode": "dense",
          "types": [
            {
              "type": "int64",
              "id": 0
            },
            {
              "type": "string",
              "id": 1
            },
            {
              "type": "float64",
              "id": 2
            }
          ]
        },
        "nullable": true
      }
    ],
    "metadata": {
      "optimization": "performance",
      "memory_layout": "columnar",
      "alignment": 64
    }
  },
  "performance_hints": {
    "batch_size": 65536,
    "memory_pool": "default",
    "thread_safety": true,
    "zero_copy": true,
    "simd_optimized": true
  }
}