🎯 Рекомендуемые коллекции
Сбалансированные коллекции примеров кода из различных категорий, которые вы можете изучить
Примеры Apache Arrow
Примеры использования Apache Arrow — формата данных в памяти для высокопроизводительной обработки данных и аналитики
📋 Базовое определение схемы Arrow json
🟢 simple
Основные примеры схемы Arrow с примитивными типами данных для хранения колоночных данных
🏷️ arrow, schema, primitive, data
{
"schema": {
"fields": [
{
"name": "id",
"type": "int64",
"nullable": false,
"metadata": {}
},
{
"name": "name",
"type": "string",
"nullable": false,
"metadata": {
"description": "Person name"
}
},
{
"name": "age",
"type": "int32",
"nullable": true,
"metadata": {}
},
{
"name": "email",
"type": "string",
"nullable": true,
"metadata": {}
},
{
"name": "is_active",
"type": "bool",
"nullable": false,
"metadata": {}
},
{
"name": "salary",
"type": "float64",
"nullable": true,
"metadata": {
"unit": "USD"
}
},
{
"name": "created_at",
"type": "timestamp[us]",
"nullable": false,
"metadata": {}
}
],
"metadata": {
"description": "Employee data schema"
}
}
}
📋 Пример пакета записей Arrow json
🟢 simple
Полный пакет записей Arrow с данными, показывающими колоночное размещение в памяти
🏷️ arrow, batch, columnar, memory
{
"schema": {
"fields": [
{
"name": "product_id",
"type": "int64",
"nullable": false
},
{
"name": "product_name",
"type": "string",
"nullable": false
},
{
"name": "price",
"type": "float64",
"nullable": false
},
{
"name": "in_stock",
"type": "bool",
"nullable": false
}
]
},
"batches": [
{
"count": 4,
"columns": [
{
"name": "product_id",
"type": "int64",
"data": [101, 102, 103, 104]
},
{
"name": "product_name",
"type": "string",
"data": ["Laptop", "Mouse", "Keyboard", "Monitor"]
},
{
"name": "price",
"type": "float64",
"data": [1299.99, 29.99, 79.99, 399.99]
},
{
"name": "in_stock",
"type": "bool",
"data": [true, true, false, true]
}
]
}
]
}
📋 Сложные типы данных Arrow json
🟡 intermediate
Продвинутая схема Arrow с вложенными, списочными и map-типами для сложных структур данных
🏷️ arrow, complex, nested, struct, list, map
{
"schema": {
"fields": [
{
"name": "user_id",
"type": "int64",
"nullable": false
},
{
"name": "profile",
"type": {
"type": "struct",
"fields": [
{
"name": "first_name",
"type": "string",
"nullable": false
},
{
"name": "last_name",
"type": "string",
"nullable": false
},
{
"name": "age",
"type": "int32",
"nullable": true
}
]
},
"nullable": true
},
{
"name": "tags",
"type": {
"type": "list",
"item_type": "string",
"nullable": false
},
"nullable": true
},
{
"name": "metadata",
"type": {
"type": "map",
"key_type": "string",
"value_type": "string",
"value_nullable": true
},
"nullable": true
},
{
"name": "purchase_history",
"type": {
"type": "list",
"item_type": {
"type": "struct",
"fields": [
{
"name": "product_id",
"type": "int64",
"nullable": false
},
{
"name": "timestamp",
"type": "timestamp[us]",
"nullable": false
},
{
"name": "amount",
"type": "decimal(10,2)",
"nullable": false
}
]
},
"nullable": false
},
"nullable": true
}
]
}
}
📋 Словарное кодирование Arrow json
🟡 intermediate
Словарное кодирование Arrow для эффективного хранения и сжатия строк
🏷️ arrow, dictionary, encoding, compression
{
"schema": {
"fields": [
{
"name": "id",
"type": "int64",
"nullable": false
},
{
"name": "category",
"type": {
"type": "dictionary",
"index_type": "int32",
"value_type": "string",
"ordered": false
},
"nullable": false,
"dictionary": {
"type": "string",
"values": ["Electronics", "Clothing", "Books", "Home", "Sports"]
}
},
{
"name": "status",
"type": {
"type": "dictionary",
"index_type": "int8",
"value_type": "string",
"ordered": true
},
"nullable": false,
"dictionary": {
"type": "string",
"values": ["Active", "Inactive", "Pending", "Deleted"]
}
}
]
},
"batches": [
{
"count": 5,
"columns": [
{
"name": "id",
"type": "int64",
"data": [1, 2, 3, 4, 5]
},
{
"name": "category",
"type": "dictionary[int32]",
"data": [0, 1, 0, 3, 2]
},
{
"name": "status",
"type": "dictionary[int8]",
"data": [0, 0, 2, 0, 1]
}
]
}
]
}
📋 Данные временных рядов с Arrow json
🔴 complex
Оптимизированная схема Arrow для данных временных рядов с типами даты и времени и полями оконной агрегации
🏷️ arrow, time-series, temporal, metrics
{
"schema": {
"fields": [
{
"name": "timestamp",
"type": "timestamp[ns]",
"nullable": false,
"metadata": {
"timezone": "UTC",
"unit": "nanoseconds"
}
},
{
"name": "metric_name",
"type": "string",
"nullable": false
},
{
"name": "value",
"type": "float64",
"nullable": true,
"metadata": {
"precision": "double"
}
},
{
"name": "tags",
"type": {
"type": "map",
"key_type": "string",
"value_type": "string",
"value_nullable": true
},
"nullable": true
},
{
"name": "window_start",
"type": "timestamp[us]",
"nullable": true
},
{
"name": "window_end",
"type": "timestamp[us]",
"nullable": true
},
{
"name": "aggregation_type",
"type": {
"type": "dictionary",
"index_type": "int8",
"value_type": "string",
"ordered": false
},
"nullable": true,
"dictionary": {
"type": "string",
"values": ["sum", "avg", "min", "max", "count"]
}
}
],
"metadata": {
"description": "Time series metrics schema",
"compression": "lz4",
"partitioning": "by_hour"
}
},
"metadata": {
"name": "timeseries_metrics",
"created_at": "2025-11-30T12:00:00Z",
"version": "1.0",
"creator": "Arrow Time Series Pipeline"
}
}
📋 Оптимизация производительности Arrow json
🔴 complex
Продвинутые техники оптимизации Arrow для высокопроизводительной аналитики и эффективного использования памяти
🏷️ arrow, performance, optimization, memory, simd
{
"schema": {
"fields": [
{
"name": "partition_key",
"type": "int64",
"nullable": false,
"metadata": {
"partition_key": true
}
},
{
"name": "compressed_strings",
"type": {
"type": "dictionary",
"index_type": "int32",
"value_type": "string",
"ordered": false
},
"nullable": false,
"metadata": {
"compression": "dictionary"
}
},
{
"name": "dense_numeric",
"type": "float32",
"nullable": false,
"metadata": {
"precision": "single",
"compression": "bitshuffle"
}
},
{
"name": "sparse_boolean",
"type": "bool",
"nullable": true,
"metadata": {
"encoding": "run_length"
}
},
{
"name": "large_array",
"type": {
"type": "list",
"item_type": "int64",
"nullable": false
},
"nullable": true,
"metadata": {
"large_list": true,
"offset_size": "64bit"
}
},
{
"name": "union_field",
"type": {
"type": "union",
"mode": "dense",
"types": [
{
"type": "int64",
"id": 0
},
{
"type": "string",
"id": 1
},
{
"type": "float64",
"id": 2
}
]
},
"nullable": true
}
],
"metadata": {
"optimization": "performance",
"memory_layout": "columnar",
"alignment": 64
}
},
"performance_hints": {
"batch_size": 65536,
"memory_pool": "default",
"thread_safety": true,
"zero_copy": true,
"simd_optimized": true
}
}