Exemples Apache Arrow
Exemples de format colonnaire en mémoire Apache Arrow pour le traitement de données et l'analyse haute performance
Key Facts
- Category
- Data Formats
- Items
- 6
- Format Families
- json, markdown, text
Sample Overview
Exemples de format colonnaire en mémoire Apache Arrow pour le traitement de données et l'analyse haute performance. This sample set belongs to Data Formats and can be used to test related workflows inside Elysia Tools.
📋 Définition de Schéma Arrow de Base json
Exemples essentiels de schéma Arrow avec types de données primitifs pour le stockage de données colonnaires
{
"schema": {
"fields": [
{
"name": "id",
"type": "int64",
"nullable": false,
"metadata": {}
},
{
"name": "name",
"type": "string",
"nullable": false,
"metadata": {
"description": "Person name"
}
},
{
"name": "age",
"type": "int32",
"nullable": true,
"metadata": {}
},
{
"name": "email",
"type": "string",
"nullable": true,
"metadata": {}
},
{
"name": "is_active",
"type": "bool",
"nullable": false,
"metadata": {}
},
{
"name": "salary",
"type": "float64",
"nullable": true,
"metadata": {
"unit": "USD"
}
},
{
"name": "created_at",
"type": "timestamp[us]",
"nullable": false,
"metadata": {}
}
],
"metadata": {
"description": "Employee data schema"
}
}
}
📋 Exemple de Lot d'Enregistrements Arrow json
Lot d'enregistrements Arrow complet avec données montrant la disposition mémoire colonnaire
{
"schema": {
"fields": [
{
"name": "product_id",
"type": "int64",
"nullable": false
},
{
"name": "product_name",
"type": "string",
"nullable": false
},
{
"name": "price",
"type": "float64",
"nullable": false
},
{
"name": "in_stock",
"type": "bool",
"nullable": false
}
]
},
"batches": [
{
"count": 4,
"columns": [
{
"name": "product_id",
"type": "int64",
"data": [101, 102, 103, 104]
},
{
"name": "product_name",
"type": "string",
"data": ["Laptop", "Mouse", "Keyboard", "Monitor"]
},
{
"name": "price",
"type": "float64",
"data": [1299.99, 29.99, 79.99, 399.99]
},
{
"name": "in_stock",
"type": "bool",
"data": [true, true, false, true]
}
]
}
]
}
📋 Types de Données Complexes Arrow json
Schéma Arrow avancé avec types imbriqués, liste et map pour structures de données complexes
{
"schema": {
"fields": [
{
"name": "user_id",
"type": "int64",
"nullable": false
},
{
"name": "profile",
"type": {
"type": "struct",
"fields": [
{
"name": "first_name",
"type": "string",
"nullable": false
},
{
"name": "last_name",
"type": "string",
"nullable": false
},
{
"name": "age",
"type": "int32",
"nullable": true
}
]
},
"nullable": true
},
{
"name": "tags",
"type": {
"type": "list",
"item_type": "string",
"nullable": false
},
"nullable": true
},
{
"name": "metadata",
"type": {
"type": "map",
"key_type": "string",
"value_type": "string",
"value_nullable": true
},
"nullable": true
},
{
"name": "purchase_history",
"type": {
"type": "list",
"item_type": {
"type": "struct",
"fields": [
{
"name": "product_id",
"type": "int64",
"nullable": false
},
{
"name": "timestamp",
"type": "timestamp[us]",
"nullable": false
},
{
"name": "amount",
"type": "decimal(10,2)",
"nullable": false
}
]
},
"nullable": false
},
"nullable": true
}
]
}
}
📋 Encodage Dictionnaire Arrow json
Encodage par dictionnaire Arrow pour le stockage efficace des chaînes et la compression
{
"schema": {
"fields": [
{
"name": "id",
"type": "int64",
"nullable": false
},
{
"name": "category",
"type": {
"type": "dictionary",
"index_type": "int32",
"value_type": "string",
"ordered": false
},
"nullable": false,
"dictionary": {
"type": "string",
"values": ["Electronics", "Clothing", "Books", "Home", "Sports"]
}
},
{
"name": "status",
"type": {
"type": "dictionary",
"index_type": "int8",
"value_type": "string",
"ordered": true
},
"nullable": false,
"dictionary": {
"type": "string",
"values": ["Active", "Inactive", "Pending", "Deleted"]
}
}
]
},
"batches": [
{
"count": 5,
"columns": [
{
"name": "id",
"type": "int64",
"data": [1, 2, 3, 4, 5]
},
{
"name": "category",
"type": "dictionary[int32]",
"data": [0, 1, 0, 3, 2]
},
{
"name": "status",
"type": "dictionary[int8]",
"data": [0, 0, 2, 0, 1]
}
]
}
]
}
📋 Données de Séries Temporelles avec Arrow json
Schéma Arrow optimisé pour les données de séries temporelles, avec types temporels et champs de fenêtrage (window_start, window_end)
{
"schema": {
"fields": [
{
"name": "timestamp",
"type": "timestamp[ns]",
"nullable": false,
"metadata": {
"timezone": "UTC",
"unit": "nanoseconds"
}
},
{
"name": "metric_name",
"type": "string",
"nullable": false
},
{
"name": "value",
"type": "float64",
"nullable": true,
"metadata": {
"precision": "double"
}
},
{
"name": "tags",
"type": {
"type": "map",
"key_type": "string",
"value_type": "string",
"value_nullable": true
},
"nullable": true
},
{
"name": "window_start",
"type": "timestamp[us]",
"nullable": true
},
{
"name": "window_end",
"type": "timestamp[us]",
"nullable": true
},
{
"name": "aggregation_type",
"type": {
"type": "dictionary",
"index_type": "int8",
"value_type": "string",
"ordered": false
},
"nullable": true,
"dictionary": {
"type": "string",
"values": ["sum", "avg", "min", "max", "count"]
}
}
],
"metadata": {
"description": "Time series metrics schema",
"compression": "lz4",
"partitioning": "by_hour"
}
},
"metadata": {
"name": "timeseries_metrics",
"created_at": "2025-11-30T12:00:00Z",
"version": "1.0",
"creator": "Arrow Time Series Pipeline"
}
}
📋 Optimisation de Performance Arrow json
Techniques avancées d'optimisation Arrow pour l'analyse haute performance et l'efficacité mémoire
{
"schema": {
"fields": [
{
"name": "partition_key",
"type": "int64",
"nullable": false,
"metadata": {
"partition_key": true
}
},
{
"name": "compressed_strings",
"type": {
"type": "dictionary",
"index_type": "int32",
"value_type": "string",
"ordered": false
},
"nullable": false,
"metadata": {
"compression": "dictionary"
}
},
{
"name": "dense_numeric",
"type": "float32",
"nullable": false,
"metadata": {
"precision": "single",
"compression": "bitshuffle"
}
},
{
"name": "sparse_boolean",
"type": "bool",
"nullable": true,
"metadata": {
"encoding": "run_length"
}
},
{
"name": "large_array",
"type": {
"type": "list",
"item_type": "int64",
"nullable": false
},
"nullable": true,
"metadata": {
"large_list": true,
"offset_size": "64bit"
}
},
{
"name": "union_field",
"type": {
"type": "union",
"mode": "dense",
"types": [
{
"type": "int64",
"id": 0
},
{
"type": "string",
"id": 1
},
{
"type": "float64",
"id": 2
}
]
},
"nullable": true
}
],
"metadata": {
"optimization": "performance",
"memory_layout": "columnar",
"alignment": 64
}
},
"performance_hints": {
"batch_size": 65536,
"memory_pool": "default",
"thread_safety": true,
"zero_copy": true,
"simd_optimized": true
}
}