🎯 Exemples recommandés
Collections d'exemples équilibrées, issues de diverses catégories, à explorer
Exemples Apache Arrow
Exemples de format colonnaire en mémoire Apache Arrow pour le traitement de données et l'analyse haute performance
📋 Définition de Schéma Arrow de Base (JSON)
Exemples essentiels de schéma Arrow avec types de données primitifs pour le stockage de données colonnaires
{
"schema": {
"fields": [
{
"name": "id",
"type": "int64",
"nullable": false,
"metadata": {}
},
{
"name": "name",
"type": "string",
"nullable": false,
"metadata": {
"description": "Person name"
}
},
{
"name": "age",
"type": "int32",
"nullable": true,
"metadata": {}
},
{
"name": "email",
"type": "string",
"nullable": true,
"metadata": {}
},
{
"name": "is_active",
"type": "bool",
"nullable": false,
"metadata": {}
},
{
"name": "salary",
"type": "float64",
"nullable": true,
"metadata": {
"unit": "USD"
}
},
{
"name": "created_at",
"type": "timestamp[us]",
"nullable": false,
"metadata": {}
}
],
"metadata": {
"description": "Employee data schema"
}
}
}
📋 Exemple de Lot d'Enregistrements Arrow (JSON)
Lot d'enregistrements Arrow complet avec données montrant la disposition mémoire colonnaire
{
"schema": {
"fields": [
{
"name": "product_id",
"type": "int64",
"nullable": false
},
{
"name": "product_name",
"type": "string",
"nullable": false
},
{
"name": "price",
"type": "float64",
"nullable": false
},
{
"name": "in_stock",
"type": "bool",
"nullable": false
}
]
},
"batches": [
{
"count": 4,
"columns": [
{
"name": "product_id",
"type": "int64",
"data": [101, 102, 103, 104]
},
{
"name": "product_name",
"type": "string",
"data": ["Laptop", "Mouse", "Keyboard", "Monitor"]
},
{
"name": "price",
"type": "float64",
"data": [1299.99, 29.99, 79.99, 399.99]
},
{
"name": "in_stock",
"type": "bool",
"data": [true, true, false, true]
}
]
}
]
}
📋 Types de Données Complexes Arrow (JSON)
Schéma Arrow avancé avec des types imbriqués (struct), liste et map pour les structures de données complexes
{
"schema": {
"fields": [
{
"name": "user_id",
"type": "int64",
"nullable": false
},
{
"name": "profile",
"type": {
"type": "struct",
"fields": [
{
"name": "first_name",
"type": "string",
"nullable": false
},
{
"name": "last_name",
"type": "string",
"nullable": false
},
{
"name": "age",
"type": "int32",
"nullable": true
}
]
},
"nullable": true
},
{
"name": "tags",
"type": {
"type": "list",
"item_type": "string",
"nullable": false
},
"nullable": true
},
{
"name": "metadata",
"type": {
"type": "map",
"key_type": "string",
"value_type": "string",
"value_nullable": true
},
"nullable": true
},
{
"name": "purchase_history",
"type": {
"type": "list",
"item_type": {
"type": "struct",
"fields": [
{
"name": "product_id",
"type": "int64",
"nullable": false
},
{
"name": "timestamp",
"type": "timestamp[us]",
"nullable": false
},
{
"name": "amount",
"type": "decimal(10,2)",
"nullable": false
}
]
},
"nullable": false
},
"nullable": true
}
]
}
}
📋 Encodage par Dictionnaire Arrow (JSON)
Encodage par dictionnaire Arrow pour le stockage efficace des chaînes de caractères et la compression
{
"schema": {
"fields": [
{
"name": "id",
"type": "int64",
"nullable": false
},
{
"name": "category",
"type": {
"type": "dictionary",
"index_type": "int32",
"value_type": "string",
"ordered": false
},
"nullable": false,
"dictionary": {
"type": "string",
"values": ["Electronics", "Clothing", "Books", "Home", "Sports"]
}
},
{
"name": "status",
"type": {
"type": "dictionary",
"index_type": "int8",
"value_type": "string",
"ordered": true
},
"nullable": false,
"dictionary": {
"type": "string",
"values": ["Active", "Inactive", "Pending", "Deleted"]
}
}
]
},
"batches": [
{
"count": 5,
"columns": [
{
"name": "id",
"type": "int64",
"data": [1, 2, 3, 4, 5]
},
{
"name": "category",
"type": "dictionary[int32]",
"data": [0, 1, 0, 3, 2]
},
{
"name": "status",
"type": "dictionary[int8]",
"data": [0, 0, 2, 0, 1]
}
]
}
]
}
📋 Données de Séries Temporelles avec Arrow (JSON)
Schéma Arrow optimisé pour les données de séries temporelles, avec des types temporels et des champs de fenêtrage
{
"schema": {
"fields": [
{
"name": "timestamp",
"type": "timestamp[ns]",
"nullable": false,
"metadata": {
"timezone": "UTC",
"unit": "nanoseconds"
}
},
{
"name": "metric_name",
"type": "string",
"nullable": false
},
{
"name": "value",
"type": "float64",
"nullable": true,
"metadata": {
"precision": "double"
}
},
{
"name": "tags",
"type": {
"type": "map",
"key_type": "string",
"value_type": "string",
"value_nullable": true
},
"nullable": true
},
{
"name": "window_start",
"type": "timestamp[us]",
"nullable": true
},
{
"name": "window_end",
"type": "timestamp[us]",
"nullable": true
},
{
"name": "aggregation_type",
"type": {
"type": "dictionary",
"index_type": "int8",
"value_type": "string",
"ordered": false
},
"nullable": true,
"dictionary": {
"type": "string",
"values": ["sum", "avg", "min", "max", "count"]
}
}
],
"metadata": {
"description": "Time series metrics schema",
"compression": "lz4",
"partitioning": "by_hour"
}
},
"metadata": {
"name": "timeseries_metrics",
"created_at": "2025-11-30T12:00:00Z",
"version": "1.0",
"creator": "Arrow Time Series Pipeline"
}
}
📋 Optimisation de Performance Arrow (JSON)
Techniques avancées d'optimisation Arrow pour l'analyse haute performance et l'efficacité mémoire
{
"schema": {
"fields": [
{
"name": "partition_key",
"type": "int64",
"nullable": false,
"metadata": {
"partition_key": true
}
},
{
"name": "compressed_strings",
"type": {
"type": "dictionary",
"index_type": "int32",
"value_type": "string",
"ordered": false
},
"nullable": false,
"metadata": {
"compression": "dictionary"
}
},
{
"name": "dense_numeric",
"type": "float32",
"nullable": false,
"metadata": {
"precision": "single",
"compression": "bitshuffle"
}
},
{
"name": "sparse_boolean",
"type": "bool",
"nullable": true,
"metadata": {
"encoding": "run_length"
}
},
{
"name": "large_array",
"type": {
"type": "list",
"item_type": "int64",
"nullable": false
},
"nullable": true,
"metadata": {
"large_list": true,
"offset_size": "64bit"
}
},
{
"name": "union_field",
"type": {
"type": "union",
"mode": "dense",
"types": [
{
"type": "int64",
"id": 0
},
{
"type": "string",
"id": 1
},
{
"type": "float64",
"id": 2
}
]
},
"nullable": true
}
],
"metadata": {
"optimization": "performance",
"memory_layout": "columnar",
"alignment": 64
}
},
"performance_hints": {
"batch_size": 65536,
"memory_pool": "default",
"thread_safety": true,
"zero_copy": true,
"simd_optimized": true
}
}