Exemples Parquet
Exemples du format de stockage orienté colonnes Apache Parquet, avec schémas, compression et traitement de données
Key Facts
- Category
- Data Formats
- Items
- 3
- Format Families
- json
Sample Overview
Exemples du format de stockage orienté colonnes Apache Parquet, avec schémas, compression et traitement de données. This sample set belongs to the Data Formats category and can be used to test related workflows inside Elysia Tools.
📋 Définition de Schéma Parquet de Base json
🟢 simple
⭐
Exemples de schéma Parquet de base avec types primitifs et imbriqués pour le stockage de données
⏱️ 10 min
🏷️ parquet, schema, data format
Prerequisites:
Basic data concepts, Columnar storage basics
{
"schema": {
"fields": [
{
"name": "id",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": null
},
{
"name": "name",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "email",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "age",
"type": "int32",
"repetition_type": "OPTIONAL",
"converted_type": null
},
{
"name": "is_active",
"type": "boolean",
"repetition_type": "REQUIRED",
"converted_type": null
},
{
"name": "created_at",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": "TIMESTAMP_MILLIS"
},
{
"name": "salary",
"type": "double",
"repetition_type": "OPTIONAL",
"converted_type": null
}
]
},
"data": [
{
"id": 1,
"name": "John Doe",
"email": "john.doe@example.com",
"age": 30,
"is_active": true,
"created_at": 1609459200000,
"salary": 75000.00
},
{
"id": 2,
"name": "Jane Smith",
"email": null,
"age": 28,
"is_active": true,
"created_at": 1609545600000,
"salary": 82000.00
},
{
"id": 3,
"name": "Bob Johnson",
"email": "bob.johnson@example.com",
"age": null,
"is_active": false,
"created_at": 1609632000000,
"salary": 68000.00
}
],
"compression": {
"codec": "snappy",
"level": "default"
},
"metadata": {
"created_by": "data_engineering_team",
"description": "Employee data with basic personal information"
}
}
📋 Structures de Données Imbriquées Parquet json
🟡 intermediate
⭐⭐⭐
Exemples de données imbriquées complexes avec structs, lists et maps pour le stockage hiérarchique
⏱️ 20 min
🏷️ parquet, nested data, complex structures
Prerequisites:
Parquet basics, Nested data concepts, JSON structures
{
"schema": {
"fields": [
{
"name": "order_id",
"type": "int64",
"repetition_type": "REQUIRED"
},
{
"name": "customer",
"type": "group",
"repetition_type": "REQUIRED",
"fields": [
{
"name": "customer_id",
"type": "int64",
"repetition_type": "REQUIRED"
},
{
"name": "name",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "email",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "address",
"type": "group",
"repetition_type": "OPTIONAL",
"fields": [
{
"name": "street",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "city",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "state",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "postal_code",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
}
]
}
]
},
{
"name": "products",
"type": "list",
"repetition_type": "REQUIRED",
"converted_type": "LIST",
"fields": [
{
"name": "list",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "element",
"type": "group",
"repetition_type": "REQUIRED",
"fields": [
{
"name": "product_id",
"type": "int64",
"repetition_type": "REQUIRED"
},
{
"name": "name",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "category",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "price",
"type": "double",
"repetition_type": "REQUIRED"
},
{
"name": "quantity",
"type": "int32",
"repetition_type": "REQUIRED"
}
]
}
]
}
]
},
{
"name": "payments",
"type": "list",
"repetition_type": "REQUIRED",
"converted_type": "LIST",
"fields": [
{
"name": "list",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "element",
"type": "group",
"repetition_type": "REQUIRED",
"fields": [
{
"name": "payment_id",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "amount",
"type": "double",
"repetition_type": "REQUIRED"
},
{
"name": "currency",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "method",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "status",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
}
]
}
]
}
]
},
{
"name": "order_date",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": "TIMESTAMP_MILLIS"
},
{
"name": "total_amount",
"type": "double",
"repetition_type": "REQUIRED"
},
{
"name": "metadata",
"type": "map",
"repetition_type": "OPTIONAL",
"converted_type": "MAP",
"fields": [
{
"name": "key_value",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "key",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "value",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
}
]
}
]
}
]
},
"data": [
{
"order_id": 1001,
"customer": {
"customer_id": 5001,
"name": "Alice Johnson",
"email": "alice.johnson@example.com",
"address": {
"street": "123 Main St",
"city": "New York",
"state": "NY",
"postal_code": "10001"
}
},
"products": [
{
"product_id": 10001,
"name": "Laptop Pro",
"category": "Electronics",
"price": 1299.99,
"quantity": 1
},
{
"product_id": 10002,
"name": "Wireless Mouse",
"category": "Accessories",
"price": 29.99,
"quantity": 1
}
],
"payments": [
{
"payment_id": "pay_001",
"amount": 1329.98,
"currency": "USD",
"method": "credit_card",
"status": "completed"
}
],
"order_date": 1609545600000,
"total_amount": 1329.98,
"metadata": {
"source": "web",
"campaign": "summer_sale",
"device": "mobile"
}
},
{
"order_id": 1002,
"customer": {
"customer_id": 5002,
"name": "Bob Smith",
"email": null,
"address": {
"street": "456 Oak Ave",
"city": "Los Angeles",
"state": "CA",
"postal_code": "90001"
}
},
"products": [
{
"product_id": 10003,
"name": "Smart TV",
"category": "Electronics",
"price": 899.99,
"quantity": 1
},
{
"product_id": 10004,
"name": "HDMI Cable",
"category": "Accessories",
"price": 15.99,
"quantity": 2
},
{
"product_id": 10005,
"name": "Streaming Service Subscription",
"category": "Digital",
"price": 12.99,
"quantity": 1
}
],
"payments": [
{
"payment_id": "pay_002",
"amount": 944.96,
"currency": "USD",
"method": "paypal",
"status": "completed"
}
],
"order_date": 1609632000000,
"total_amount": 944.96,
"metadata": {
"source": "mobile_app",
"promotion": "free_shipping"
}
}
],
"compression": {
"codec": "gzip",
"level": "default"
},
"encoding": {
"dictionary": "RLE_DICTIONARY",
"data": "DELTA_BINARY_PACKED"
},
"metadata": {
"created_by": "order_processing_system",
"description": "E-commerce order data with nested customer, products, and payment information",
"partition_keys": ["order_date"],
"sort_keys": ["customer.customer_id", "order_id"]
}
}
📋 Optimisation des Performances Parquet json
🔴 complex
⭐⭐⭐⭐⭐
Techniques avancées d'optimisation Parquet avec compression, encodage et stratégies de partitionnement
⏱️ 30 min
🏷️ parquet, optimization, bigdata, performance
Prerequisites:
Advanced Parquet, Data engineering, Performance tuning
{
"schema": {
"fields": [
{
"name": "event_timestamp",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": "TIMESTAMP_MICROS",
"encoding": "DELTA_BINARY_PACKED",
"statistics": {}
},
{
"name": "user_id",
"type": "int64",
"repetition_type": "REQUIRED",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 1000000,
"null_count": 0
}
},
{
"name": "session_id",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 500000,
"null_count": 0
}
},
{
"name": "event_type",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 25,
"null_count": 0
}
},
{
"name": "page_url",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "DELTA_LENGTH_BYTE_ARRAY"
},
{
"name": "user_agent",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "DELTA_LENGTH_BYTE_ARRAY"
},
{
"name": "ip_address",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY"
},
{
"name": "country_code",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 150,
"null_count": 10000
}
},
{
"name": "device_type",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 5,
"null_count": 5000
}
},
{
"name": "browser",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 10,
"null_count": 5000
}
},
{
"name": "os",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 8,
"null_count": 5000
}
},
{
"name": "revenue",
"type": "double",
"repetition_type": "OPTIONAL",
"encoding": "PLAIN",
"statistics": {
"min": 0.0,
"max": 599.99,
"null_count": 8000000
}
},
{
"name": "custom_properties",
"type": "map",
"repetition_type": "OPTIONAL",
"converted_type": "MAP",
"fields": [
{
"name": "key_value",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "key",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY"
},
{
"name": "value",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "DELTA_LENGTH_BYTE_ARRAY"
}
]
}
]
}
]
},
"compression_strategies": {
"codec": "zstd",
"level": 5,
"column_specific": {
"event_type": "rle_dictionary",
"user_id": "rle_dictionary",
"session_id": "rle_dictionary",
"revenue": "plain"
}
},
"partitioning": {
"partition_keys": [
"event_date",
"event_hour",
"country_code"
],
"partition_value_schema": {
"event_date": "string",
"event_hour": "int",
"country_code": "string"
},
"partition_filtering": {
"date_range": {
"start": "2023-01-01",
"end": "2023-12-31"
},
"country_whitelist": ["US", "CA", "GB", "DE", "FR", "JP", "AU"]
}
},
"bucketing": {
"enabled": true,
"bucket_by": "user_id",
"bucket_count": 256
},
"sorting": {
"sort_keys": [
"event_timestamp",
"user_id",
"session_id"
],
"sort_order": "ascending"
},
"row_group_optimization": {
"target_row_group_size": 134217728,
"max_row_group_size": 268435456,
"adaptive_row_groups": true,
"min_rows_per_group": 1000
},
"statistics_optimization": {
"column_indexes": [
"user_id",
"session_id",
"event_timestamp"
],
"bloom_filter_columns": [
"user_id",
"session_id"
],
"bloom_filter_fpp": 0.05,
"null_count_statistics": true,
"min_max_statistics": true,
"distinct_count_statistics": true,
"ndv_max_tracked_values": 1000000
},
"data_page_optimization": {
"data_page_size": 1048576,
"dictionary_page_size": 1048576,
"v1_page_size": 1048576
},
"schema_evolution": {
"add_columns": true,
"drop_columns": false,
"rename_columns": false,
"change_types": false,
"column_order": "maintain"
},
"performance_metrics": {
"estimated_size_mb": 2048,
"compression_ratio": 0.15,
"scan_efficiency_score": 0.95,
"query_performance_rating": "excellent"
},
"optimization_rules": [
{
"rule_name": "low_cardinality_string_optimization",
"condition": "distinct_count < 100 AND avg_length > 10",
"action": "use_rle_dictionary_encoding"
},
{
"rule_name": "numeric_range_optimization",
"condition": "type IN (int32, int64) AND range/max < 0.1",
"action": "use_delta_encoding"
},
{
"rule_name": "high_null_ratio_optimization",
"condition": "null_count / total_count > 0.8",
"action": "use_nullable_with_default_values"
},
{
"rule_name": "large_string_optimization",
"condition": "type = binary (UTF8) AND avg_length > 100",
"action": "use_delta_length_byte_array"
}
],
"metadata": {
"created_by": "data_lake_optimization_engine",
"version": "2.0",
"optimization_applied": "2025-01-15T10:30:00Z",
"last_analyzed": "2025-01-15T10:30:00Z",
"data_quality_metrics": {
"completeness_score": 0.98,
"accuracy_score": 0.95,
"consistency_score": 0.97,
"validity_score": 0.99
},
"usage_patterns": {
"common_query_filters": [
"event_timestamp BETWEEN start_date AND end_date",
"user_id = specific_user",
"event_type IN (page_view, click, purchase)",
"country_code = specific_country"
],
"access_frequency": "high",
"update_frequency": "daily",
"retention_period": "2_years"
},
"cost_optimization": {
"storage_cost_savings": "65%",
"query_performance_improvement": "40%",
"compression_tuning_applied": true,
"partition_strategy_optimized": true
}
}
}