Parquet 示例
Apache Parquet 列式存储格式示例,包含模式、压缩和数据处理
📋 基础 Parquet 模式定义
🟢 simple
⭐
基本的 Parquet 模式示例,使用原始类型(含必填和可选字段)进行数据存储
⏱️ 10 min
🏷️ parquet, schema, data format
Prerequisites:
Basic data concepts, Columnar storage basics
{
"schema": {
"fields": [
{
"name": "id",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": null
},
{
"name": "name",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "email",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "age",
"type": "int32",
"repetition_type": "OPTIONAL",
"converted_type": null
},
{
"name": "is_active",
"type": "boolean",
"repetition_type": "REQUIRED",
"converted_type": null
},
{
"name": "created_at",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": "TIMESTAMP_MILLIS"
},
{
"name": "salary",
"type": "double",
"repetition_type": "OPTIONAL",
"converted_type": null
}
]
},
"data": [
{
"id": 1,
"name": "John Doe",
"email": "john.doe@example.com",
"age": 30,
"is_active": true,
"created_at": 1609459200000,
"salary": 75000.00
},
{
"id": 2,
"name": "Jane Smith",
"email": null,
"age": 28,
"is_active": true,
"created_at": 1609545600000,
"salary": 82000.00
},
{
"id": 3,
"name": "Bob Johnson",
"email": "bob.johnson@example.com",
"age": null,
"is_active": false,
"created_at": 1609632000000,
"salary": 68000.00
}
],
"compression": {
"codec": "snappy",
"level": "default"
},
"metadata": {
"created_by": "data_engineering_team",
"description": "Employee data with basic personal information"
}
}
📋 嵌套 Parquet 数据结构
🟡 intermediate
⭐⭐⭐
复杂的嵌套数据示例,包含结构体、列表和映射用于层次化数据存储
⏱️ 20 min
🏷️ parquet, nested data, complex structures
Prerequisites:
Parquet basics, Nested data concepts, JSON structures
{
"schema": {
"fields": [
{
"name": "order_id",
"type": "int64",
"repetition_type": "REQUIRED"
},
{
"name": "customer",
"type": "group",
"repetition_type": "REQUIRED",
"fields": [
{
"name": "customer_id",
"type": "int64",
"repetition_type": "REQUIRED"
},
{
"name": "name",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "email",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "address",
"type": "group",
"repetition_type": "OPTIONAL",
"fields": [
{
"name": "street",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "city",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "state",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "postal_code",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
}
]
}
]
},
{
"name": "products",
"type": "list",
"repetition_type": "REQUIRED",
"converted_type": "LIST",
"fields": [
{
"name": "list",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "element",
"type": "group",
"repetition_type": "REQUIRED",
"fields": [
{
"name": "product_id",
"type": "int64",
"repetition_type": "REQUIRED"
},
{
"name": "name",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "category",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "price",
"type": "double",
"repetition_type": "REQUIRED"
},
{
"name": "quantity",
"type": "int32",
"repetition_type": "REQUIRED"
}
]
}
]
}
]
},
{
"name": "payments",
"type": "list",
"repetition_type": "REQUIRED",
"converted_type": "LIST",
"fields": [
{
"name": "list",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "element",
"type": "group",
"repetition_type": "REQUIRED",
"fields": [
{
"name": "payment_id",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "amount",
"type": "double",
"repetition_type": "REQUIRED"
},
{
"name": "currency",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "method",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "status",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
}
]
}
]
}
]
},
{
"name": "order_date",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": "TIMESTAMP_MILLIS"
},
{
"name": "total_amount",
"type": "double",
"repetition_type": "REQUIRED"
},
{
"name": "metadata",
"type": "map",
"repetition_type": "OPTIONAL",
"converted_type": "MAP",
"fields": [
{
"name": "key_value",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "key",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "value",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
}
]
}
]
}
]
},
"data": [
{
"order_id": 1001,
"customer": {
"customer_id": 5001,
"name": "Alice Johnson",
"email": "alice.johnson@example.com",
"address": {
"street": "123 Main St",
"city": "New York",
"state": "NY",
"postal_code": "10001"
}
},
"products": [
{
"product_id": 10001,
"name": "Laptop Pro",
"category": "Electronics",
"price": 1299.99,
"quantity": 1
},
{
"product_id": 10002,
"name": "Wireless Mouse",
"category": "Accessories",
"price": 29.99,
"quantity": 1
}
],
"payments": [
{
"payment_id": "pay_001",
"amount": 1329.98,
"currency": "USD",
"method": "credit_card",
"status": "completed"
}
],
"order_date": 1609545600000,
"total_amount": 1329.98,
"metadata": {
"source": "web",
"campaign": "summer_sale",
"device": "mobile"
}
},
{
"order_id": 1002,
"customer": {
"customer_id": 5002,
"name": "Bob Smith",
"email": null,
"address": {
"street": "456 Oak Ave",
"city": "Los Angeles",
"state": "CA",
"postal_code": "90001"
}
},
"products": [
{
"product_id": 10003,
"name": "Smart TV",
"category": "Electronics",
"price": 899.99,
"quantity": 1
},
{
"product_id": 10004,
"name": "HDMI Cable",
"category": "Accessories",
"price": 15.99,
"quantity": 2
},
{
"product_id": 10005,
"name": "Streaming Service Subscription",
"category": "Digital",
"price": 12.99,
"quantity": 1
}
],
"payments": [
{
"payment_id": "pay_002",
"amount": 944.96,
"currency": "USD",
"method": "paypal",
"status": "completed"
}
],
"order_date": 1609632000000,
"total_amount": 944.96,
"metadata": {
"source": "mobile_app",
"promotion": "free_shipping"
}
}
],
"compression": {
"codec": "gzip",
"level": "default"
},
"encoding": {
"dictionary": "RLE_DICTIONARY",
"data": "DELTA_BINARY_PACKED"
},
"metadata": {
"created_by": "order_processing_system",
"description": "E-commerce order data with nested customer, products, and payment information",
"partition_keys": ["order_date"],
"sort_keys": ["customer.customer_id", "order_id"]
}
}
📋 Parquet 性能优化
🔴 complex
⭐⭐⭐⭐⭐
高级 Parquet 优化技术,包含压缩、编码和分区策略
⏱️ 30 min
🏷️ parquet, optimization, bigdata, performance
Prerequisites:
Advanced Parquet, Data engineering, Performance tuning
{
"schema": {
"fields": [
{
"name": "event_timestamp",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": "TIMESTAMP_MICROS",
"encoding": "DELTA_BINARY_PACKED",
"statistics": {}
},
{
"name": "user_id",
"type": "int64",
"repetition_type": "REQUIRED",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 1000000,
"null_count": 0
}
},
{
"name": "session_id",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 500000,
"null_count": 0
}
},
{
"name": "event_type",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 25,
"null_count": 0
}
},
{
"name": "page_url",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "DELTA_LENGTH_BYTE_ARRAY"
},
{
"name": "user_agent",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "DELTA_LENGTH_BYTE_ARRAY"
},
{
"name": "ip_address",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY"
},
{
"name": "country_code",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 150,
"null_count": 10000
}
},
{
"name": "device_type",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 5,
"null_count": 5000
}
},
{
"name": "browser",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 10,
"null_count": 5000
}
},
{
"name": "os",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 8,
"null_count": 5000
}
},
{
"name": "revenue",
"type": "double",
"repetition_type": "OPTIONAL",
"encoding": "PLAIN",
"statistics": {
"min": 0.0,
"max": 599.99,
"null_count": 8000000
}
},
{
"name": "custom_properties",
"type": "map",
"repetition_type": "OPTIONAL",
"converted_type": "MAP",
"fields": [
{
"name": "key_value",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "key",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY"
},
{
"name": "value",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "DELTA_LENGTH_BYTE_ARRAY"
}
]
}
]
}
]
},
"compression_strategies": {
"codec": "zstd",
"level": 5,
"column_specific": {
"event_type": "rle_dictionary",
"user_id": "rle_dictionary",
"session_id": "rle_dictionary",
"revenue": "plain"
}
},
"partitioning": {
"partition_keys": [
"event_date",
"event_hour",
"country_code"
],
"partition_value_schema": {
"event_date": "string",
"event_hour": "int",
"country_code": "string"
},
"partition_filtering": {
"date_range": {
"start": "2023-01-01",
"end": "2023-12-31"
},
"country_whitelist": ["US", "CA", "GB", "DE", "FR", "JP", "AU"]
}
},
"bucketing": {
"enabled": true,
"bucket_by": "user_id",
"bucket_count": 256
},
"sorting": {
"sort_keys": [
"event_timestamp",
"user_id",
"session_id"
],
"sort_order": "ascending"
},
"row_group_optimization": {
"target_row_group_size": 134217728,
"max_row_group_size": 268435456,
"adaptive_row_groups": true,
"min_rows_per_group": 1000
},
"statistics_optimization": {
"column_indexes": [
"user_id",
"session_id",
"event_timestamp"
],
"bloom_filter_columns": [
"user_id",
"session_id"
],
"bloom_filter_fpp": 0.05,
"null_count_statistics": true,
"min_max_statistics": true,
"distinct_count_statistics": true,
"ndv_max_tracked_values": 1000000
},
"data_page_optimization": {
"data_page_size": 1048576,
"dictionary_page_size": 1048576,
"v1_page_size": 1048576
},
"schema_evolution": {
"add_columns": true,
"drop_columns": false,
"rename_columns": false,
"change_types": false,
"column_order": "maintain"
},
"performance_metrics": {
"estimated_size_mb": 2048,
"compression_ratio": 0.15,
"scan_efficiency_score": 0.95,
"query_performance_rating": "excellent"
},
"optimization_rules": [
{
"rule_name": "low_cardinality_string_optimization",
"condition": "distinct_count < 100 AND avg_length > 10",
"action": "use_rle_dictionary_encoding"
},
{
"rule_name": "numeric_range_optimization",
"condition": "type IN (int32, int64) AND range/max < 0.1",
"action": "use_delta_encoding"
},
{
"rule_name": "high_null_ratio_optimization",
"condition": "null_count / total_count > 0.8",
"action": "use_nullable_with_default_values"
},
{
"rule_name": "large_string_optimization",
"condition": "avg_length > 100 AND type = binary",
"action": "use_delta_length_byte_array"
}
],
"metadata": {
"created_by": "data_lake_optimization_engine",
"version": "2.0",
"optimization_applied": "2025-01-15T10:30:00Z",
"last_analyzed": "2025-01-15T10:30:00Z",
"data_quality_metrics": {
"completeness_score": 0.98,
"accuracy_score": 0.95,
"consistency_score": 0.97,
"validity_score": 0.99
},
"usage_patterns": {
"common_query_filters": [
"event_timestamp BETWEEN start_date AND end_date",
"user_id = specific_user",
"event_type IN (page_view, click, purchase)",
"country_code = specific_country"
],
"access_frequency": "high",
"update_frequency": "daily",
"retention_period": "2_years"
},
"cost_optimization": {
"storage_cost_savings": "65%",
"query_performance_improvement": "40%",
"compression_tuning_applied": true,
"partition_strategy_optimized": true
}
}
}