Parquet Samples
Apache Parquet columnar storage format examples with schemas, compression, and data processing
Key Facts
- Category
- Data Formats
- Items
- 3
- Format Families
- json
Sample Overview
Apache Parquet columnar storage format examples with schemas, compression, and data processing. This sample set belongs to the Data Formats category and can be used to test related workflows inside Elysia Tools.
📋 Basic Parquet Schema Definition json
🟢 simple
⭐
Essential Parquet schema example with primitive types and required/optional fields for data storage
⏱️ 10 min
🏷️ parquet, schema, data format
Prerequisites:
Basic data concepts, Columnar storage basics
{
"schema": {
"fields": [
{
"name": "id",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": null
},
{
"name": "name",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "email",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "age",
"type": "int32",
"repetition_type": "OPTIONAL",
"converted_type": null
},
{
"name": "is_active",
"type": "boolean",
"repetition_type": "REQUIRED",
"converted_type": null
},
{
"name": "created_at",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": "TIMESTAMP_MILLIS"
},
{
"name": "salary",
"type": "double",
"repetition_type": "OPTIONAL",
"converted_type": null
}
]
},
"data": [
{
"id": 1,
"name": "John Doe",
"email": "john.doe@example.com",
"age": 30,
"is_active": true,
"created_at": 1609459200000,
"salary": 75000.00
},
{
"id": 2,
"name": "Jane Smith",
"email": null,
"age": 28,
"is_active": true,
"created_at": 1609545600000,
"salary": 82000.00
},
{
"id": 3,
"name": "Bob Johnson",
"email": "bob.johnson@example.com",
"age": null,
"is_active": false,
"created_at": 1609632000000,
"salary": 68000.00
}
],
"compression": {
"codec": "snappy",
"level": "default"
},
"metadata": {
"created_by": "data_engineering_team",
"description": "Employee data with basic personal information"
}
}
📋 Nested Parquet Data Structures json
🟡 intermediate
⭐⭐⭐
Complex nested data examples with structs, lists, and maps for hierarchical data storage
⏱️ 20 min
🏷️ parquet, nested data, complex structures
Prerequisites:
Parquet basics, Nested data concepts, JSON structures
{
"schema": {
"fields": [
{
"name": "order_id",
"type": "int64",
"repetition_type": "REQUIRED"
},
{
"name": "customer",
"type": "group",
"repetition_type": "REQUIRED",
"fields": [
{
"name": "customer_id",
"type": "int64",
"repetition_type": "REQUIRED"
},
{
"name": "name",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "email",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "address",
"type": "group",
"repetition_type": "OPTIONAL",
"fields": [
{
"name": "street",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "city",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "state",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
},
{
"name": "postal_code",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8"
}
]
}
]
},
{
"name": "products",
"type": "list",
"repetition_type": "REQUIRED",
"converted_type": "LIST",
"fields": [
{
"name": "list",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "element",
"type": "group",
"repetition_type": "REQUIRED",
"fields": [
{
"name": "product_id",
"type": "int64",
"repetition_type": "REQUIRED"
},
{
"name": "name",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "category",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "price",
"type": "double",
"repetition_type": "REQUIRED"
},
{
"name": "quantity",
"type": "int32",
"repetition_type": "REQUIRED"
}
]
}
]
}
]
},
{
"name": "payments",
"type": "list",
"repetition_type": "REQUIRED",
"converted_type": "LIST",
"fields": [
{
"name": "list",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "element",
"type": "group",
"repetition_type": "REQUIRED",
"fields": [
{
"name": "payment_id",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "amount",
"type": "double",
"repetition_type": "REQUIRED"
},
{
"name": "currency",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "method",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "status",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
}
]
}
]
}
]
},
{
"name": "order_date",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": "TIMESTAMP_MILLIS"
},
{
"name": "total_amount",
"type": "double",
"repetition_type": "REQUIRED"
},
{
"name": "metadata",
"type": "map",
"repetition_type": "OPTIONAL",
"converted_type": "MAP",
"fields": [
{
"name": "key_value",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "key",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
},
{
"name": "value",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8"
}
]
}
]
}
]
},
"data": [
{
"order_id": 1001,
"customer": {
"customer_id": 5001,
"name": "Alice Johnson",
"email": "alice.johnson@example.com",
"address": {
"street": "123 Main St",
"city": "New York",
"state": "NY",
"postal_code": "10001"
}
},
"products": [
{
"product_id": 10001,
"name": "Laptop Pro",
"category": "Electronics",
"price": 1299.99,
"quantity": 1
},
{
"product_id": 10002,
"name": "Wireless Mouse",
"category": "Accessories",
"price": 29.99,
"quantity": 1
}
],
"payments": [
{
"payment_id": "pay_001",
"amount": 1329.98,
"currency": "USD",
"method": "credit_card",
"status": "completed"
}
],
"order_date": 1609545600000,
"total_amount": 1329.98,
"metadata": {
"source": "web",
"campaign": "summer_sale",
"device": "mobile"
}
},
{
"order_id": 1002,
"customer": {
"customer_id": 5002,
"name": "Bob Smith",
"email": null,
"address": {
"street": "456 Oak Ave",
"city": "Los Angeles",
"state": "CA",
"postal_code": "90001"
}
},
"products": [
{
"product_id": 10003,
"name": "Smart TV",
"category": "Electronics",
"price": 899.99,
"quantity": 1
},
{
"product_id": 10004,
"name": "HDMI Cable",
"category": "Accessories",
"price": 15.99,
"quantity": 2
},
{
"product_id": 10005,
"name": "Streaming Service Subscription",
"category": "Digital",
"price": 12.99,
"quantity": 1
}
],
"payments": [
{
"payment_id": "pay_002",
"amount": 944.96,
"currency": "USD",
"method": "paypal",
"status": "completed"
}
],
"order_date": 1609632000000,
"total_amount": 944.96,
"metadata": {
"source": "mobile_app",
"promotion": "free_shipping"
}
}
],
"compression": {
"codec": "gzip",
"level": "default"
},
"encoding": {
"dictionary": "RLE_DICTIONARY",
"data": "DELTA_BINARY_PACKED"
},
"metadata": {
"created_by": "order_processing_system",
"description": "E-commerce order data with nested customer, products, and payment information",
"partition_keys": ["order_date"],
"sort_keys": ["customer.customer_id", "order_id"]
}
}
📋 Parquet Performance Optimization json
🔴 complex
⭐⭐⭐⭐⭐
Advanced Parquet optimization techniques with compression, encoding, and partitioning strategies
⏱️ 30 min
🏷️ parquet, optimization, bigdata, performance
Prerequisites:
Advanced Parquet, Data engineering, Performance tuning
{
"schema": {
"fields": [
{
"name": "event_timestamp",
"type": "int64",
"repetition_type": "REQUIRED",
"converted_type": "TIMESTAMP_MICROS",
"encoding": "DELTA_BINARY_PACKED",
"statistics": {}
},
{
"name": "user_id",
"type": "int64",
"repetition_type": "REQUIRED",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 1000000,
"null_count": 0
}
},
{
"name": "session_id",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 500000,
"null_count": 0
}
},
{
"name": "event_type",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 25,
"null_count": 0
}
},
{
"name": "page_url",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "DELTA_LENGTH_BYTE_ARRAY"
},
{
"name": "user_agent",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "DELTA_LENGTH_BYTE_ARRAY"
},
{
"name": "ip_address",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY"
},
{
"name": "country_code",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 150,
"null_count": 10000
}
},
{
"name": "device_type",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 5,
"null_count": 5000
}
},
{
"name": "browser",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 10,
"null_count": 5000
}
},
{
"name": "os",
"type": "binary",
"repetition_type": "OPTIONAL",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY",
"statistics": {
"distinct_count": 8,
"null_count": 5000
}
},
{
"name": "revenue",
"type": "double",
"repetition_type": "OPTIONAL",
"encoding": "PLAIN",
"statistics": {
"min": 0.0,
"max": 599.99,
"null_count": 8000000
}
},
{
"name": "custom_properties",
"type": "map",
"repetition_type": "OPTIONAL",
"converted_type": "MAP",
"fields": [
{
"name": "key_value",
"type": "group",
"repetition_type": "REPEATED",
"fields": [
{
"name": "key",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "RLE_DICTIONARY"
},
{
"name": "value",
"type": "binary",
"repetition_type": "REQUIRED",
"converted_type": "UTF8",
"encoding": "DELTA_LENGTH_BYTE_ARRAY"
}
]
}
]
}
]
},
"compression_strategies": {
"codec": "zstd",
"level": 5,
"column_specific": {
"event_type": "rle_dictionary",
"user_id": "rle_dictionary",
"session_id": "rle_dictionary",
"revenue": "plain"
}
},
"partitioning": {
"partition_keys": [
"event_date",
"event_hour",
"country_code"
],
"partition_value_schema": {
"event_date": "string",
"event_hour": "int",
"country_code": "string"
},
"partition_filtering": {
"date_range": {
"start": "2023-01-01",
"end": "2023-12-31"
},
"country_whitelist": ["US", "CA", "GB", "DE", "FR", "JP", "AU"]
}
},
"bucketing": {
"enabled": true,
"bucket_by": "user_id",
"bucket_count": 256
},
"sorting": {
"sort_keys": [
"event_timestamp",
"user_id",
"session_id"
],
"sort_order": "ascending"
},
"row_group_optimization": {
"target_row_group_size": 134217728,
"max_row_group_size": 268435456,
"adaptive_row_groups": true,
"min_rows_per_group": 1000
},
"statistics_optimization": {
"column_indexes": [
"user_id",
"session_id",
"event_timestamp"
],
"bloom_filter_columns": [
"user_id",
"session_id"
],
"bloom_filter_fpp": 0.05,
"null_count_statistics": true,
"min_max_statistics": true,
"distinct_count_statistics": true,
"ndv_max_tracked_values": 1000000
},
"data_page_optimization": {
"data_page_size": 1048576,
"dictionary_page_size": 1048576,
"v1_page_size": 1048576
},
"schema_evolution": {
"add_columns": true,
"drop_columns": false,
"rename_columns": false,
"change_types": false,
"column_order": "maintain"
},
"performance_metrics": {
"estimated_size_mb": 2048,
"compression_ratio": 0.15,
"scan_efficiency_score": 0.95,
"query_performance_rating": "excellent"
},
"optimization_rules": [
{
"rule_name": "low_cardinality_string_optimization",
"condition": "distinct_count < 100 AND avg_length > 10",
"action": "use_rle_dictionary_encoding"
},
{
"rule_name": "numeric_range_optimization",
"condition": "type IN (int32, int64) AND range/max < 0.1",
"action": "use_delta_encoding"
},
{
"rule_name": "high_null_ratio_optimization",
"condition": "null_count / total_count > 0.8",
"action": "use_nullable_with_default_values"
},
{
"rule_name": "large_string_optimization",
"condition": "avg_length > 100 AND converted_type = UTF8",
"action": "use_delta_length_byte_array"
}
],
"metadata": {
"created_by": "data_lake_optimization_engine",
"version": "2.0",
"optimization_applied": "2025-01-15T10:30:00Z",
"last_analyzed": "2025-01-15T10:30:00Z",
"data_quality_metrics": {
"completeness_score": 0.98,
"accuracy_score": 0.95,
"consistency_score": 0.97,
"validity_score": 0.99
},
"usage_patterns": {
"common_query_filters": [
"event_timestamp BETWEEN start_date AND end_date",
"user_id = specific_user",
"event_type IN (page_view, click, purchase)",
"country_code = specific_country"
],
"access_frequency": "high",
"update_frequency": "daily",
"retention_period": "2_years"
},
"cost_optimization": {
"storage_cost_savings": "65%",
"query_performance_improvement": "40%",
"compression_tuning_applied": true,
"partition_strategy_optimized": true
}
}
}