Parquet Samples

Apache Parquet columnar storage format examples with schemas, compression, and data processing

📋 Basic Parquet Schema Definition

🟢 simple

Essential Parquet schema examples with primitive and nested types for data storage

⏱️ 10 min 🏷️ parquet, schema, data format
Prerequisites: Basic data concepts, Columnar storage basics
{
  "schema": {
    "fields": [
      {
        "name": "id",
        "type": "int64",
        "repetition_type": "REQUIRED",
        "converted_type": null
      },
      {
        "name": "name",
        "type": "binary",
        "repetition_type": "REQUIRED",
        "converted_type": "UTF8"
      },
      {
        "name": "email",
        "type": "binary",
        "repetition_type": "OPTIONAL",
        "converted_type": "UTF8"
      },
      {
        "name": "age",
        "type": "int32",
        "repetition_type": "OPTIONAL",
        "converted_type": null
      },
      {
        "name": "is_active",
        "type": "boolean",
        "repetition_type": "REQUIRED",
        "converted_type": null
      },
      {
        "name": "created_at",
        "type": "int64",
        "repetition_type": "REQUIRED",
        "converted_type": "TIMESTAMP_MILLIS"
      },
      {
        "name": "salary",
        "type": "double",
        "repetition_type": "OPTIONAL",
        "converted_type": null
      }
    ]
  },
  "data": [
    {
      "id": 1,
      "name": "John Doe",
      "email": "john.doe@example.com",
      "age": 30,
      "is_active": true,
      "created_at": 1609459200000,
      "salary": 75000.00
    },
    {
      "id": 2,
      "name": "Jane Smith",
      "email": null,
      "age": 28,
      "is_active": true,
      "created_at": 1609545600000,
      "salary": 82000.00
    },
    {
      "id": 3,
      "name": "Bob Johnson",
      "email": "bob.johnson@example.com",
      "age": null,
      "is_active": false,
      "created_at": 1609632000000,
      "salary": 68000.00
    }
  ],
  "compression": {
    "codec": "snappy",
    "level": "default"
  },
  "metadata": {
    "created_by": "data_engineering_team",
    "description": "Employee data with basic personal information"
  }
}

📋 Nested Parquet Data Structures

🟡 intermediate ⭐⭐⭐

Complex nested data examples with structs, lists, and maps for hierarchical data storage

⏱️ 20 min 🏷️ parquet, nested data, complex structures
Prerequisites: Parquet basics, Nested data concepts, JSON structures
{
  "schema": {
    "fields": [
      {
        "name": "order_id",
        "type": "int64",
        "repetition_type": "REQUIRED"
      },
      {
        "name": "customer",
        "type": "group",
        "repetition_type": "REQUIRED",
        "fields": [
          {
            "name": "customer_id",
            "type": "int64",
            "repetition_type": "REQUIRED"
          },
          {
            "name": "name",
            "type": "binary",
            "repetition_type": "REQUIRED",
            "converted_type": "UTF8"
          },
          {
            "name": "email",
            "type": "binary",
            "repetition_type": "OPTIONAL",
            "converted_type": "UTF8"
          },
          {
            "name": "address",
            "type": "group",
            "repetition_type": "OPTIONAL",
            "fields": [
              {
                "name": "street",
                "type": "binary",
                "repetition_type": "OPTIONAL",
                "converted_type": "UTF8"
              },
              {
                "name": "city",
                "type": "binary",
                "repetition_type": "OPTIONAL",
                "converted_type": "UTF8"
              },
              {
                "name": "state",
                "type": "binary",
                "repetition_type": "OPTIONAL",
                "converted_type": "UTF8"
              },
              {
                "name": "postal_code",
                "type": "binary",
                "repetition_type": "OPTIONAL",
                "converted_type": "UTF8"
              }
            ]
          }
        ]
      },
      {
        "name": "products",
        "type": "list",
        "repetition_type": "REQUIRED",
        "converted_type": "LIST",
        "fields": [
          {
            "name": "list",
            "type": "group",
            "repetition_type": "REPEATED",
            "fields": [
              {
                "name": "product_id",
                "type": "int64",
                "repetition_type": "REQUIRED"
              },
              {
                "name": "name",
                "type": "binary",
                "repetition_type": "REQUIRED",
                "converted_type": "UTF8"
              },
              {
                "name": "category",
                "type": "binary",
                "repetition_type": "REQUIRED",
                "converted_type": "UTF8"
              },
              {
                "name": "price",
                "type": "double",
                "repetition_type": "REQUIRED"
              },
              {
                "name": "quantity",
                "type": "int32",
                "repetition_type": "REQUIRED"
              }
            ]
          }
        ]
      },
      {
        "name": "payments",
        "type": "list",
        "repetition_type": "REQUIRED",
        "converted_type": "LIST",
        "fields": [
          {
            "name": "list",
            "type": "group",
            "repetition_type": "REPEATED",
            "fields": [
              {
                "name": "payment_id",
                "type": "binary",
                "repetition_type": "REQUIRED",
                "converted_type": "UTF8"
              },
              {
                "name": "amount",
                "type": "double",
                "repetition_type": "REQUIRED"
              },
              {
                "name": "currency",
                "type": "binary",
                "repetition_type": "REQUIRED",
                "converted_type": "UTF8"
              },
              {
                "name": "method",
                "type": "binary",
                "repetition_type": "REQUIRED",
                "converted_type": "UTF8"
              },
              {
                "name": "status",
                "type": "binary",
                "repetition_type": "REQUIRED",
                "converted_type": "UTF8"
              }
            ]
          }
        ]
      },
      {
        "name": "order_date",
        "type": "int64",
        "repetition_type": "REQUIRED",
        "converted_type": "TIMESTAMP_MILLIS"
      },
      {
        "name": "total_amount",
        "type": "double",
        "repetition_type": "REQUIRED"
      },
      {
        "name": "metadata",
        "type": "map",
        "repetition_type": "OPTIONAL",
        "converted_type": "MAP",
        "fields": [
          {
            "name": "key_value",
            "type": "group",
            "repetition_type": "REPEATED",
            "fields": [
              {
                "name": "key",
                "type": "binary",
                "repetition_type": "REQUIRED",
                "converted_type": "UTF8"
              },
              {
                "name": "value",
                "type": "binary",
                "repetition_type": "REQUIRED",
                "converted_type": "UTF8"
              }
            ]
          }
        ]
      }
    ]
  },
  "data": [
    {
      "order_id": 1001,
      "customer": {
        "customer_id": 5001,
        "name": "Alice Johnson",
        "email": "alice.johnson@example.com",
        "address": {
          "street": "123 Main St",
          "city": "New York",
          "state": "NY",
          "postal_code": "10001"
        }
      },
      "products": [
        {
          "product_id": 10001,
          "name": "Laptop Pro",
          "category": "Electronics",
          "price": 1299.99,
          "quantity": 1
        },
        {
          "product_id": 10002,
          "name": "Wireless Mouse",
          "category": "Accessories",
          "price": 29.99,
          "quantity": 1
        }
      ],
      "payments": [
        {
          "payment_id": "pay_001",
          "amount": 1329.98,
          "currency": "USD",
          "method": "credit_card",
          "status": "completed"
        }
      ],
      "order_date": 1609545600000,
      "total_amount": 1329.98,
      "metadata": {
        "source": "web",
        "campaign": "summer_sale",
        "device": "mobile"
      }
    },
    {
      "order_id": 1002,
      "customer": {
        "customer_id": 5002,
        "name": "Bob Smith",
        "email": null,
        "address": {
          "street": "456 Oak Ave",
          "city": "Los Angeles",
          "state": "CA",
          "postal_code": "90001"
        }
      },
      "products": [
        {
          "product_id": 10003,
          "name": "Smart TV",
          "category": "Electronics",
          "price": 899.99,
          "quantity": 1
        },
        {
          "product_id": 10004,
          "name": "HDMI Cable",
          "category": "Accessories",
          "price": 15.99,
          "quantity": 2
        },
        {
          "product_id": 10005,
          "name": "Streaming Service Subscription",
          "category": "Digital",
          "price": 12.99,
          "quantity": 1
        }
      ],
      "payments": [
        {
          "payment_id": "pay_002",
          "amount": 944.96,
          "currency": "USD",
          "method": "paypal",
          "status": "completed"
        }
      ],
      "order_date": 1609632000000,
      "total_amount": 944.96,
      "metadata": {
        "source": "mobile_app",
        "promotion": "free_shipping"
      }
    }
  ],
  "compression": {
    "codec": "gzip",
    "level": "default"
  },
  "encoding": {
    "dictionary": "RLE_DICTIONARY",
    "data": "DELTA_BINARY_PACKED"
  },
  "metadata": {
    "created_by": "order_processing_system",
    "description": "E-commerce order data with nested customer, products, and payment information",
    "partition_keys": ["order_date"],
    "sort_keys": ["customer.customer_id", "order_id"]
  }
}

📋 Parquet Performance Optimization

🔴 complex ⭐⭐⭐⭐⭐

Advanced Parquet optimization techniques with compression, encoding, and partitioning strategies

⏱️ 30 min 🏷️ parquet, optimization, bigdata, performance
Prerequisites: Advanced Parquet, Data engineering, Performance tuning
{
  "schema": {
    "fields": [
      {
        "name": "event_timestamp",
        "type": "int64",
        "repetition_type": "REQUIRED",
        "converted_type": "TIMESTAMP_MICROS",
        "encoding": "DELTA_BINARY_PACKED",
        "statistics": {}
      },
      {
        "name": "user_id",
        "type": "int64",
        "repetition_type": "REQUIRED",
        "encoding": "RLE_DICTIONARY",
        "statistics": {
          "distinct_count": 1000000,
          "null_count": 0
        }
      },
      {
        "name": "session_id",
        "type": "binary",
        "repetition_type": "REQUIRED",
        "converted_type": "UTF8",
        "encoding": "RLE_DICTIONARY",
        "statistics": {
          "distinct_count": 500000,
          "null_count": 0
        }
      },
      {
        "name": "event_type",
        "type": "binary",
        "repetition_type": "REQUIRED",
        "converted_type": "UTF8",
        "encoding": "RLE_DICTIONARY",
        "statistics": {
          "distinct_count": 25,
          "null_count": 0
        }
      },
      {
        "name": "page_url",
        "type": "binary",
        "repetition_type": "OPTIONAL",
        "converted_type": "UTF8",
        "encoding": "DELTA_LENGTH_BYTE_ARRAY"
      },
      {
        "name": "user_agent",
        "type": "binary",
        "repetition_type": "OPTIONAL",
        "converted_type": "UTF8",
        "encoding": "DELTA_LENGTH_BYTE_ARRAY"
      },
      {
        "name": "ip_address",
        "type": "binary",
        "repetition_type": "OPTIONAL",
        "converted_type": "UTF8",
        "encoding": "RLE_DICTIONARY"
      },
      {
        "name": "country_code",
        "type": "binary",
        "repetition_type": "OPTIONAL",
        "converted_type": "UTF8",
        "encoding": "RLE_DICTIONARY",
        "statistics": {
          "distinct_count": 150,
          "null_count": 10000
        }
      },
      {
        "name": "device_type",
        "type": "binary",
        "repetition_type": "OPTIONAL",
        "converted_type": "UTF8",
        "encoding": "RLE_DICTIONARY",
        "statistics": {
          "distinct_count": 5,
          "null_count": 5000
        }
      },
      {
        "name": "browser",
        "type": "binary",
        "repetition_type": "OPTIONAL",
        "converted_type": "UTF8",
        "encoding": "RLE_DICTIONARY",
        "statistics": {
          "distinct_count": 10,
          "null_count": 5000
        }
      },
      {
        "name": "os",
        "type": "binary",
        "repetition_type": "OPTIONAL",
        "converted_type": "UTF8",
        "encoding": "RLE_DICTIONARY",
        "statistics": {
          "distinct_count": 8,
          "null_count": 5000
        }
      },
      {
        "name": "revenue",
        "type": "double",
        "repetition_type": "OPTIONAL",
        "encoding": "PLAIN",
        "statistics": {
          "min": 0.0,
          "max": 599.99,
          "null_count": 8000000
        }
      },
      {
        "name": "custom_properties",
        "type": "map",
        "repetition_type": "OPTIONAL",
        "converted_type": "MAP",
        "fields": [
          {
            "name": "key_value",
            "type": "group",
            "repetition_type": "REPEATED",
            "fields": [
              {
                "name": "key",
                "type": "binary",
                "repetition_type": "REQUIRED",
                "converted_type": "UTF8",
                "encoding": "RLE_DICTIONARY"
              },
              {
                "name": "value",
                "type": "binary",
                "repetition_type": "REQUIRED",
                "converted_type": "UTF8",
                "encoding": "DELTA_LENGTH_BYTE_ARRAY"
              }
            ]
          }
        ]
      }
    ]
  },
  "compression_strategies": {
    "codec": "zstd",
    "level": 5,
    "column_specific": {
      "event_type": "rle_dictionary",
      "user_id": "rle_dictionary",
      "session_id": "rle_dictionary",
      "revenue": "plain"
    }
  },
  "partitioning": {
    "partition_keys": [
      "event_date",
      "event_hour",
      "country_code"
    ],
    "partition_value_schema": {
      "event_date": "string",
      "event_hour": "int",
      "country_code": "string"
    },
    "partition_filtering": {
      "date_range": {
        "start": "2023-01-01",
        "end": "2023-12-31"
      },
      "country_whitelist": ["US", "CA", "GB", "DE", "FR", "JP", "AU"]
    }
  },
  "bucketing": {
    "enabled": true,
    "bucket_by": "user_id",
    "bucket_count": 256
  },
  "sorting": {
    "sort_keys": [
      "event_timestamp",
      "user_id",
      "session_id"
    ],
    "sort_order": "ascending"
  },
  "row_group_optimization": {
    "target_row_group_size": 134217728,
    "max_row_group_size": 268435456,
    "adaptive_row_groups": true,
    "min_rows_per_group": 1000
  },
  "statistics_optimization": {
    "column_indexes": [
      "user_id",
      "session_id",
      "event_timestamp"
    ],
    "bloom_filter_columns": [
      "user_id",
      "session_id"
    ],
    "bloom_filter_fpp": 0.05,
    "null_count_statistics": true,
    "min_max_statistics": true,
    "distinct_count_statistics": true,
    "ndv_max_tracked_values": 1000000
  },
  "data_page_optimization": {
    "data_page_size": 1048576,
    "dictionary_page_size": 1048576,
    "v1_page_size": 1048576
  },
  "schema_evolution": {
    "add_columns": true,
    "drop_columns": false,
    "rename_columns": false,
    "change_types": false,
    "column_order": "maintain"
  },
  "performance_metrics": {
    "estimated_size_mb": 2048,
    "compression_ratio": 0.15,
    "scan_efficiency_score": 0.95,
    "query_performance_rating": "excellent"
  },
  "optimization_rules": [
    {
      "rule_name": "low_cardinality_string_optimization",
      "condition": "distinct_count < 100 AND avg_length > 10",
      "action": "use_rle_dictionary_encoding"
    },
    {
      "rule_name": "numeric_range_optimization",
      "condition": "type IN (int32, int64) AND range/max < 0.1",
      "action": "use_delta_encoding"
    },
    {
      "rule_name": "high_null_ratio_optimization",
      "condition": "null_count / total_count > 0.8",
      "action": "use_nullable_with_default_values"
    },
    {
      "rule_name": "large_string_optimization",
      "condition": "avg_length > 100 AND type = binary",
      "action": "use_delta_length_byte_array"
    }
  ],
  "metadata": {
    "created_by": "data_lake_optimization_engine",
    "version": "2.0",
    "optimization_applied": "2025-01-15T10:30:00Z",
    "last_analyzed": "2025-01-15T10:30:00Z",
    "data_quality_metrics": {
      "completeness_score": 0.98,
      "accuracy_score": 0.95,
      "consistency_score": 0.97,
      "validity_score": 0.99
    },
    "usage_patterns": {
      "common_query_filters": [
        "event_timestamp BETWEEN start_date AND end_date",
        "user_id = specific_user",
        "event_type IN (page_view, click, purchase)",
        "country_code = specific_country"
      ],
      "access_frequency": "high",
      "update_frequency": "daily",
      "retention_period": "2_years"
    },
    "cost_optimization": {
      "storage_cost_savings": "65%",
      "query_performance_improvement": "40%",
      "compression_tuning_applied": true,
      "partition_strategy_optimized": true
    }
  }
}