Mojo Examples

Mojo programming language examples - a language specialized for AI development, combining Python-like simplicity with C++-level performance

💻 Mojo Hello World mojo

🟢 simple ⭐⭐

Mojo Hello World program covering core syntax and AI/ML-oriented features

⏱️ 20 min 🏷️ mojo, ai, machine learning, performance
Prerequisites: Basic programming concepts, Python familiarity helpful
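To run any of the examples below, save the code to a file and invoke the Mojo CLI (the filename here is illustrative):

mojo hello_world.mojo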
# Mojo Hello World Examples - AI Development Language

from collections import List
from math import sqrt

# 1. Basic Hello World
fn basic_hello():
    print("Hello, Mojo World!")

# 2. Hello World with Mojo types and variables
fn hello_with_types():
    # Mojo supports Python-like syntax with explicit type annotations
    var name: String = "Mojo"
    var version: Float64 = 0.1
    var is_ready: Bool = True
    var year: Int = 2024

    # Mojo has no f-strings; print takes comma-separated arguments
    print("Language:", name)
    print("Version:", version)
    print("Ready:", is_ready)
    print("Year:", year)

# 3. Hello World with AI context
fn ai_hello_world():
    # Mojo is designed for AI/ML development
    var framework: String = "Mojo AI"
    var purpose: String = "High-performance AI development"
    var speedup: Float64 = 3500.0  # reported speedup vs. Python on selected benchmarks

    print(framework + ": " + purpose)
    print("Performance improvement:", speedup, "x")

# 4. Hello World with functions and type inference
fn greet(name: String) -> String:
    return "Hello, " + name + "!"

fn demonstrate_functions():
    var greeting = greet("Mojo Developer")
    print(greeting)

    # Type inference example
    var x = 42        # Inferred as Int
    var y = 3.14      # Inferred as Float64
    var z = "AI/ML"   # Inferred as a string

    print("x =", x, ", y =", y, ", z =", z)

# 5. Hello World with Mojo structs
struct Point:
    var x: Float64
    var y: Float64

    fn __init__(inout self, x: Float64, y: Float64):
        self.x = x
        self.y = y

    fn distance(self, other: Point) -> Float64:
        return sqrt((self.x - other.x)**2 + (self.y - other.y)**2)

fn demonstrate_structs():
    var p1 = Point(0.0, 0.0)
    var p2 = Point(3.0, 4.0)
    var dist = p1.distance(p2)
    print("Distance between points:", dist)

# 6. Hello World with lists and comprehensions
fn demonstrate_collections():
    # Mojo's List stands in for Python's list; the Python-style
    # comprehensions are written as explicit loops here
    var numbers = List[Int](1, 2, 3, 4, 5)
    var squares = List[Int]()
    for i in range(len(numbers)):
        squares.append(numbers[i] * numbers[i])
    var even_squares = List[Int]()
    for i in range(len(squares)):
        if squares[i] % 2 == 0:
            even_squares.append(squares[i])

    print("Original size:", len(numbers))
    print("Squares:", squares[0], squares[1], squares[2], squares[3], squares[4])
    print("Even squares count:", len(even_squares))

# 7. Hello World with Mojo tensors (AI/ML focus)
from tensor import Tensor
from utils.index import Index

fn tensor_hello_world():
    # Create a simple 2x3 tensor
    var data = Tensor[DType.float32](2, 3)

    # Fill with values (Index-based element access follows the
    # stdlib Tensor API, which has shifted between releases)
    for i in range(2):
        for j in range(3):
            data[Index(i, j)] = Float32(i * 3 + j)

    print("Tensor data:")
    print(data)
    print("Tensor shape:", data.shape())

# 8. Hello World with Mojo's memory safety
@value
struct SafeString:
    var data: String

    fn __init__(inout self, content: String):
        self.data = content

    fn greet(self) -> String:
        return "Safe hello: " + self.data

fn demonstrate_memory_safety():
    var safe_msg = SafeString("Memory-safe Mojo!")
    print(safe_msg.greet())

# 9. Hello World with Mojo concurrency
from algorithm import parallelize

fn concurrent_hello():
    # parallelize runs the closure once per work item, potentially
    # across multiple threads
    @parameter
    fn print_message(i: Int):
        print("Hello from task", i)

    # Execute 4 work items in parallel
    parallelize[print_message](4)

# 10. Hello World with ML concepts
fn ml_hello_world():
    # Simple linear model in Mojo style: prediction = dot(w, x) + b
    var weights = Tensor[DType.float32](3)
    var bias: Float32 = 0.5
    var input = Tensor[DType.float32](3)

    # Fill with values
    for i in range(3):
        weights[i] = Float32(i + 1)
        input[i] = Float32(i) * 0.5

    # Dot product computed with an explicit loop
    var prediction: Float32 = bias
    for i in range(3):
        prediction += weights[i] * input[i]

    print("ML prediction:", prediction)

# 11. Hello World with Mojo's SIMD capabilities
fn simd_hello():
    # SIMD is a builtin type, so no import is needed
    var vec = SIMD[DType.float32, 4](1.0, 2.0, 3.0, 4.0)
    var result = vec * 2.0

    print("SIMD vector operations:")
    print("Original:", vec)
    print("Result:", result)

# 12. Hello World with Mojo's string formatting
fn advanced_string_formatting():
    var name: String = "Mojo"
    var version: Float64 = 0.1
    var features = List[String]("High performance", "Python compatibility", "AI/ML focus")

    # No f-strings in Mojo: build the message by concatenation
    var message: String = name + " v" + str(version) + ": "
    for i in range(len(features)):
        message += features[i]
        if i < len(features) - 1:
            message += ", "
    print(message)

    # Simple two-column output (alignment helpers are not in the stdlib)
    print("Language  | Version")
    print(name + "      |", version)

# Main function demonstrating all examples
fn main():
    print("=== Mojo Hello World Examples ===\n")

    print("1. Basic Hello World:")
    basic_hello()
    print()

    print("2. Types and Variables:")
    hello_with_types()
    print()

    print("3. AI/ML Context:")
    ai_hello_world()
    print()

    print("4. Functions:")
    demonstrate_functions()
    print()

    print("5. Structs:")
    demonstrate_structs()
    print()

    print("6. Collections:")
    demonstrate_collections()
    print()

    print("7. Tensors (AI focus):")
    tensor_hello_world()
    print()

    print("8. Memory Safety:")
    demonstrate_memory_safety()
    print()

    print("9. Concurrency:")
    concurrent_hello()
    print()

    print("10. ML Concepts:")
    ml_hello_world()
    print()

    print("11. SIMD Operations:")
    simd_hello()
    print()

    print("12. String Formatting:")
    advanced_string_formatting()
    print()

    print("=== All Mojo Examples Completed ===")

💻 Mojo Machine Learning Basics mojo

🟡 intermediate ⭐⭐⭐⭐

Basic ML operations in Mojo, including tensors, neural networks, and training loops

⏱️ 35 min 🏷️ mojo, ml, ai, neural networks
Prerequisites: Basic ML concepts, Linear algebra, Mojo basic syntax
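The examples below draw from Mojo's random module; for reproducible runs the generator can be seeded first (a minimal sketch, assuming the stdlib's random.seed):

from random import seed, random_float64

fn main():
    seed(42)  # fixed seed: the same pseudo-random sequence every run
    print(random_float64())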
# Mojo Machine Learning Basics

from tensor import Tensor
from utils.index import Index
from random import random_float64
from math import sqrt, exp, log, sin
from algorithm import parallelize

# 1. Tensor Operations Foundation
fn tensor_basics():
    print("=== Tensor Basics ===")

    # Create tensors with different element types
    # Float tensor
    var float_tensor = Tensor[DType.float32](3, 4)

    # Int tensor
    var int_tensor = Tensor[DType.int32](2, 3)

    # Initialize with values
    for i in range(3):
        for j in range(4):
            float_tensor[Index(i, j)] = Float32(i * 4 + j)

    for i in range(2):
        for j in range(3):
            int_tensor[Index(i, j)] = Int32(i * 3 + j)

    print("Float tensor:")
    print(float_tensor)
    print("Shape:", float_tensor.shape())
    print("Size:", float_tensor.num_elements())

    print("\nInt tensor:")
    print(int_tensor)
    print("Shape:", int_tensor.shape())

# 2. Basic Neural Network Layer
@value
struct LinearLayer:
    var weights: Tensor[DType.float32]
    var bias: Tensor[DType.float32]
    var input_size: Int
    var output_size: Int

    fn __init__(inout self, input_size: Int, output_size: Int):
        self.input_size = input_size
        self.output_size = output_size

        # Initialize weights and bias
        self.weights = Tensor[DType.float32](output_size, input_size)
        self.bias = Tensor[DType.float32](output_size)

        # Random initialization (He/Kaiming-style scale sqrt(2/fan_in))
        var scale = sqrt(2.0 / Float64(input_size))
        for i in range(output_size):
            for j in range(input_size):
                self.weights[Index(i, j)] = Float32((random_float64() - 0.5) * 2.0 * scale)
            self.bias[i] = 0.0

    fn forward(self, input: Tensor[DType.float32]) -> Tensor[DType.float32]:
        # Linear transformation: output = weights @ input + bias
        var output = Tensor[DType.float32](self.output_size)

        for i in range(self.output_size):
            var sum: Float32 = 0.0
            for j in range(self.input_size):
                sum += self.weights[i, j] * input[j]
            output[i] = sum + self.bias[i]

        return output

# 3. Activation Functions
fn relu(x: Tensor[DType.float32]) -> Tensor[DType.float32]:
    var result = Tensor[DType.float32](x.num_elements())
    for i in range(x.num_elements()):
        result[i] = max(Float32(0.0), x[i])
    return result

fn sigmoid(x: Tensor[DType.float32]) -> Tensor[DType.float32]:
    var result = Tensor[DType.float32](x.num_elements())
    for i in range(x.num_elements()):
        result[i] = 1.0 / (1.0 + exp(-x[i]))
    return result

fn softmax(x: Tensor[DType.float32]) -> Tensor[DType.float32]:
    # Subtract the max for numerical stability
    var max_val = x[0]
    for i in range(x.num_elements()):
        if x[i] > max_val:
            max_val = x[i]

    var exp_values = Tensor[DType.float32](x.num_elements())
    var exp_sum: Float32 = 0.0

    for i in range(x.num_elements()):
        exp_values[i] = exp(x[i] - max_val)
        exp_sum += exp_values[i]

    for i in range(x.num_elements()):
        exp_values[i] = exp_values[i] / exp_sum

    return exp_values

# 4. Loss Functions
fn mse_loss(predicted: Tensor[DType.float32], target: Tensor[DType.float32]) -> Float32:
    var sum: Float32 = 0.0
    for i in range(predicted.num_elements()):
        var diff = predicted[i] - target[i]
        sum += diff * diff
    return sum / Float32(predicted.num_elements())

fn cross_entropy_loss(predicted: Tensor[DType.float32], target: Int) -> Float32:
    # Predicted should be softmax probabilities
    return -log(predicted[target] + 1e-10)  # small epsilon for numerical stability

# 5. Simple Neural Network
@value
struct SimpleNeuralNetwork:
    var layer1: LinearLayer
    var layer2: LinearLayer
    var layer3: LinearLayer

    fn __init__(inout self, input_size: Int, hidden_size: Int, output_size: Int):
        self.layer1 = LinearLayer(input_size, hidden_size)
        self.layer2 = LinearLayer(hidden_size, hidden_size)
        self.layer3 = LinearLayer(hidden_size, output_size)

    fn forward(self, input: Tensor[DType.float32]) -> Tensor[DType.float32]:
        # Forward pass through layers
        var hidden1 = self.layer1.forward(input)
        hidden1 = relu(hidden1)

        var hidden2 = self.layer2.forward(hidden1)
        hidden2 = relu(hidden2)

        var output = self.layer3.forward(hidden2)
        output = softmax(output)

        return output

# 6. Training Data Generator
fn generate_xor_data(num_samples: Int) -> Tuple[Tensor[DType.float32], Tensor[DType.int32]]:
    # Generate an XOR dataset
    var inputs = Tensor[DType.float32](num_samples, 2)
    var targets = Tensor[DType.int32](num_samples)

    for i in range(num_samples):
        # Random binary inputs via Python-style conditional expressions
        var x1: Float32 = 1.0 if random_float64() > 0.5 else 0.0
        var x2: Float32 = 1.0 if random_float64() > 0.5 else 0.0
        var target: Int32 = 1 if x1 != x2 else 0  # XOR logic

        inputs[Index(i, 0)] = x1
        inputs[Index(i, 1)] = x2
        targets[i] = target

    return (inputs, targets)

# 7. Training Loop
fn train_network():
    print("\n=== Training Neural Network ===")

    # Create network
    var network = SimpleNeuralNetwork(2, 4, 2)

    # Generate training data (tuple fields accessed by constant index)
    var data = generate_xor_data(1000)
    var inputs = data[0]
    var targets = data[1]

    # Training parameters. Note: this is a forward-only demonstration;
    # no gradients are computed, so the network is evaluated rather
    # than actually fitted.
    var epochs = 100

    print("Training for", epochs, "epochs...")

    var accuracy: Float64 = 0.0
    for epoch in range(epochs):
        var total_loss: Float32 = 0.0
        var correct = 0

        # Evaluate on a 100-sample subset for speed
        for i in range(100):
            # Get a single sample
            var input = Tensor[DType.float32](2)
            input[0] = inputs[i, 0]
            input[1] = inputs[i, 1]

            var target = int(targets[i])

            # Forward pass
            var output = network.forward(input)

            # Calculate loss
            var loss = cross_entropy_loss(output, target)
            total_loss += loss

            # Simple accuracy calculation
            var predicted_class = 0
            if output[1] > output[0]:
                predicted_class = 1

            if predicted_class == target:
                correct += 1

        accuracy = Float64(correct) / 100.0

    print("Training completed! Accuracy:", accuracy)

# 8. Batch Processing
fn batch_operations():
    print("\n=== Batch Operations ===")

    # Create a batch of data
    var batch_size = 32
    var features = 10
    var batch_data = Tensor[DType.float32](batch_size, features)

    # Fill with random data
    for i in range(batch_size):
        for j in range(features):
            batch_data[Index(i, j)] = Float32(random_float64())

    # Parallel batch processing through the raw data pointer,
    # one work item per row
    var ptr = batch_data.data()

    @parameter
    fn process_row(i: Int):
        # Normalize the row so its entries sum to 1
        var sum: Float32 = 0.0
        for j in range(features):
            sum += ptr.load(i * features + j)
        for j in range(features):
            ptr.store(i * features + j, ptr.load(i * features + j) / sum)

    parallelize[process_row](batch_size)

    print("Processed batch of size", batch_size, "with", features, "features")
    print("Sample row after processing: [", end="")
    for j in range(3):
        print(batch_data[0, j], end="")
        if j < 2:
            print(", ", end="")
    print("]")

# 9. Optimization Algorithms
struct GradientDescent:
    var learning_rate: Float32

    fn __init__(inout self, learning_rate: Float32):
        self.learning_rate = learning_rate

    fn update(self, inout param: Tensor[DType.float32], gradient: Tensor[DType.float32]):
        # In-place SGD step: param <- param - lr * gradient
        for i in range(param.num_elements()):
            param[i] = param[i] - self.learning_rate * gradient[i]

# 10. Simple CNN Example (1D convolution)
fn simple_convolution():
    print("\n=== Simple 1D Convolution ===")

    # Input signal
    var signal = Tensor[DType.float32](10)
    for i in range(10):
        signal[i] = sin(Float32(i) * 0.5)

    # Smoothing convolution kernel
    var kernel = Tensor[DType.float32](3)
    kernel[0] = 0.25
    kernel[1] = 0.5
    kernel[2] = 0.25

    # "Valid" convolution output
    var output_size = signal.num_elements() - kernel.num_elements() + 1
    var conv_output = Tensor[DType.float32](output_size)

    for i in range(output_size):
        var sum: Float32 = 0.0
        for j in range(kernel.num_elements()):
            sum += signal[i + j] * kernel[j]
        conv_output[i] = sum

    print("Original signal: [", end="")
    for i in range(5):
        print(signal[i], end="")
        if i < 4:
            print(", ", end="")
    print("]")

    print("Convolved signal: [", end="")
    for i in range(5):
        print(conv_output[i], end="")
        if i < 4:
            print(", ", end="")
    print("]")

# 11. Data Normalization
fn normalize_data(data: Tensor[DType.float32]) -> Tensor[DType.float32]:
    # Calculate mean and standard deviation
    var sum: Float32 = 0.0
    var sum_sq: Float32 = 0.0
    var n = Float32(data.num_elements())

    for i in range(data.num_elements()):
        sum += data[i]
        sum_sq += data[i] * data[i]

    var mean = sum / n
    var variance = (sum_sq / n) - (mean * mean)
    var std = sqrt(max(variance, Float32(1e-8)))

    # Normalize to zero mean and unit variance
    var normalized = Tensor[DType.float32](data.num_elements())
    for i in range(data.num_elements()):
        normalized[i] = (data[i] - mean) / std

    return normalized

fn demonstrate_normalization():
    print("\n=== Data Normalization ===")

    # Create sample data with mean 14 (values 5, 7, ..., 23)
    var data = Tensor[DType.float32](10)
    for i in range(10):
        data[i] = Float32(i) * 2.0 + 5.0

    print("Original data: [", end="")
    for i in range(5):
        print(data[i], end="")
        if i < 4:
            print(", ", end="")
    print("]")

    var normalized = normalize_data(data)
    print("Normalized data: [", end="")
    for i in range(5):
        print(normalized[i], end="")
        if i < 4:
            print(", ", end="")
    print("]")

# Main function demonstrating all ML concepts
fn main():
    print("=== Mojo Machine Learning Basics ===\n")

    tensor_basics()

    # Demonstrate neural network layer
    print("\n=== Neural Network Layer ===")
    var layer = LinearLayer(3, 4)
    var input = Tensor[DType.float32](3)
    for i in range(3):
        input[i] = Float32(i + 1)

    var output = layer.forward(input)
    print("Layer input:", input)
    print("Layer output:", output)

    # Demonstrate activation functions
    print("\n=== Activation Functions ===")
    var test_input = Tensor[DType.float32](5)
    for i in range(5):
        test_input[i] = Float32(i - 2)  # [-2, -1, 0, 1, 2]

    print("Input:", test_input)
    print("ReLU:", relu(test_input))
    print("Sigmoid:", sigmoid(test_input))
    print("Softmax:", softmax(test_input))

    # Demonstrate loss functions
    print("\n=== Loss Functions ===")
    var pred = Tensor[DType.float32](3)
    var target = Tensor[DType.float32](3)
    for i in range(3):
        pred[i] = Float32(i)
        target[i] = Float32(i + 1)

    print("MSE Loss:", mse_loss(pred, target))

    # Training demonstration
    train_network()

    # Batch processing
    batch_operations()

    # Convolution
    simple_convolution()

    # Normalization
    demonstrate_normalization()

    print("\n=== All ML Basics Completed ===")

💻 Mojo High-Performance Computing mojo

🔴 complex ⭐⭐⭐⭐⭐

Advanced Mojo features including SIMD, parallel processing, and memory optimization

⏱️ 40 min 🏷️ mojo, performance, simd, parallel, ai
Prerequisites: Advanced Mojo, Computer architecture, Parallel programming concepts
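The SIMD examples below hard-code vector widths (4, 8, 16 lanes). The width a CPU supports natively varies; a minimal sketch for querying it (assuming the sys.info helpers in the stdlib):

from sys.info import simdwidthof

fn main():
    # Number of float32 lanes per SIMD register on the target CPU
    print("Native float32 SIMD width:", simdwidthof[DType.float32]())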
# Mojo High-Performance Computing for AI

# SIMD is a builtin type and needs no import
from tensor import Tensor
from utils.index import Index
from algorithm import parallelize, vectorize
from memory import memcpy
from memory.unsafe import DTypePointer  # folded into UnsafePointer in newer releases
from math import sqrt, sin, cos, exp
from sys.info import sizeof, simdwidthof
from time import now  # renamed time.perf_counter_ns in newer releases

# 1. SIMD Vector Operations
fn simd_vector_operations():
    print("=== SIMD Vector Operations ===")

    # Create SIMD vectors
    var vec_a = SIMD[DType.float32, 8](1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)
    var vec_b = SIMD[DType.float32, 8](8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0)

    # Elementwise vector arithmetic
    var vec_sum = vec_a + vec_b
    var vec_product = vec_a * vec_b
    var vec_diff = vec_a - vec_b

    print("Vector A:", vec_a)
    print("Vector B:", vec_b)
    print("Sum:", vec_sum)
    print("Product:", vec_product)
    print("Difference:", vec_diff)

    # Horizontal reduction across all lanes
    var sum_elements = vec_a.reduce_add()
    print("Sum of A elements:", sum_elements)

# 2. Matrix Multiplication with SIMD
@value
struct Matrix4x4:
    var data: SIMD[DType.float32, 16]  # 4x4 matrix stored as one SIMD vector

    fn __init__(inout self):
        self.data = SIMD[DType.float32, 16](0.0)

    fn set(inout self, row: Int, col: Int, value: Float32):
        self.data[row * 4 + col] = value

    fn get(self, row: Int, col: Int) -> Float32:
        return self.data[row * 4 + col]

    fn __mul__(self, other: Matrix4x4) -> Matrix4x4:
        var result = Matrix4x4()

        # Straightforward 4x4 multiply; the SIMD storage keeps the
        # whole matrix in a single register-friendly value
        for i in range(4):
            for j in range(4):
                var sum: Float32 = 0.0
                for k in range(4):
                    sum += self.get(i, k) * other.get(k, j)
                result.set(i, j, sum)

        return result

fn demonstrate_matrix_ops():
    print("\n=== Optimized Matrix Operations ===")

    var mat_a = Matrix4x4()
    var mat_b = Matrix4x4()

    # Fill matrices: fixed diagonals plus index-based off-diagonal values
    for i in range(4):
        for j in range(4):
            mat_a.set(i, j, Float32(1.0) if i == j else Float32(i + j))
            mat_b.set(i, j, Float32(2.0) if i == j else Float32(i * j))

    print("Matrix A diagonal sum: 4.0")
    print("Matrix B diagonal sum: 8.0")

    var mat_c = mat_a * mat_b
    print("Matrix multiplication completed; C[0,0] =", mat_c.get(0, 0))

# 3. Parallel Processing Example
fn parallel_processing():
    print("\n=== Parallel Processing ===")

    # Large dataset for parallel processing
    var data_size = 1000000
    var data = Tensor[DType.float32](data_size)

    # Initialize data
    for i in range(data_size):
        data[i] = sin(Float32(i) * 0.001)

    # Sequential processing
    var start_time = now()
    var sequential_sum: Float32 = 0.0
    for i in range(data_size):
        sequential_sum += data[i] * data[i]
    var sequential_time = now() - start_time

    # Parallel processing: one work item per chunk, partial sums
    # written through raw pointers
    start_time = now()
    var num_chunks = 8
    var chunk_size = data_size // num_chunks
    var partial_sums = Tensor[DType.float32](num_chunks)
    var data_ptr = data.data()
    var partial_ptr = partial_sums.data()

    @parameter
    fn process_chunk(chunk_id: Int):
        var local_sum: Float32 = 0.0
        var start = chunk_id * chunk_size
        var end = min(start + chunk_size, data_size)
        for i in range(start, end):
            var v = data_ptr.load(i)
            local_sum += v * v
        partial_ptr.store(chunk_id, local_sum)

    parallelize[process_chunk](num_chunks)

    # Combine partial results
    var parallel_sum: Float32 = 0.0
    for i in range(num_chunks):
        parallel_sum += partial_sums[i]

    var parallel_time = now() - start_time

    print("Sequential result:", sequential_sum)
    print("Parallel result:", parallel_sum)
    print("Sequential time:", sequential_time, "ns")
    print("Parallel time:", parallel_time, "ns")
    print("Speedup:", Float64(sequential_time) / Float64(parallel_time), "x")

# 4. Memory-Optimized Tensor Operations
fn memory_optimized_operations():
    print("\n=== Memory-Optimized Operations ===")

    # Create large tensors
    var size = 1000
    var tensor_a = Tensor[DType.float32](size, size)
    var tensor_b = Tensor[DType.float32](size, size)

    # Fill with test data
    for i in range(size):
        for j in range(size):
            tensor_a[Index(i, j)] = Float32(i * j) / 1000.0
            tensor_b[Index(i, j)] = Float32(i + j) / 1000.0

    # Memory-efficient element-wise operation
    var start_time = now()

    # Process in blocks to improve cache locality
    var block_size = 64
    var result = Tensor[DType.float32](size, size)

    for block_i in range(0, size, block_size):
        for block_j in range(0, size, block_size):
            # Process one block
            for i in range(block_i, min(block_i + block_size, size)):
                for j in range(block_j, min(block_j + block_size, size)):
                    result[Index(i, j)] = tensor_a[i, j] + tensor_b[i, j]

    var processing_time = now() - start_time
    print("Block-based processing time:", processing_time, "ns")
    print("Result[500, 500] =", result[500, 500])

# 5. Vectorized Operations
fn vectorized_operations():
    print("\n=== Vectorized Operations ===")

    # Create large arrays
    var n = 100000
    var array_a = Tensor[DType.float32](n)
    var array_b = Tensor[DType.float32](n)
    var array_c = Tensor[DType.float32](n)

    # Initialize so that a[i] + b[i] == n for every i
    for i in range(n):
        array_a[i] = Float32(i)
        array_b[i] = Float32(n - i)

    # Vectorized addition: algorithm.vectorize calls the closure with a
    # compile-time SIMD width. (Pointer load/store spellings have varied
    # across Mojo releases; this follows the 24.x-era API.)
    var a_ptr = array_a.data()
    var b_ptr = array_b.data()
    var c_ptr = array_c.data()

    @parameter
    fn vectorized_add[width: Int](i: Int):
        c_ptr.store[width](i, a_ptr.load[width](i) + b_ptr.load[width](i))

    alias simd_width = simdwidthof[DType.float32]()
    var start_time = now()
    vectorize[vectorized_add, simd_width](n)
    var vectorized_time = now() - start_time

    # Verify result
    var expected_sum = array_a[0] + array_b[0]
    print("Vectorized addition result[0]:", array_c[0], "(expected:", expected_sum, ")")
    print("Vectorized processing time:", vectorized_time, "ns")

# 6. Custom Kernels for AI Operations
fn custom_ai_kernels():
    print("\n=== Custom AI Kernels ===")

    # ReLU activation kernel operating on raw tensor data
    @always_inline
    fn relu_kernel(input: DTypePointer[DType.float32], output: DTypePointer[DType.float32], size: Int):
        for i in range(size):
            output.store(i, max(Float32(0.0), input.load(i)))

    # Softmax kernel: one pass for exponentials, one to normalize
    @always_inline
    fn softmax_kernel(input: DTypePointer[DType.float32], output: DTypePointer[DType.float32], size: Int):
        # Find the maximum for numerical stability
        var max_val = input.load(0)
        for i in range(1, size):
            if input.load(i) > max_val:
                max_val = input.load(i)

        # Compute exponentials and their sum
        var sum: Float32 = 0.0
        for i in range(size):
            var exp_val = exp(input.load(i) - max_val)
            output.store(i, exp_val)
            sum += exp_val

        # Normalize
        var inv_sum = 1.0 / sum
        for i in range(size):
            output.store(i, output.load(i) * inv_sum)

    # Test the kernels
    var data_size = 1000
    var input_data = Tensor[DType.float32](data_size)
    var relu_output = Tensor[DType.float32](data_size)
    var softmax_output = Tensor[DType.float32](data_size)

    # Fill with values centered around zero
    for i in range(data_size):
        input_data[i] = (Float32(i) - Float32(data_size // 2)) / 100.0

    var start_time = now()
    relu_kernel(input_data.data(), relu_output.data(), data_size)
    var relu_time = now() - start_time

    start_time = now()
    softmax_kernel(input_data.data(), softmax_output.data(), data_size)
    var softmax_time = now() - start_time

    # Softmax outputs should sum to 1
    var softmax_sum: Float32 = 0.0
    for i in range(data_size):
        softmax_sum += softmax_output[i]

    print("ReLU kernel processing time:", relu_time, "ns")
    print("Softmax kernel processing time:", softmax_time, "ns")
    print("ReLU output[500]:", relu_output[500])
    print("Softmax sum:", softmax_sum)

# 7. GPU-Style Tree Reduction
fn parallel_reduction():
    print("\n=== Tree-Based Reduction ===")

    var data_size = 1048576  # 2^20 elements
    var data = Tensor[DType.float32](data_size)

    # Fill with data
    for i in range(data_size):
        data[i] = Float32(i % 100) / 100.0

    # Sequential reduction
    var start_time = now()
    var sequential_sum: Float32 = 0.0
    for i in range(data_size):
        sequential_sum += data[i]
    var sequential_time = now() - start_time

    # Tree-based pairwise reduction, the access pattern GPUs use;
    # run here on one thread, it also improves floating-point accuracy
    start_time = now()
    var temp_data = Tensor[DType.float32](data_size)
    memcpy(temp_data.data(), data.data(), data_size)  # count in elements

    var active_size = data_size
    while active_size > 1:
        for i in range(active_size // 2):
            temp_data[i] = temp_data[2 * i] + temp_data[2 * i + 1]

        # Carry the odd element forward, if any
        if active_size % 2 == 1:
            temp_data[active_size // 2] = temp_data[active_size - 1]

        active_size = (active_size + 1) // 2

    var tree_sum = temp_data[0]
    var tree_time = now() - start_time

    print("Sequential sum:", sequential_sum)
    print("Tree-based sum:", tree_sum)
    print("Sequential time:", sequential_time, "ns")
    print("Tree-based time:", tree_time, "ns")

# 8. Cache-Oblivious Matrix Multiplication
fn cache_oblivious_multiply(A: DTypePointer[DType.float32], B: DTypePointer[DType.float32],
                            C: DTypePointer[DType.float32], n: Int, stride: Int,
                            block_size: Int = 64):
    # Recursive cache-oblivious matrix multiplication. Submatrices are
    # addressed with the full-matrix row stride, and the base case
    # accumulates into C, so C must be zeroed before the first call.
    if n <= block_size:
        # Base case: regular multiplication, accumulated into C
        for i in range(n):
            for j in range(n):
                var sum: Float32 = 0.0
                for k in range(n):
                    sum += A.load(i * stride + k) * B.load(k * stride + j)
                C.store(i * stride + j, C.load(i * stride + j) + sum)
        return

    # Recursive case: split each matrix into four h x h quadrants
    var h = n // 2
    var A11 = A
    var A12 = A.offset(h)
    var A21 = A.offset(h * stride)
    var A22 = A.offset(h * stride + h)
    var B11 = B
    var B12 = B.offset(h)
    var B21 = B.offset(h * stride)
    var B22 = B.offset(h * stride + h)
    var C11 = C
    var C12 = C.offset(h)
    var C21 = C.offset(h * stride)
    var C22 = C.offset(h * stride + h)

    # C11 += A11 * B11 + A12 * B21
    cache_oblivious_multiply(A11, B11, C11, h, stride, block_size)
    cache_oblivious_multiply(A12, B21, C11, h, stride, block_size)

    # C12 += A11 * B12 + A12 * B22
    cache_oblivious_multiply(A11, B12, C12, h, stride, block_size)
    cache_oblivious_multiply(A12, B22, C12, h, stride, block_size)

    # C21 += A21 * B11 + A22 * B21
    cache_oblivious_multiply(A21, B11, C21, h, stride, block_size)
    cache_oblivious_multiply(A22, B21, C21, h, stride, block_size)

    # C22 += A21 * B12 + A22 * B22
    cache_oblivious_multiply(A21, B12, C22, h, stride, block_size)
    cache_oblivious_multiply(A22, B22, C22, h, stride, block_size)

fn demonstrate_cache_optimization():
    print("\n=== Cache-Oblivious Matrix Multiplication ===")

    var size = 512  # must be a power of 2 for this implementation
    var A = Tensor[DType.float32](size, size)
    var B = Tensor[DType.float32](size, size)
    var C = Tensor[DType.float32](size, size)

    # Initialize matrices; C starts zeroed since the multiply accumulates
    for i in range(size):
        for j in range(size):
            A[Index(i, j)] = Float32((i + j) % 100) / 100.0
            B[Index(i, j)] = Float32((i * j) % 100) / 100.0
            C[Index(i, j)] = 0.0

    var start_time = now()
    cache_oblivious_multiply(A.data(), B.data(), C.data(), size, size)
    var cache_time = now() - start_time

    print("Cache-oblivious multiplication time:", cache_time, "ns")
    print("Result[256, 256] =", C[256, 256])

# 9. Memory Pool Allocator for AI Workloads
struct MemoryPool:
    var buffer: DTypePointer[DType.uint8]
    var size: Int
    var offset: Int

    fn __init__(inout self, size: Int):
        self.size = size
        self.buffer = DTypePointer[DType.uint8].alloc(size)
        self.offset = 0

    fn __del__(owned self):
        self.buffer.free()

    fn allocate[type: DType](inout self, count: Int) -> DTypePointer[type]:
        # Bump-pointer allocation with 8-byte alignment
        var elem_size = sizeof[Scalar[type]]()
        var required_bytes = count * elem_size
        var aligned_offset = (self.offset + 7) & ~7

        if aligned_offset + required_bytes > self.size:
            return DTypePointer[type]()  # null pointer: out of memory

        var result = self.buffer.offset(aligned_offset).bitcast[type]()
        self.offset = aligned_offset + required_bytes

        return result

    fn reset(inout self):
        # Reclaim the whole pool in O(1); previously returned
        # pointers become invalid
        self.offset = 0

fn demonstrate_memory_pool():
    print("\n=== Memory Pool Allocator ===")

    var pool = MemoryPool(1024 * 1024)  # 1 MB pool

    # Allocate buffers from the pool
    var tensor1_size = 1000
    var tensor2_size = 2000

    var tensor1 = pool.allocate[DType.float32](tensor1_size)
    var tensor2 = pool.allocate[DType.float32](tensor2_size)

    # Touch the allocations to show they are usable
    tensor1.store(0, 1.0)
    tensor2.store(0, 2.0)
    print("tensor1[0]:", tensor1.load(0), "- tensor2[0]:", tensor2.load(0))
    print("Pool offset:", pool.offset, "bytes")

    # Reset and reuse
    pool.reset()
    var tensor3 = pool.allocate[DType.float32](5000)
    tensor3.store(0, 3.0)
    print("After reset - tensor3[0]:", tensor3.load(0))
    print("New pool offset:", pool.offset, "bytes")

# Main function demonstrating all high-performance features
fn main():
    print("=== Mojo High-Performance Computing ===\n")

    simd_vector_operations()
    demonstrate_matrix_ops()
    parallel_processing()
    memory_optimized_operations()
    vectorized_operations()
    custom_ai_kernels()
    parallel_reduction()
    demonstrate_cache_optimization()
    demonstrate_memory_pool()

    print("\n=== All High-Performance Features Completed ===")