Data Science Examples

Complete data science examples covering pandas, numpy, and matplotlib for data processing, analysis, and visualization

Key Facts

Category
Data Science
Items
3
Format Families
sample

Sample Overview

Complete data science examples covering pandas, numpy, and matplotlib for data processing, analysis, and visualization. This sample set belongs to Data Science and can be used to test related workflows inside Elysia Tools.

💻 NumPy Fundamentals

🟢 simple ⭐⭐

Fundamentals of numerical computing with NumPy, including array operations, mathematical operations, and linear algebra

⏱️ 30 min 🏷️ numpy, data-science, numerical-computing
Prerequisites: Python basics, Basic mathematics
# NumPy Fundamentals for Data Science

import numpy as np
import matplotlib.pyplot as plt

# 1. Creating Arrays
def array_creation():
    print("=== Array Creation ===")

    # From Python lists
    arr1 = np.array([1, 2, 3, 4, 5])
    print("1D array:", arr1)

    arr2 = np.array([[1, 2, 3], [4, 5, 6]])
    print("2D array:\n", arr2)

    # Using NumPy functions
    zeros = np.zeros((3, 4))
    print("Zeros array:\n", zeros)

    ones = np.ones((2, 3))
    print("Ones array:\n", ones)

    # Special arrays
    identity = np.eye(3)
    print("Identity matrix:\n", identity)

    # Range and linspace
    range_arr = np.arange(0, 10, 2)  # Start, stop, step
    print("Arange:", range_arr)

    linspace = np.linspace(0, 10, 5)  # Start, stop, num_points
    print("Linspace:", linspace)

    # Random arrays
    random_uniform = np.random.random((2, 3))
    print("Random uniform:\n", random_uniform)

    random_normal = np.random.randn(3, 3)
    print("Random normal:\n", random_normal)

# 2. Array Properties and Manipulation
def array_properties():
    print("\n=== Array Properties ===")

    arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])

    print("Array shape:", arr.shape)
    print("Array dimensions:", arr.ndim)
    print("Array size:", arr.size)
    print("Data type:", arr.dtype)

    # Reshaping
    reshaped = arr.reshape(2, 6)
    print("Reshaped array:\n", reshaped)

    # Flattening
    flattened = arr.flatten()
    print("Flattened array:", flattened)

    # Transpose
    transposed = arr.T
    print("Transposed array:\n", transposed)

# 3. Array Indexing and Slicing
def array_indexing():
    print("\n=== Array Indexing and Slicing ===")

    arr = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])

    # Indexing
    print("Element at [1,2]:", arr[1, 2])
    print("First row:", arr[0])
    print("Second column:", arr[:, 1])

    # Slicing
    print("First two rows:\n", arr[:2])
    print("Last two columns:\n", arr[:, 2:])
    print("Sub-array (rows 0-1, cols 1-2):\n", arr[:2, 1:3])

    # Boolean indexing
    mask = arr > 5
    print("Elements > 5:", arr[mask])

    # Fancy indexing
    row_indices = np.array([0, 2])
    col_indices = np.array([1, 3])
    print("Fancy indexing:", arr[row_indices, col_indices])

# 4. Mathematical Operations
def mathematical_operations():
    print("\n=== Mathematical Operations ===")

    arr1 = np.array([1, 2, 3, 4])
    arr2 = np.array([5, 6, 7, 8])

    # Element-wise operations
    print("Addition:", arr1 + arr2)
    print("Multiplication:", arr1 * arr2)
    print("Division:", arr2 / arr1)
    print("Power:", arr1 ** 2)

    # Universal functions (ufuncs)
    print("Square root:", np.sqrt(arr1))
    print("Exponential:", np.exp(arr1))
    print("Logarithm:", np.log(arr2))
    print("Sine:", np.sin(arr1))

    # Array operations
    matrix_a = np.array([[1, 2], [3, 4]])
    matrix_b = np.array([[5, 6], [7, 8]])

    print("Matrix multiplication:\n", np.dot(matrix_a, matrix_b))
    print("Element-wise multiplication:\n", matrix_a * matrix_b)

    # Statistical operations
    data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    print("Mean:", np.mean(data))
    print("Median:", np.median(data))
    print("Standard deviation:", np.std(data))
    print("Variance:", np.var(data))
    print("Min:", np.min(data))
    print("Max:", np.max(data))
    print("Sum:", np.sum(data))
    print("Product:", np.prod(data))

# 5. Linear Algebra Operations
def linear_algebra():
    print("\n=== Linear Algebra Operations ===")

    # Matrix operations
    A = np.array([[1, 2], [3, 4]])
    B = np.array([[5, 6], [7, 8]])

    print("Matrix A:\n", A)
    print("Matrix B:\n", B)

    # Determinant
    print("Determinant of A:", np.linalg.det(A))

    # Inverse
    try:
        inv_A = np.linalg.inv(A)
        print("Inverse of A:\n", inv_A)
    except np.linalg.LinAlgError:
        print("Matrix A is singular")

    # Eigenvalues and eigenvectors
    eigenvalues, eigenvectors = np.linalg.eig(A)
    print("Eigenvalues:", eigenvalues)
    print("Eigenvectors:\n", eigenvectors)

    # Solving linear equations
    # Ax = b
    A_eq = np.array([[2, 1], [1, 3]])
    b_eq = np.array([5, 6])
    x = np.linalg.solve(A_eq, b_eq)
    print("Solution to Ax = b:", x)

# 6. Data Aggregation and Reduction
def data_aggregation():
    print("\n=== Data Aggregation ===")

    # Create sample data
    data = np.random.randint(0, 100, (5, 4))
    print("Sample data:\n", data)

    # Row-wise operations
    print("Row-wise sum:", data.sum(axis=1))
    print("Row-wise mean:", data.mean(axis=1))
    print("Row-wise max:", data.max(axis=1))

    # Column-wise operations
    print("Column-wise sum:", data.sum(axis=0))
    print("Column-wise mean:", data.mean(axis=0))
    print("Column-wise max:", data.max(axis=0))

    # Cumulative operations
    print("Cumulative sum:", np.cumsum(data))
    print("Cumulative product:", np.cumprod(data))

    # Sorting
    print("Sorted array:", np.sort(data.flatten()))
    print("Sorted by rows:", np.sort(data, axis=1))
    print("Sorted by columns:", np.sort(data, axis=0))

# 7. Broadcasting and Shape Manipulation
def broadcasting_examples():
    print("\n=== Broadcasting Examples ===")

    # Basic broadcasting
    a = np.array([[1, 2, 3], [4, 5, 6]])
    b = np.array([10, 20, 30])

    print("Array a:\n", a)
    print("Array b:", b)
    print("Broadcasted addition:\n", a + b)

    # More complex broadcasting
    x = np.array([[1], [2], [3]])  # Shape (3,1)
    y = np.array([10, 20, 30, 40])  # Shape (4,)
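    # Broadcasting rule: shapes are compared from the trailing axis, and a
    # size-1 (or missing) axis is stretched to match, so (3,1) + (4,) -> (3,4)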

    result = x + y  # Results in (3,4) array
    print("Complex broadcasting result:\n", result)

    # Outer product
    outer = np.outer(x.flatten(), y)
    print("Outer product:\n", outer)

# 8. Performance Comparison
def performance_comparison():
    print("\n=== Performance Comparison ===")

    import time

    # Large arrays
    size = 1000000
    list1 = list(range(size))
    list2 = list(range(size, 2*size))

    arr1 = np.arange(size)
    arr2 = np.arange(size, 2*size)

    # Python list operation
    start = time.time()
    result_list = [a + b for a, b in zip(list1, list2)]
    list_time = time.time() - start

    # NumPy operation
    start = time.time()
    result_array = arr1 + arr2
    numpy_time = time.time() - start

    print(f"Python list time: {list_time:.4f} seconds")
    print(f"NumPy array time: {numpy_time:.4f} seconds")
    print(f"Speedup: {list_time/numpy_time:.2f}x")

# 9. Real-world Data Analysis Example
def sales_data_analysis():
    print("\n=== Sales Data Analysis ===")

    # Simulate sales data (products x months)
    np.random.seed(42)
    products = np.array(['Product_A', 'Product_B', 'Product_C', 'Product_D', 'Product_E'])
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']

    # Random sales data
    sales_data = np.random.randint(100, 1000, (5, 6))

    print("Sales data (products x months):\n", sales_data)

    # Basic statistics
    print("\nTotal sales per product:", sales_data.sum(axis=1))
    print("Total sales per month:", sales_data.sum(axis=0))
    print("Best selling product:", products[np.argmax(sales_data.sum(axis=1))])
    print("Best month:", months[np.argmax(sales_data.sum(axis=0))])

    # Monthly growth rate
    monthly_totals = sales_data.sum(axis=0)
    growth_rate = (monthly_totals[1:] - monthly_totals[:-1]) / monthly_totals[:-1] * 100
    print("Monthly growth rates (%):", growth_rate)

    # Product performance categories
    product_totals = sales_data.sum(axis=1)
    avg_sales = np.mean(product_totals)

    high_performers = products[product_totals > avg_sales * 1.2]
    low_performers = products[product_totals < avg_sales * 0.8]

    print("High performers (>120% average):", high_performers)
    print("Low performers (<80% average):", low_performers)

# Run all examples
if __name__ == "__main__":
    print("NumPy Data Science Examples")
    print("=" * 40)

    array_creation()
    array_properties()
    array_indexing()
    mathematical_operations()
    linear_algebra()
    data_aggregation()
    broadcasting_examples()
    performance_comparison()
    sales_data_analysis()
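
A possible complement (not part of the original sample): recent NumPy releases recommend the Generator API over the legacy np.random.* functions used above. A minimal sketch, assuming NumPy 1.17+:

import numpy as np

# A seeded Generator is reproducible and independent of global random state
rng = np.random.default_rng(42)
print(rng.random((2, 3)))            # uniform floats in [0, 1)
print(rng.standard_normal((3, 3)))   # standard normal draws
print(rng.integers(0, 100, (5, 4)))  # integers in [0, 100)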

💻 Data Analysis with Pandas

🟡 intermediate ⭐⭐⭐

Complete pandas workflow covering data manipulation, cleaning, analysis, and time series operations

⏱️ 45 min 🏷️ pandas, data-analysis, data-science
Prerequisites: Python basics, NumPy fundamentals, Basic statistics
# Pandas Data Analysis for Data Science

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Set styling
plt.style.use('default')
sns.set_palette("husl")

# 1. Creating and Loading Data
def data_creation_loading():
    print("=== Data Creation and Loading ===")

    # Create DataFrame from dictionary
    data = {
        'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
        'age': [25, 30, 35, 28, 32],
        'city': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney'],
        'salary': [70000, 80000, 90000, 75000, 85000],
        'department': ['IT', 'Finance', 'IT', 'Marketing', 'Finance']
    }

    df = pd.DataFrame(data)
    print("Created DataFrame:\n", df)

    # Create DataFrame from lists
    names = ['Product_A', 'Product_B', 'Product_C']
    prices = [100, 150, 200]
    quantities = [50, 30, 40]

    df_products = pd.DataFrame({
        'Product': names,
        'Price': prices,
        'Quantity': quantities
    })
    print("\nProducts DataFrame:\n", df_products)

    # Load CSV (simulated)
    # df_csv = pd.read_csv('data.csv')
    # print("Loaded from CSV:\n", df_csv.head())

    # Basic information
    print("\nDataFrame Info:")
    df.info()

    print("\nStatistical Summary:")
    print(df.describe())

# 2. Data Selection and Filtering
def data_selection_filtering():
    print("\n=== Data Selection and Filtering ===")

    # Sample data
    data = {
        'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank', 'Grace'],
        'age': [25, 30, 35, 28, 32, 45, 29],
        'city': ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'London', 'Paris'],
        'salary': [70000, 80000, 90000, 75000, 85000, 95000, 72000],
        'department': ['IT', 'Finance', 'IT', 'Marketing', 'Finance', 'IT', 'Marketing'],
        'join_date': pd.to_datetime(['2020-01-15', '2019-03-20', '2018-07-10',
                                   '2021-02-01', '2020-06-15', '2017-11-30', '2021-08-20'])
    }

    df = pd.DataFrame(data)

    # Column selection
    print("Name and Salary columns:\n", df[['name', 'salary']])

    # Row selection by position
    print("\nFirst 3 rows:\n", df.head(3))
    print("\nRows 2-4:\n", df.iloc[2:5])

    # Row selection by label
    print("\nRows with index 1-3:\n", df.loc[1:3])

    # Conditional filtering
    high_salary = df[df['salary'] > 80000]
    print("\nEmployees with salary > 80k:\n", high_salary)

    # Multiple conditions
    it_dept_30plus = df[(df['department'] == 'IT') & (df['age'] >= 30)]
    print("\nIT employees age 30+:\n", it_dept_30plus)

    # Using isin()
    target_cities = ['New York', 'London', 'Paris']
    eu_employees = df[df['city'].isin(target_cities)]
    print("\nEmployees in target cities:\n", eu_employees)

    # String methods
    print("\nNames starting with 'A':\n", df[df['name'].str.startswith('A')])

# 3. Data Cleaning and Preparation
def data_cleaning():
    print("\n=== Data Cleaning and Preparation ===")

    # Create messy data
    messy_data = {
        'name': ['Alice', 'Bob', None, 'Diana', 'Eve', '  Frank  ', 'Grace'],
        'age': [25, 30, 35, None, 32, 45, 29],
        'salary': [70000, 80000, 90000, 75000, None, 95000, 72000],
        'department': ['IT', 'Finance', 'IT', 'Marketing', 'FINANCE', 'IT', 'marketing'],
        'email': ['alice@email.com', 'bob@email', 'charlie@email.com',
                 None, 'eve@email.com', 'frank@email.com', 'grace@email.com']
    }

    df = pd.DataFrame(messy_data)
    print("Original messy data:\n", df)

    # Handle missing values
    print("\nMissing values count:\n", df.isnull().sum())

    # Drop rows with missing names
    df_clean = df.dropna(subset=['name']).copy()  # copy() avoids SettingWithCopyWarning below

    # Fill missing ages with mean
    df_clean['age'] = df_clean['age'].fillna(df_clean['age'].mean())

    # Fill missing salaries with median
    df_clean['salary'] = df_clean['salary'].fillna(df_clean['salary'].median())

    print("\nAfter handling missing values:\n", df_clean)

    # Clean string data
    df_clean['name'] = df_clean['name'].str.strip()  # Remove whitespace
    df_clean['department'] = df_clean['department'].str.title()  # Title case
    df_clean['email'] = df_clean['email'].str.lower()  # Lowercase

    # Validate email format (simple check)
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    df_clean['valid_email'] = df_clean['email'].str.match(email_pattern)

    print("\nAfter cleaning strings:\n", df_clean)

    # Remove duplicate rows
    df_clean = df_clean.drop_duplicates()
    print("\nAfter removing duplicates (if any):\n", df_clean)

# 4. Data Transformation and Feature Engineering
def data_transformation():
    print("\n=== Data Transformation ===")

    # Sample sales data
    sales_data = {
        'product': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B', 'A'],
        'region': ['North', 'South', 'North', 'East', 'West', 'South', 'North', 'East', 'West'],
        'sales': [1000, 1500, 1200, 800, 2000, 900, 1100, 1800, 700],
        'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03',
                               '2023-01-04', '2023-01-05', '2023-01-06',
                               '2023-01-07', '2023-01-08', '2023-01-09'])
    }

    df = pd.DataFrame(sales_data)

    # Create new features
    df['sales_tax'] = df['sales'] * 0.08  # 8% tax
    df['total_revenue'] = df['sales'] + df['sales_tax']
    df['quarter'] = df['date'].dt.quarter
    df['month'] = df['date'].dt.month
    df['weekday'] = df['date'].dt.day_name()

    # Categorical encoding
    df['product_encoded'] = df['product'].astype('category').cat.codes
    df['region_encoded'] = df['region'].astype('category').cat.codes

    # One-hot encoding
    df_encoded = pd.get_dummies(df, columns=['product', 'region'], prefix=['prod', 'reg'])

    print("Original DataFrame:\n", df.head())
    print("\nOne-hot encoded DataFrame:\n", df_encoded.head())

    # Grouping and aggregation
    product_stats = df.groupby('product')['sales'].agg(['mean', 'sum', 'count', 'std'])
    print("\nProduct statistics:\n", product_stats)

    # Pivot table
    pivot_table = df.pivot_table(values='sales', index='product', columns='region', aggfunc='sum', fill_value=0)
    print("\nPivot table:\n", pivot_table)

# 5. Time Series Analysis
def time_series_analysis():
    print("\n=== Time Series Analysis ===")

    # Create time series data
    date_range = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')
    np.random.seed(42)

    # Simulate sales data with trend and seasonality
    trend = np.linspace(100, 200, len(date_range))
    seasonal = 50 * np.sin(2 * np.pi * np.arange(len(date_range)) / 30)  # Monthly seasonality
    noise = np.random.normal(0, 20, len(date_range))

    sales = trend + seasonal + noise
    sales[sales < 0] = 0  # Ensure non-negative

    df_ts = pd.DataFrame({
        'date': date_range,
        'sales': sales
    })

    df_ts.set_index('date', inplace=True)

    print("Time series data (first 10 days):\n", df_ts.head(10))

    # Resampling
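    # Note: pandas 2.2+ prefers the 'ME' (month-end) alias over 'M' for
    # resample()/date_range(); 'M' still works but may emit a FutureWarning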
    monthly_sales = df_ts.resample('M').sum()
    print("\nMonthly sales:\n", monthly_sales.head())

    weekly_sales = df_ts.resample('W').mean()
    print("\nWeekly average sales:\n", weekly_sales.head())

    # Moving averages
    df_ts['sales_7d_ma'] = df_ts['sales'].rolling(window=7).mean()
    df_ts['sales_30d_ma'] = df_ts['sales'].rolling(window=30).mean()

    print("\nWith moving averages (last 5 days):\n", df_ts.tail(5))

    # Date-based filtering
    q1_sales = df_ts['2023-01':'2023-03']
    print("\nQ1 2023 total sales:", q1_sales['sales'].sum())

    # Lag features
    df_ts['sales_lag_1'] = df_ts['sales'].shift(1)
    df_ts['sales_lag_7'] = df_ts['sales'].shift(7)
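    # shift() fills the first rows of each lag column with NaN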

    print("\nWith lag features (last 3 days):\n",
          df_ts[['sales', 'sales_lag_1', 'sales_lag_7']].tail(3))

# 6. Data Visualization with Pandas
def pandas_visualization():
    print("\n=== Data Visualization ===")

    # Create comprehensive dataset
    np.random.seed(42)

    products = ['A', 'B', 'C', 'D', 'E']
    regions = ['North', 'South', 'East', 'West']
    months = pd.date_range('2023-01-01', '2023-06-30', freq='M')

    data = []
    for month in months:
        for product in products:
            for region in regions:
                base_sales = np.random.randint(500, 2000)
                seasonal_factor = 1 + 0.2 * np.sin(2 * np.pi * month.month / 12)
                sales = int(base_sales * seasonal_factor * np.random.normal(1, 0.1))

                data.append({
                    'date': month,
                    'product': product,
                    'region': region,
                    'sales': max(0, sales)
                })

    df_viz = pd.DataFrame(data)

    # Basic statistics
    print("Dataset shape:", df_viz.shape)
    print("Columns:", df_viz.columns.tolist())
    print("\nSales statistics:\n", df_viz['sales'].describe())

    # Product performance
    product_sales = df_viz.groupby('product')['sales'].sum().sort_values(ascending=False)
    print("\nTotal sales by product:\n", product_sales)

    # Region performance
    region_sales = df_viz.groupby('region')['sales'].sum().sort_values(ascending=False)
    print("\nTotal sales by region:\n", region_sales)

    # Monthly trends
    monthly_sales = df_viz.groupby(df_viz['date'].dt.strftime('%Y-%m'))['sales'].sum()
    print("\nMonthly sales trend:\n", monthly_sales)

    # Correlation analysis (if we had numeric variables)
    numeric_df = df_viz.select_dtypes(include=[np.number])
    if len(numeric_df.columns) > 1:
        print("\nCorrelation matrix:\n", numeric_df.corr())

# 7. Advanced Data Operations
def advanced_operations():
    print("\n=== Advanced Data Operations ===")

    # Merge operations
    employees = pd.DataFrame({
        'emp_id': [1, 2, 3, 4],
        'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
        'dept_id': [101, 102, 101, 103]
    })

    departments = pd.DataFrame({
        'dept_id': [101, 102, 103, 104],
        'dept_name': ['IT', 'Finance', 'Marketing', 'HR'],
        'budget': [1000000, 800000, 600000, 400000]
    })

    # Inner join
    merged_inner = pd.merge(employees, departments, on='dept_id', how='inner')
    print("Inner join:\n", merged_inner)

    # Left join
    merged_left = pd.merge(employees, departments, on='dept_id', how='left')
    print("\nLeft join:\n", merged_left)

    # Concatenation
    q1_data = pd.DataFrame({
        'month': ['Jan', 'Feb', 'Mar'],
        'sales': [1000, 1200, 1100]
    })

    q2_data = pd.DataFrame({
        'month': ['Apr', 'May', 'Jun'],
        'sales': [1300, 1400, 1350]
    })

    half_year = pd.concat([q1_data, q2_data], ignore_index=True)
    print("\nConcatenated data:\n", half_year)

    # Apply custom functions
    performance_data = pd.DataFrame({
        'employee': ['Alice', 'Bob', 'Charlie', 'Diana'],
        'score': [85, 92, 78, 95],
        'projects': [5, 7, 3, 8]
    })

    def calculate_performance(row):
        score_weight = 0.7
        projects_weight = 0.3
        return (row['score'] * score_weight + row['projects'] * projects_weight * 10)

    performance_data['performance_score'] = performance_data.apply(calculate_performance, axis=1)
    print("\nPerformance data with calculated score:\n", performance_data)

    # Window functions
    sales_data = pd.DataFrame({
        'date': pd.date_range('2023-01-01', periods=10),
        'daily_sales': [100, 120, 110, 130, 125, 140, 135, 150, 145, 160]
    })

    sales_data['cumulative_sum'] = sales_data['daily_sales'].cumsum()
    sales_data['rolling_mean_3'] = sales_data['daily_sales'].rolling(window=3).mean()
    sales_data['rolling_std_3'] = sales_data['daily_sales'].rolling(window=3).std()

    print("\nSales data with window functions:\n", sales_data)

# Run all examples
if __name__ == "__main__":
    print("Pandas Data Science Examples")
    print("=" * 40)

    data_creation_loading()
    data_selection_filtering()
    data_cleaning()
    data_transformation()
    time_series_analysis()
    pandas_visualization()
    advanced_operations()

    print("\n" + "=" * 40)
    print("Examples completed successfully!")

💻 Visualization with Matplotlib

🟡 intermediate ⭐⭐⭐

Complete guide to creating publication-quality charts using matplotlib and seaborn

⏱️ 40 min 🏷️ matplotlib, seaborn, visualization, data-science
Prerequisites: Python basics, NumPy, pandas, Basic statistics
# Matplotlib and Seaborn Data Visualization

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Set style and figure parameters
plt.style.use('default')  # or 'seaborn-v0_8', 'ggplot', etc.
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# 1. Basic Plot Types
def basic_plots():
    print("=== Basic Plot Types ===")

    # Sample data
    x = np.linspace(0, 10, 100)
    y1 = np.sin(x)
    y2 = np.cos(x)
    y3 = np.exp(-x/5) * np.sin(x)

    # Line plots
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Simple line plot
    axes[0, 0].plot(x, y1, 'b-', linewidth=2, label='sin(x)')
    axes[0, 0].plot(x, y2, 'r--', linewidth=2, label='cos(x)')
    axes[0, 0].set_title('Trigonometric Functions')
    axes[0, 0].set_xlabel('x')
    axes[0, 0].set_ylabel('y')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)

    # Scatter plot
    np.random.seed(42)
    x_scatter = np.random.randn(100)
    y_scatter = x_scatter * 0.5 + np.random.randn(100) * 0.3
    colors = np.random.rand(100)

    scatter = axes[0, 1].scatter(x_scatter, y_scatter, c=colors, alpha=0.7,
                                 cmap='viridis', s=50)
    axes[0, 1].set_title('Scatter Plot with Color Mapping')
    axes[0, 1].set_xlabel('X values')
    axes[0, 1].set_ylabel('Y values')
    plt.colorbar(scatter, ax=axes[0, 1])

    # Bar plot
    categories = ['A', 'B', 'C', 'D', 'E']
    values = [23, 45, 56, 78, 32]

    bars = axes[1, 0].bar(categories, values, color=['red', 'green', 'blue', 'orange', 'purple'])
    axes[1, 0].set_title('Bar Chart')
    axes[1, 0].set_xlabel('Categories')
    axes[1, 0].set_ylabel('Values')

    # Add value labels on bars
    for bar, value in zip(bars, values):
        axes[1, 0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                       str(value), ha='center', va='bottom')

    # Histogram
    data_hist = np.random.normal(100, 15, 1000)
    axes[1, 1].hist(data_hist, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    axes[1, 1].set_title('Histogram')
    axes[1, 1].set_xlabel('Value')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].axvline(data_hist.mean(), color='red', linestyle='--',
                       label=f'Mean: {data_hist.mean():.2f}')
    axes[1, 1].legend()

    plt.tight_layout()
    plt.show()

# 2. Advanced Plotting Techniques
def advanced_plots():
    print("\n=== Advanced Plotting Techniques ===")

    # Create sample dataset
    np.random.seed(42)
    n_points = 200

    # Multiple subplots with different types
    fig = plt.figure(figsize=(16, 12))

    # Create grid layout
    gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

    # 2D density plot
    ax1 = fig.add_subplot(gs[0, 0])
    x_2d = np.random.multivariate_normal([0, 0], [[1, 0.5], [0.5, 1]], 1000)
    hb = ax1.hexbin(x_2d[:, 0], x_2d[:, 1], gridsize=20, cmap='Blues')
    ax1.set_title('2D Hexbin Density Plot')
    ax1.set_xlabel('X')
    ax1.set_ylabel('Y')
    plt.colorbar(hb, ax=ax1)

    # Box plot with multiple groups
    ax2 = fig.add_subplot(gs[0, 1])
    data_box = [np.random.normal(0, std, 100) for std in range(1, 4)]
    bp = ax2.boxplot(data_box, patch_artist=True, labels=['Group 1', 'Group 2', 'Group 3'])
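    # Note: Matplotlib 3.9+ renames boxplot's 'labels' parameter to
    # 'tick_labels'; adjust if you see a deprecation warning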
    colors = ['lightblue', 'lightgreen', 'lightpink']
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
    ax2.set_title('Box Plot Comparison')
    ax2.set_ylabel('Values')

    # Violin plot
    ax3 = fig.add_subplot(gs[0, 2])
    data_violin = [np.random.normal(0, std, 100) for std in range(1, 4)]
    vp = ax3.violinplot(data_violin, positions=[1, 2, 3], showmeans=True)
    ax3.set_title('Violin Plot')
    ax3.set_xticks([1, 2, 3])
    ax3.set_xticklabels(['Group 1', 'Group 2', 'Group 3'])
    ax3.set_ylabel('Values')

    # Error bar plot
    ax4 = fig.add_subplot(gs[1, 0])
    x_err = np.arange(5)
    y_err = np.array([20, 35, 30, 45, 40])
    y_err_values = np.array([2, 3, 4, 2, 3])
    ax4.errorbar(x_err, y_err, yerr=y_err_values, fmt='o-',
                capsize=5, capthick=2, linewidth=2)
    ax4.set_title('Error Bar Plot')
    ax4.set_xlabel('Category')
    ax4.set_ylabel('Mean ± Error')

    # Stacked area plot
    ax5 = fig.add_subplot(gs[1, 1])
    x_area = np.arange(10)
    y1_area = np.random.randint(1, 5, 10)
    y2_area = np.random.randint(1, 5, 10)
    y3_area = np.random.randint(1, 5, 10)

    ax5.stackplot(x_area, y1_area, y2_area, y3_area,
                 labels=['Series 1', 'Series 2', 'Series 3'],
                 colors=['skyblue', 'lightgreen', 'lightcoral'])
    ax5.set_title('Stacked Area Plot')
    ax5.set_xlabel('X')
    ax5.set_ylabel('Cumulative Values')
    ax5.legend()

    # Polar plot
    ax6 = fig.add_subplot(gs[1, 2], projection='polar')
    theta = np.linspace(0, 2*np.pi, 100)
    r = np.abs(np.sin(theta) * np.cos(2*theta))
    ax6.plot(theta, r, 'b-', linewidth=2)
    ax6.fill(theta, r, alpha=0.3)
    ax6.set_title('Polar Plot')

    # Heatmap
    ax7 = fig.add_subplot(gs[2, :2])
    data_heatmap = np.random.randn(10, 12)
    im = ax7.imshow(data_heatmap, cmap='coolwarm', aspect='auto')
    ax7.set_title('Heatmap')
    ax7.set_xlabel('Columns')
    ax7.set_ylabel('Rows')
    plt.colorbar(im, ax=ax7)

    # Pie chart
    ax8 = fig.add_subplot(gs[2, 2])
    sizes = [30, 25, 20, 15, 10]
    labels = ['A', 'B', 'C', 'D', 'E']
    colors_pie = ['gold', 'lightcoral', 'lightskyblue', 'lightgreen', 'plum']
    explode = (0.1, 0, 0, 0, 0)  # explode first slice

    wedges, texts, autotexts = ax8.pie(sizes, explode=explode, labels=labels, colors=colors_pie,
                                       autopct='%1.1f%%', shadow=True, startangle=90)
    ax8.set_title('Pie Chart')

    # Enhance text appearance
    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_weight('bold')

    plt.suptitle('Advanced Visualization Techniques', fontsize=16, y=1.02)
    plt.show()

# 3. Seaborn Statistical Plots
def seaborn_plots():
    print("\n=== Seaborn Statistical Plots ===")

    # Create sample datasets
    np.random.seed(42)

    # Dataset for regression plots
    x_reg = np.linspace(0, 10, 100)
    y_reg = 2 * x_reg + 1 + np.random.normal(0, 2, 100)

    # Dataset for categorical plots
    categories = ['A', 'B', 'C', 'D']
    data_cat = []
    for cat in categories:
        data_cat.extend([(cat, np.random.normal(100 + categories.index(cat) * 10, 15))
                        for _ in range(50)])

    df_cat = pd.DataFrame(data_cat, columns=['Category', 'Value'])

    # Dataset for distribution plots
    data_dist = np.random.gamma(2, 2, 1000)

    # Create comprehensive seaborn plot
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # Regression plot with confidence interval
    sns.regplot(x=x_reg, y=y_reg, ax=axes[0, 0],
                scatter_kws={'alpha': 0.6}, line_kws={'color': 'red'})
    axes[0, 0].set_title('Regression Plot with Confidence Interval')

    # Box plot with swarm overlay
    sns.boxplot(x='Category', y='Value', data=df_cat, ax=axes[0, 1])
    sns.swarmplot(x='Category', y='Value', data=df_cat, ax=axes[0, 1],
                  color='black', alpha=0.5, size=4)
    axes[0, 1].set_title('Box Plot with Swarm Overlay')

    # Violin plot
    sns.violinplot(x='Category', y='Value', data=df_cat, ax=axes[0, 2])
    axes[0, 2].set_title('Violin Plot by Category')

    # Distribution plot (histogram + KDE)
    sns.histplot(data_dist, kde=True, ax=axes[1, 0])
    axes[1, 0].set_title('Distribution Plot with KDE')

    # Create synthetic multivariate data for correlation analysis
    np.random.seed(42)
    n_samples = 200
    corr_data = {
        'Variable1': np.random.normal(0, 1, n_samples),
        'Variable2': np.random.normal(0, 1, n_samples),
        'Variable3': np.random.normal(0, 1, n_samples)
    }
    # Add some correlation
    corr_data['Variable2'] = 0.7 * corr_data['Variable1'] + 0.3 * corr_data['Variable2']

    df_corr = pd.DataFrame(corr_data)

    # Correlation heatmap
    correlation_matrix = df_corr.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, ax=axes[1, 1])
    axes[1, 1].set_title('Correlation Heatmap')

    # Count plot
    count_data = np.random.choice(categories, 100)
    sns.countplot(x=count_data, ax=axes[1, 2])
    axes[1, 2].set_title('Count Plot')
    axes[1, 2].set_xlabel('Category')

    plt.tight_layout()
    plt.show()

# 4. Real-world Data Visualization
def real_world_visualization():
    print("\n=== Real-world Data Visualization ===")

    # Create realistic sales dataset
    np.random.seed(42)

    # Time series sales data
    dates = pd.date_range('2022-01-01', '2023-12-31', freq='D')
    base_sales = 1000
    trend = np.linspace(0, 500, len(dates))
    seasonal = 200 * np.sin(2 * np.pi * np.arange(len(dates)) / 365.25)
    noise = np.random.normal(0, 50, len(dates))

    sales = base_sales + trend + seasonal + noise
    sales[sales < 0] = 0  # Ensure non-negative

    df_sales = pd.DataFrame({
        'date': dates,
        'sales': sales
    })

    # Create monthly aggregations
    df_monthly = df_sales.set_index('date').resample('M').agg({
        'sales': ['sum', 'mean', 'std']
    }).reset_index()
    df_monthly.columns = ['date', 'total_sales', 'avg_sales', 'sales_std']

    # Create comprehensive dashboard
    fig = plt.figure(figsize=(20, 15))
    gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

    # 1. Time series plot with trend
    ax1 = fig.add_subplot(gs[0, :])
    ax1.plot(df_sales['date'], df_sales['sales'], alpha=0.7, linewidth=1, label='Daily Sales')

    # Add moving average
    df_sales['MA30'] = df_sales['sales'].rolling(window=30).mean()
    ax1.plot(df_sales['date'], df_sales['MA30'], 'r-', linewidth=2, label='30-day MA')

    ax1.set_title('Daily Sales with Moving Average', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Sales')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Monthly sales bar chart
    ax2 = fig.add_subplot(gs[1, 0])
    monthly_totals = df_sales.set_index('date').resample('M').sum()
    ax2.bar(range(len(monthly_totals)), monthly_totals['sales'],
            color=plt.cm.viridis(np.linspace(0, 1, len(monthly_totals))))
    ax2.set_title('Monthly Total Sales')
    ax2.set_xlabel('Month')
    ax2.set_ylabel('Total Sales')
    ax2.set_xticks(range(0, len(monthly_totals), 3))
    ax2.set_xticklabels([month.strftime('%Y-%m') for month in monthly_totals.index[::3]],
                       rotation=45)

    # 3. Sales distribution
    ax3 = fig.add_subplot(gs[1, 1])
    ax3.hist(df_sales['sales'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    ax3.axvline(df_sales['sales'].mean(), color='red', linestyle='--',
               label=f'Mean: {df_sales["sales"].mean():.0f}')
    ax3.axvline(df_sales['sales'].median(), color='orange', linestyle='--',
               label=f'Median: {df_sales["sales"].median():.0f}')
    ax3.set_title('Sales Distribution')
    ax3.set_xlabel('Sales')
    ax3.set_ylabel('Frequency')
    ax3.legend()

    # 4. Monthly statistics
    ax4 = fig.add_subplot(gs[1, 2])
    months = [month.strftime('%Y-%m') for month in monthly_totals.index]
    ax4.plot(monthly_totals.index, monthly_totals['sales'], 'o-', label='Monthly Sales')
    ax4.fill_between(monthly_totals.index, monthly_totals['sales'], alpha=0.3)
    ax4.set_title('Monthly Sales Trend')
    ax4.set_xlabel('Month')
    ax4.set_ylabel('Sales')
    # The x-axis is datetime here, so tick positions must be dates, not integers
    ax4.set_xticks(monthly_totals.index[::3])
    ax4.set_xticklabels(months[::3], rotation=45)
    ax4.legend()

    # 5. Seasonal pattern
    ax5 = fig.add_subplot(gs[2, 0])
    df_sales['month'] = df_sales['date'].dt.month
    monthly_avg = df_sales.groupby('month')['sales'].mean()
    ax5.bar(monthly_avg.index, monthly_avg.values, color='lightgreen')
    ax5.set_title('Average Sales by Month')
    ax5.set_xlabel('Month')
    ax5.set_ylabel('Average Sales')
    ax5.set_xticks(range(1, 13))
    ax5.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

    # 6. Day of week analysis
    ax6 = fig.add_subplot(gs[2, 1])
    df_sales['day_of_week'] = df_sales['date'].dt.day_name()
    dow_avg = df_sales.groupby('day_of_week')['sales'].mean()
    dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    dow_avg = dow_avg.reindex(dow_order)

    bars = ax6.bar(dow_avg.index, dow_avg.values, color='coral')
    ax6.set_title('Average Sales by Day of Week')
    ax6.set_xlabel('Day of Week')
    ax6.set_ylabel('Average Sales')
    ax6.tick_params(axis='x', rotation=45)

    # Add value labels on bars
    for bar, value in zip(bars, dow_avg.values):
        ax6.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 10,
                f'{value:.0f}', ha='center', va='bottom')

    # 7. Sales heatmap (by month and day of week)
    ax7 = fig.add_subplot(gs[2, 2])
    df_sales['year_month'] = df_sales['date'].dt.to_period('M')
    heatmap_data = df_sales.groupby(['year_month', 'day_of_week'])['sales'].mean().unstack()
    heatmap_data = heatmap_data.reindex(columns=dow_order)

    sns.heatmap(heatmap_data.T, cmap='YlOrRd', ax=ax7, cbar_kws={'label': 'Average Sales'})
    ax7.set_title('Sales Heatmap (Month vs Day of Week)')
    ax7.set_xlabel('Month')
    ax7.set_ylabel('Day of Week')

    plt.suptitle('Sales Analytics Dashboard', fontsize=16, fontweight='bold', y=1.02)
    plt.show()

# 5. Publication Quality Plots
def publication_quality():
    print("\n=== Publication Quality Plots ===")

    # Create high-quality scientific plot
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Plot 1: Comparison of methods with error bars
    methods = ['Method A', 'Method B', 'Method C', 'Method D']
    means = [85.2, 78.9, 92.1, 88.7]
    stds = [3.2, 4.1, 2.8, 3.5]

    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
    bars = ax1.bar(methods, means, yerr=stds, capsize=5,
                   color=colors, alpha=0.8, edgecolor='black', linewidth=1.2)

    ax1.set_title('Algorithm Performance Comparison', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Accuracy (%)', fontsize=12)
    ax1.set_ylim(0, 100)
    ax1.grid(True, alpha=0.3, axis='y')

    # Add significance markers
    significance = ['***', 'ns', '****', '**']
    for i, (bar, sig) in enumerate(zip(bars, significance)):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + stds[i] + 2,
                sig, ha='center', va='bottom', fontsize=12, fontweight='bold')

    # Plot 2: ROC curves
    fpr = np.linspace(0, 1, 100)

    # Simulate ROC curves for different models: tpr = fpr**p has AUC = 1/(1+p),
    # so each exponent is chosen to match the AUC quoted in its label
    tpr_model1 = fpr ** 0.099   # AUC ~ 0.91
    tpr_model2 = fpr ** 0.220   # AUC ~ 0.82
    tpr_model3 = fpr ** 0.316   # AUC ~ 0.76

    ax2.plot(fpr, tpr_model1, 'b-', linewidth=2, label='Model 1 (AUC = 0.91)')
    ax2.plot(fpr, tpr_model2, 'r-', linewidth=2, label='Model 2 (AUC = 0.82)')
    ax2.plot(fpr, tpr_model3, 'g-', linewidth=2, label='Model 3 (AUC = 0.76)')
    ax2.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random')

    ax2.set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
    ax2.set_xlabel('False Positive Rate', fontsize=12)
    ax2.set_ylabel('True Positive Rate', fontsize=12)
    ax2.legend(loc='lower right')
    ax2.grid(True, alpha=0.3)

    # Style improvements for publication
    for ax in [ax1, ax2]:
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_linewidth(1.2)
        ax.spines['bottom'].set_linewidth(1.2)
        ax.tick_params(axis='both', which='major', labelsize=10)

    plt.tight_layout()
    plt.show()

# Main execution
if __name__ == "__main__":
    print("Matplotlib and Seaborn Visualization Examples")
    print("=" * 50)

    basic_plots()
    advanced_plots()
    seaborn_plots()
    real_world_visualization()
    publication_quality()

    print("\n" + "=" * 50)
    print("All visualization examples completed successfully!")