#!/bin/bash
# InfluxDB CSV Data Import Script
# Usage: ./import-csv-data.sh

set -e # Exit on any error

# ANSI escape sequences used by the logging helpers below.
# Stored as literal '\033…' strings; `echo -e` interprets them at print time.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Resolve all paths relative to this script's own location so the script
# works no matter which directory it is invoked from.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
PROJECT_ROOT="$(dirname "$(dirname "$SCRIPT_DIR")")"
SRC_DIR="$PROJECT_ROOT/src"
EXPORTS_BASE_DIR="$SCRIPT_DIR/exports"
# --- Logging helpers ---------------------------------------------------
# Each prints a colored, timestamped line to stdout; error() also aborts.

# Shared timestamp formatter so all helpers stay consistent.
_timestamp() {
    date +'%Y-%m-%d %H:%M:%S'
}

# Success / progress message (green).
log() {
    echo -e "${GREEN}[$(_timestamp)] $1${NC}"
}

# Non-fatal problem (yellow).
warn() {
    echo -e "${YELLOW}[$(_timestamp)] WARNING: $1${NC}"
}

# Fatal problem (red); terminates the script with status 1.
error() {
    echo -e "${RED}[$(_timestamp)] ERROR: $1${NC}"
    exit 1
}

# Informational detail (blue).
info() {
    echo -e "${BLUE}[$(_timestamp)] INFO: $1${NC}"
}
# --- Prerequisite tooling ----------------------------------------------

# The InfluxDB CLI is mandatory; abort immediately when it is absent.
if ! command -v influx >/dev/null 2>&1; then
    error "InfluxDB CLI is not installed. Please install it first: brew install influxdb-cli"
fi

# jq is required to read the appsettings JSON; attempt a Homebrew install
# on the fly, otherwise ask the user to install it manually.
if ! command -v jq >/dev/null 2>&1; then
    warn "jq is not installed. Installing it for JSON parsing..."
    if command -v brew >/dev/null 2>&1; then
        brew install jq || error "Failed to install jq. Please install it manually: brew install jq"
    else
        error "jq is not installed and brew is not available. Please install jq manually."
    fi
fi
echo ""
echo "============================================"
echo " InfluxDB CSV Data Import"
echo "============================================"
echo ""

# Check if exports directory exists
if [ ! -d "$EXPORTS_BASE_DIR" ]; then
    error "Exports directory not found: $EXPORTS_BASE_DIR"
fi

# List available source environments.
# A glob is used instead of parsing `ls` output so directory names
# containing spaces survive intact (the old `ls | xargs basename` split them).
echo "Available export source environments:"
ENVIRONMENTS=()
for dir in "$EXPORTS_BASE_DIR"/*/; do
    [ -d "$dir" ] || continue   # unmatched glob leaves the literal pattern
    ENVIRONMENTS+=("$(basename "$dir")")
done

if [ ${#ENVIRONMENTS[@]} -eq 0 ]; then
    error "No export environments found in: $EXPORTS_BASE_DIR"
fi

# Present a 1-based menu of environments.
for i in "${!ENVIRONMENTS[@]}"; do
    echo "$((i+1))) ${ENVIRONMENTS[$i]}"
done
echo ""
read -r -p "Select source environment (1-${#ENVIRONMENTS[@]}): " ENV_CHOICE

# Validate that the input is numeric BEFORE the arithmetic comparison;
# previously a non-numeric answer made `[` itself error out.
if ! [[ "$ENV_CHOICE" =~ ^[0-9]+$ ]] || [ "$ENV_CHOICE" -lt 1 ] || [ "$ENV_CHOICE" -gt ${#ENVIRONMENTS[@]} ]; then
    error "Invalid choice"
fi

SOURCE_ENV="${ENVIRONMENTS[$((ENV_CHOICE-1))]}"
ENV_EXPORT_DIR="$EXPORTS_BASE_DIR/$SOURCE_ENV"

log "Selected source environment: $SOURCE_ENV"
# List available export timestamps for the chosen environment, newest first.
echo ""
echo "Available exports for $SOURCE_ENV:"
# Collect directory names via a glob (not `ls` parsing) and sort descending
# so the most recent timestamped export appears first.
EXPORTS=()
while IFS= read -r name; do
    [ -n "$name" ] && EXPORTS+=("$name")
done < <(
    for dir in "$ENV_EXPORT_DIR"/*/; do
        [ -d "$dir" ] && basename "$dir"
    done | sort -r
)

if [ ${#EXPORTS[@]} -eq 0 ]; then
    error "No exports found for environment: $SOURCE_ENV"
fi

# Show each export; enrich the menu line with metadata when available.
for i in "${!EXPORTS[@]}"; do
    EXPORT_PATH="$ENV_EXPORT_DIR/${EXPORTS[$i]}"
    METADATA_FILE="$EXPORT_PATH/export-metadata.txt"

    if [ -f "$METADATA_FILE" ]; then
        # `xargs` trims the whitespace cut leaves around the value.
        # (grep may find nothing; the pipeline still exits 0 via xargs.)
        EXPORT_SIZE=$(grep "Export Size:" "$METADATA_FILE" | cut -d: -f2 | xargs)
        DATA_POINTS=$(grep "Data Points:" "$METADATA_FILE" | cut -d: -f2 | xargs)
        EXPORT_DATE=$(grep "Date:" "$METADATA_FILE" | cut -d: -f2- | xargs)
        echo "$((i+1))) ${EXPORTS[$i]} - $EXPORT_DATE ($EXPORT_SIZE, $DATA_POINTS points)"
    else
        echo "$((i+1))) ${EXPORTS[$i]}"
    fi
done
echo ""
read -r -p "Select export to import (1-${#EXPORTS[@]}): " EXPORT_CHOICE

# Reject non-numeric or out-of-range answers with a clean error message
# instead of letting `[` fail on a malformed operand.
if ! [[ "$EXPORT_CHOICE" =~ ^[0-9]+$ ]] || [ "$EXPORT_CHOICE" -lt 1 ] || [ "$EXPORT_CHOICE" -gt ${#EXPORTS[@]} ]; then
    error "Invalid choice"
fi

SELECTED_EXPORT="${EXPORTS[$((EXPORT_CHOICE-1))]}"
IMPORT_FROM_DIR="$ENV_EXPORT_DIR/$SELECTED_EXPORT"

log "Selected export: $SELECTED_EXPORT"
info "Export location: $IMPORT_FROM_DIR"
# Locate the exported CSV. -type f guards against a directory that happens
# to be named *.csv, and -print -quit stops the walk at the first match
# (cheaper than piping everything through `head -1`).
CSV_FILE=$(find "$IMPORT_FROM_DIR" -type f -name "*.csv" -print -quit)

if [ ! -f "$CSV_FILE" ]; then
    error "No CSV file found in: $IMPORT_FROM_DIR"
fi

# Human-readable size for the confirmation summary.
CSV_SIZE=$(du -sh "$CSV_FILE" | cut -f1)
info "CSV file: $(basename "$CSV_FILE") ($CSV_SIZE)"
# --- Target environment selection --------------------------------------
# Maps the user's numeric choice to an environment name and the matching
# appsettings file that holds its InfluxDB connection settings.
echo ""
echo "Select TARGET environment for import:"
echo "1) SandboxLocal"
echo "2) ProductionLocal"
echo ""
read -p "Enter your choice (1 or 2): " TARGET_ENV_CHOICE

if [ "$TARGET_ENV_CHOICE" = "1" ]; then
    TARGET_ENVIRONMENT="SandboxLocal"
    APPSETTINGS_FILE="$SRC_DIR/Managing.Api/appsettings.SandboxLocal.json"
elif [ "$TARGET_ENV_CHOICE" = "2" ]; then
    TARGET_ENVIRONMENT="ProductionLocal"
    APPSETTINGS_FILE="$SRC_DIR/Managing.Api/appsettings.ProductionLocal.json"
else
    error "Invalid choice. Please run the script again and select 1 or 2."
fi

log "Target environment: $TARGET_ENVIRONMENT"
# --- Read InfluxDB connection settings from appsettings -----------------
if [ ! -f "$APPSETTINGS_FILE" ]; then
    error "Configuration file not found: $APPSETTINGS_FILE"
fi

log "Reading configuration from: $APPSETTINGS_FILE"

# require_setting <value> <key-name>
# Aborts when a setting is absent (jq -r prints the literal string "null"
# for missing keys) or empty. Replaces three copy-pasted validation blocks.
require_setting() {
    if [ "$1" = "null" ] || [ -z "$1" ]; then
        error "Failed to parse $2 from configuration file"
    fi
}

# Parse InfluxDB settings from JSON
INFLUX_URL=$(jq -r '.InfluxDb.Url' "$APPSETTINGS_FILE")
INFLUX_ORG=$(jq -r '.InfluxDb.Organization' "$APPSETTINGS_FILE")
INFLUX_TOKEN=$(jq -r '.InfluxDb.Token' "$APPSETTINGS_FILE")

# Validate parsed values
require_setting "$INFLUX_URL" "InfluxDb.Url"
require_setting "$INFLUX_ORG" "InfluxDb.Organization"
require_setting "$INFLUX_TOKEN" "InfluxDb.Token"

info "Target InfluxDB URL: $INFLUX_URL"
info "Organization: $INFLUX_ORG"
# --- Target bucket ------------------------------------------------------
BUCKET_NAME="prices-bucket"

# Verify the bucket exists before importing; offer to create it otherwise.
info "Checking if bucket '$BUCKET_NAME' exists..."
if influx bucket list --host "$INFLUX_URL" --org "$INFLUX_ORG" --token "$INFLUX_TOKEN" --name "$BUCKET_NAME" &>/dev/null; then
    log "✅ Bucket '$BUCKET_NAME' exists"
else
    warn "Bucket '$BUCKET_NAME' does not exist!"
    # -r keeps read from mangling any backslashes in the answer.
    read -r -p "Create the bucket now? (yes/no): " CREATE_BUCKET
    if [ "$CREATE_BUCKET" = "yes" ]; then
        # --retention 0 means "keep data forever".
        influx bucket create \
            --name "$BUCKET_NAME" \
            --retention 0 \
            --host "$INFLUX_URL" \
            --org "$INFLUX_ORG" \
            --token "$INFLUX_TOKEN" || error "Failed to create bucket"
        log "✅ Bucket created successfully"
    else
        error "Cannot proceed without target bucket"
    fi
fi
# --- Final confirmation -------------------------------------------------
# Summarize source, target, and size; only an exact "yes" proceeds.
echo ""
warn "⚠️ IMPORTANT INFORMATION:"
echo " Source: $SOURCE_ENV/$SELECTED_EXPORT"
echo " Target: $TARGET_ENVIRONMENT ($INFLUX_URL)"
echo " Bucket: $BUCKET_NAME"
echo " Data Size: $CSV_SIZE"
warn " This will ADD data to the bucket (existing data will be preserved)"
warn " Duplicate timestamps may cause overwrites"
echo ""
# -r keeps read from interpreting backslashes in the answer.
read -r -p "Are you sure you want to continue? (yes/no): " CONFIRM

if [ "$CONFIRM" != "yes" ]; then
    log "Import cancelled by user"
    exit 0
fi
# --- Perform import -----------------------------------------------------
echo ""
log "🚀 Starting import operation..."
log "This may take several minutes for large files..."
echo ""

# Temp file holding the CSV converted to line protocol; removed on exit.
TEMP_LP_FILE=$(mktemp)
# Single quotes defer expansion until the trap fires, and the inner double
# quotes keep a path containing spaces intact (the old double-quoted trap
# expanded the unquoted variable at definition time — SC2064).
trap 'rm -f "$TEMP_LP_FILE"' EXIT

info "Converting CSV to line protocol format..."
# Convert annotated CSV to line protocol using awk.
# Annotation lines (#group/#datatype/#default) and blank lines are skipped;
# the first surviving row is the header mapping column names to positions.
# FIX: the header rule used to be NR==1, but in an annotated CSV row 1 is an
# annotation that the /^#/ rule has already consumed, so the real header
# (NR > 1) was never captured and field[] lookups degraded to $0. A
# header_done flag captures the first non-annotation row instead, which is
# also correct when the CSV has no annotations at all.
awk -F',' '
BEGIN {OFS=","}
# Skip annotation lines
/^#/ {next}
# Skip empty lines
/^[[:space:]]*$/ {next}
# First non-annotation, non-empty row is the header
!header_done {
    for (i=1; i<=NF; i++) {
        field[$i] = i
    }
    header_done = 1
    next
}
# Process data rows
{
    # Extract values
    time = $field["_time"]
    measurement = $field["_measurement"]
    exchange = $field["exchange"]
    ticker = $field["ticker"]
    timeframe = $field["timeframe"]

    # Skip if essential fields are missing
    if (time == "" || measurement == "" || exchange == "" || ticker == "" || timeframe == "") next

    # Build line protocol
    # Format: measurement,tag1=value1,tag2=value2 field1=value1,field2=value2 timestamp
    printf "%s,exchange=%s,ticker=%s,timeframe=%s ", measurement, exchange, ticker, timeframe

    # Add fields (awk "for (x in arr)" iterates in unspecified order; line
    # protocol does not require a particular field order)
    first = 1
    for (fname in field) {
        if (fname != "_time" && fname != "_start" && fname != "_stop" && fname != "_measurement" &&
            fname != "exchange" && fname != "ticker" && fname != "timeframe" &&
            fname != "result" && fname != "table" && fname != "") {
            val = $field[fname]
            if (val != "" && val != "NaN") {
                if (!first) printf ","
                # Integers get an "i" suffix; anything else is emitted bare.
                # NOTE(review): non-numeric (string) field values would need
                # surrounding double quotes to be valid line protocol —
                # presumably all exported fields are numeric; verify against
                # the export side.
                if (val ~ /^[0-9]+$/) {
                    printf "%s=%si", fname, val
                } else {
                    printf "%s=%s", fname, val
                }
                first = 0
            }
        }
    }

    # Emit the timestamp as-is.
    # NOTE(review): the batch importer below writes with --precision s,
    # which expects epoch seconds; RFC3339 _time values would be rejected
    # per line — confirm the export format (the direct-CSV fallback below
    # handles annotated RFC3339 exports).
    printf " %s\n", time
}
' "$CSV_FILE" > "$TEMP_LP_FILE" 2>/dev/null || {
    # awk failed outright — fall back to letting the influx CLI parse the
    # annotated CSV itself.
    warn "CSV parsing method 1 failed, trying direct import..."

    # Alternative: Use influx write with CSV format directly
    info "Attempting direct CSV import..."

    if influx write \
        --host "$INFLUX_URL" \
        --org "$INFLUX_ORG" \
        --token "$INFLUX_TOKEN" \
        --bucket "$BUCKET_NAME" \
        --format csv \
        --file "$CSV_FILE" 2>&1; then

        log "✅ Import completed successfully using direct CSV method!"

        echo ""
        log "📊 Import Summary"
        echo "============================================"
        info "Source: $SOURCE_ENV/$SELECTED_EXPORT"
        info "Target: $TARGET_ENVIRONMENT"
        info "Bucket: $BUCKET_NAME"
        log "Status: Success"
        echo "============================================"
        echo ""
        # Success via the fallback path — the batch importer is skipped.
        exit 0
    else
        error "Both import methods failed. Please check the error messages above."
    fi
}
# Import the generated line protocol, if the conversion produced anything.
if [ -s "$TEMP_LP_FILE" ]; then
    LP_LINES=$(wc -l < "$TEMP_LP_FILE" | xargs)
    info "Generated $LP_LINES lines of line protocol"

    # Upload in fixed-size slices so a single huge request cannot time out.
    BATCH_SIZE=5000
    offset=0

    info "Importing in batches of $BATCH_SIZE lines..."

    while [ "$offset" -lt "$LP_LINES" ]; do
        slice_end=$((offset + BATCH_SIZE))
        batch_no=$((offset / BATCH_SIZE + 1))
        pct=$((offset * 100 / LP_LINES))

        info "Processing batch $batch_no (Progress: ${pct}%)..."

        # Stream the current slice straight into `influx write`.
        # A failed batch is reported but does not abort the remaining ones.
        sed -n "$((offset + 1)),${slice_end}p" "$TEMP_LP_FILE" |
            influx write \
                --host "$INFLUX_URL" \
                --org "$INFLUX_ORG" \
                --token "$INFLUX_TOKEN" \
                --bucket "$BUCKET_NAME" \
                --precision s 2>&1 || {
            warn "Batch $batch_no had errors, continuing..."
        }

        offset=$slice_end
    done

    log "✅ Import completed successfully!"
else
    error "Failed to generate line protocol data"
fi
# --- Final summary ------------------------------------------------------
divider="============================================"

printf '\n'
echo "$divider"
log "📊 Import Summary"
echo "$divider"
info "Source: $SOURCE_ENV/$SELECTED_EXPORT"
info "Target: $TARGET_ENVIRONMENT"
info "Bucket: $BUCKET_NAME"
info "File: $(basename "$CSV_FILE")"
info "Size: $CSV_SIZE"
log "Status: Complete"
echo "$divider"
printf '\n'

log "🎉 Data successfully imported to $TARGET_ENVIRONMENT!"
printf '\n'

# Show the user a ready-made query to spot-check the imported points.
info "Verify the import with:"
echo " influx query 'from(bucket:\"$BUCKET_NAME\") |> range(start:-1d) |> limit(n:10)' \\"
echo " --host \"$INFLUX_URL\" --org \"$INFLUX_ORG\" --token \"$INFLUX_TOKEN\""
printf '\n'