# JSON Output

All DataSpoc CLI commands support `--output json` for machine-readable output. Use this in shell scripts, CI/CD pipelines, or any automation tool that can parse JSON.
## Lens Commands
### List tables

```bash
dataspoc-lens catalog --output json
```

```json
{
  "tables": [
    {
      "name": "raw.my_source.orders",
      "row_count": 125000,
      "columns": 12,
      "last_updated": "2025-01-17T14:30:00Z",
      "size_bytes": 4521984
    },
    {
      "name": "raw.my_source.customers",
      "row_count": 8500,
      "columns": 8,
      "last_updated": "2025-01-17T14:30:00Z",
      "size_bytes": 312576
    }
  ]
}
```
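When jq isn't available, the same kind of filtering is easy in Python. A minimal sketch that sums table sizes and finds the largest table; the catalog payload is inlined (abridged from the sample above) so the snippet runs standalone:

```python
import json

# Abridged `dataspoc-lens catalog --output json` payload from above.
payload = json.loads("""
{"tables": [
  {"name": "raw.my_source.orders", "row_count": 125000, "size_bytes": 4521984},
  {"name": "raw.my_source.customers", "row_count": 8500, "size_bytes": 312576}
]}
""")

# Total on-disk footprint and the largest table.
total_bytes = sum(t["size_bytes"] for t in payload["tables"])
largest = max(payload["tables"], key=lambda t: t["size_bytes"])
print(f"{len(payload['tables'])} tables, {total_bytes} bytes; largest: {largest['name']}")
```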
### Run a SQL query

```bash
dataspoc-lens query "SELECT customer, SUM(revenue) as total FROM raw.my_source.orders GROUP BY customer ORDER BY total DESC LIMIT 3" --output json
```

```json
{
  "columns": ["customer", "total"],
  "rows": [
    ["Globex Inc", 28000],
    ["Acme Corp", 19200],
    ["Initech", 12000]
  ],
  "row_count": 3,
  "elapsed_ms": 42
}
```
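Query results arrive as parallel `columns` and `rows` arrays rather than one object per row, so a common first step in scripts is zipping them together. A sketch using the sample response above, inlined so it runs standalone:

```python
import json

# Sample `dataspoc-lens query ... --output json` response from above.
response = json.loads("""
{"columns": ["customer", "total"],
 "rows": [["Globex Inc", 28000], ["Acme Corp", 19200], ["Initech", 12000]],
 "row_count": 3, "elapsed_ms": 42}
""")

# Pair each row with the column names to get one dict per row.
records = [dict(zip(response["columns"], row)) for row in response["rows"]]
print(records[0])  # {'customer': 'Globex Inc', 'total': 28000}
```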
### Ask a natural language question

```bash
dataspoc-lens ask "top customers by revenue" --output json
```

```json
{
  "question": "top customers by revenue",
  "sql": "SELECT customer, SUM(revenue) as total_revenue FROM raw.my_source.orders GROUP BY customer ORDER BY total_revenue DESC LIMIT 10",
  "columns": ["customer", "total_revenue"],
  "rows": [
    ["Globex Inc", 28000],
    ["Acme Corp", 19200],
    ["Initech", 12000]
  ],
  "row_count": 3,
  "elapsed_ms": 187
}
```
### Cache status

```bash
dataspoc-lens cache --list --output json
```

```json
{
  "tables": [
    {
      "name": "raw.my_source.orders",
      "cached": true,
      "stale": false,
      "cache_size_bytes": 4521984,
      "cached_at": "2025-01-17T14:30:00Z",
      "source_updated_at": "2025-01-17T14:30:00Z"
    },
    {
      "name": "raw.my_source.customers",
      "cached": true,
      "stale": true,
      "cache_size_bytes": 312576,
      "cached_at": "2025-01-16T10:00:00Z",
      "source_updated_at": "2025-01-17T14:30:00Z"
    }
  ]
}
```
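The `stale` flag is what automation usually cares about: it marks tables whose source has changed since they were cached. A minimal sketch that collects stale tables from an abridged version of the sample output above:

```python
import json

# Abridged `dataspoc-lens cache --list --output json` payload from above.
status = json.loads("""
{"tables": [
  {"name": "raw.my_source.orders", "cached": true, "stale": false},
  {"name": "raw.my_source.customers", "cached": true, "stale": true}
]}
""")

# Collect the names of cached tables whose source has moved on.
stale = [t["name"] for t in status["tables"] if t["cached"] and t["stale"]]
print("stale tables:", ", ".join(stale) or "none")
```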
## Pipe Commands
### Pipeline status

```bash
dataspoc-pipe status --output json
```

```json
{
  "pipelines": [
    {
      "name": "my-source",
      "status": "success",
      "last_run": "2025-01-17T14:30:00Z",
      "rows_synced": 125000,
      "tables": 5,
      "duration_seconds": 45
    }
  ]
}
```
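In CI, the typical pattern is to fail the job when any pipeline is unhealthy. A sketch against an abridged version of the sample payload above; treating anything other than `"success"` as a failure is an assumption, since the full set of status values isn't documented here:

```python
import json
import sys

# Abridged `dataspoc-pipe status --output json` payload from above.
status = json.loads("""
{"pipelines": [
  {"name": "my-source", "status": "success", "rows_synced": 125000}
]}
""")

# Assumption: any status other than "success" counts as a failure.
failed = [p["name"] for p in status["pipelines"] if p["status"] != "success"]
if failed:
    print("failed pipelines:", ", ".join(failed))
    sys.exit(1)
print("all pipelines healthy")
```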
### Pipeline logs

```bash
dataspoc-pipe logs my-source --output json
```

```json
{
  "pipeline": "my-source",
  "entries": [
    {
      "timestamp": "2025-01-17T14:30:00Z",
      "level": "info",
      "message": "Starting pipeline my-source"
    },
    {
      "timestamp": "2025-01-17T14:30:15Z",
      "level": "info",
      "message": "Extracted 125000 rows from orders"
    },
    {
      "timestamp": "2025-01-17T14:30:45Z",
      "level": "info",
      "message": "Pipeline completed successfully"
    }
  ]
}
```
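Each log entry carries a `level` field, which makes it easy to surface only problems in automation. A sketch that filters out informational entries from an abridged version of the sample output above (the sample contains only `info` entries, so the filter comes back empty; the `warning`/`error` level names are an assumption):

```python
import json

# Abridged `dataspoc-pipe logs my-source --output json` payload from above.
logs = json.loads("""
{"pipeline": "my-source", "entries": [
  {"timestamp": "2025-01-17T14:30:00Z", "level": "info",
   "message": "Starting pipeline my-source"},
  {"timestamp": "2025-01-17T14:30:45Z", "level": "info",
   "message": "Pipeline completed successfully"}
]}
""")

# Keep only entries at a problem level (assumed level names).
problems = [e for e in logs["entries"] if e["level"] in ("warning", "error")]
print(f"{len(problems)} problem entries out of {len(logs['entries'])}")
```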
### Bucket manifest

```bash
dataspoc-pipe manifest --output json
```

```json
{
  "version": "1.0",
  "bucket": "s3://my-data",
  "tables": [
    {
      "path": "raw/my-source/orders",
      "format": "parquet",
      "row_count": 125000,
      "partitions": ["dt"],
      "schema": {
        "columns": [
          {"name": "order_id", "type": "int64"},
          {"name": "customer", "type": "string"},
          {"name": "revenue", "type": "float64"},
          {"name": "dt", "type": "date"}
        ]
      }
    }
  ]
}
```
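The manifest's nested `schema` block is handy for building quick column lookups, for example when generating downstream table definitions. A sketch that flattens it into a name-to-type mapping, using an abridged version of the sample manifest above:

```python
import json

# Abridged `dataspoc-pipe manifest --output json` payload from above.
manifest = json.loads("""
{"version": "1.0", "bucket": "s3://my-data", "tables": [
  {"path": "raw/my-source/orders", "format": "parquet",
   "schema": {"columns": [
     {"name": "order_id", "type": "int64"},
     {"name": "customer", "type": "string"},
     {"name": "revenue", "type": "float64"},
     {"name": "dt", "type": "date"}
   ]}}
]}
""")

for table in manifest["tables"]:
    # Flatten the schema into a simple name -> type dict.
    schema = {c["name"]: c["type"] for c in table["schema"]["columns"]}
    print(table["path"], schema)
```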
### Validate pipeline

```bash
dataspoc-pipe validate my-source --output json
```

```json
{
  "pipeline": "my-source",
  "valid": true,
  "errors": [],
  "warnings": [
    "Table 'legacy_orders' has no primary key configured"
  ]
}
```
## Using JSON Output in Scripts

### Bash with jq
Section titled “Bash with jq”# Get row count for a specific tabledataspoc-lens catalog --output json | jq '.tables[] | select(.name == "raw.my_source.orders") | .row_count'
# Check if any pipeline faileddataspoc-pipe status --output json | jq '.pipelines[] | select(.status == "failed") | .name'
# Run pipeline only if validation passesif dataspoc-pipe validate my-source --output json | jq -e '.valid' > /dev/null; then dataspoc-pipe run my-sourcefiPython with subprocess
### Python with subprocess

```python
import json
import subprocess

result = subprocess.run(
    ["dataspoc-lens", "catalog", "--output", "json"],
    capture_output=True,
    text=True,
)
catalog = json.loads(result.stdout)

for table in catalog["tables"]:
    print(f"{table['name']}: {table['row_count']} rows")
```