Athena - Part 4 - Analytics API

Setting up the analytics API to query the data

Steve Clements

April 16, 2023

Steve Clements

Cover Image for Athena - Part 4 - Analytics API

Back to Part 3 - Data Transformation

Part 4 - Athena Analytics API

or how to use the aws sdk version 3 to query Athena.

There are a couple of ways to do this. Firstly, are the results of the query going to returned to the user or is it part of a process that will be used by another service?

If the results are going to be returned to the user then the API needs to be able to handle the query and return the results. If the results are going to be used by another service then the API just needs to be able to handle the query and return the query id.

For this example we're going to return the results to the user.

Setting up the API

Before we write the code we need to set up the API. This is done using terraform:

resource "aws_api_gateway_rest_api" "analytics_api" {
  name        = "analytics_api"
  description = "Analytics API"
}

resource "aws_api_gateway_resource" "analytics_api" {
  rest_api_id = aws_api_gateway_rest_api.analytics.id
  parent_id   = aws_api_gateway_rest_api.analytics.root_resource_id
  path_part   = "analytics"
}

resource "aws_api_gateway_method" "analytics_api" {
  rest_api_id      = aws_api_gateway_rest_api.analytics_api.id
  resource_id      = aws_api_gateway_resource.analytics_api.id
  http_method      = "GET"
  authorization    = "NONE"
  api_key_required = true // protected by an API key

  request_validator_id = aws_api_gateway_request_validator.analytics_api.id
}

resource "aws_api_gateway_integration" "analytics_api" {
  rest_api_id             = aws_api_gateway_rest_api.analytics_api.id
  resource_id             = aws_api_gateway_resource.analytics_api.id
  http_method             = aws_api_gateway_method.analytics_api.http_method
  integration_http_method = "POST"
  type                    = "AWS_PROXY"
  uri                     = aws_lambda_function.analytics_api.invoke_arn
}

resource "aws_api_gateway_method" "analytics_api_options" {
  rest_api_id   = aws_api_gateway_rest_api.analytics_api.id
  resource_id   = aws_api_gateway_resource.analytics_api.id
  http_method   = "OPTIONS"
  authorization = "NONE"

  request_validator_id = aws_api_gateway_request_validator.analytics_api.id
}

resource "aws_api_gateway_integration" "analytics_api_options" {
  rest_api_id = aws_api_gateway_rest_api.analytics_api.id
  resource_id = aws_api_gateway_resource.analytics_api.id
  http_method = aws_api_gateway_method.analytics_api_options.http_method
  type        = "MOCK"
  request_templates = {
    "application/json" = jsonencode(
      {
        statusCode = 200
      }
    )
  }
}

resource "aws_api_gateway_method_response" "analytics_api_options" {
  rest_api_id = aws_api_gateway_rest_api.analytics_api.id
  resource_id = aws_api_gateway_resource.analytics_api.id
  http_method = aws_api_gateway_method.analytics_api_options.http_method
  status_code = 200

  response_parameters = {
    "method.response.header.Access-Control-Allow-Headers" = true,
    "method.response.header.Access-Control-Allow-Methods" = true,
    "method.response.header.Access-Control-Allow-Origin"  = true
  }
}

resource "aws_api_gateway_integration_response" "analytics_api_options" {
  rest_api_id = aws_api_gateway_rest_api.analytics_api.id
  resource_id = aws_api_gateway_resource.analytics_api.id
  http_method = aws_api_gateway_method_response.analytics_api_options.http_method
  status_code = aws_api_gateway_method_response.analytics_api_options.status_code

  response_parameters = {
    "method.response.header.Access-Control-Allow-Headers" = "'Content-Type,X-Amz-Date,Authorization,X-Api-Key,X-Amz-Security-Token,X-Amz-User-Agent,X-Amzn-Trace-Id'",
    "method.response.header.Access-Control-Allow-Methods" = "'OPTIONS,GET'",
    "method.response.header.Access-Control-Allow-Origin"  = "'${var.cors_origin}'"
  }

  response_templates = {
    "application/json" = ""
  }
}


resource "aws_api_gateway_method_settings" "analytics_api" {
  rest_api_id = aws_api_gateway_rest_api.analytics_api.id
  stage_name  = aws_api_gateway_stage.analytics_api.stage_name
  method_path = "${aws_api_gateway_resource.analytics_api.path}/*/GET"

  settings {
    metrics_enabled = true
    logging_level   = "INFO"
  }
}

resource "aws_api_gateway_request_validator" "analytics_api" {
  name                        = "${var.namespace}-analytics-api-${var.environment}"
  rest_api_id                 = aws_api_gateway_rest_api.analytics_api.id
  validate_request_parameters = true
}

resource "aws_api_gateway_deployment" "analytics_api" {
  rest_api_id = aws_api_gateway_rest_api.analytics_api.id

  triggers = {
    redeployment = sha1(join(",", [
      jsonencode(aws_api_gateway_rest_api.analytics_api),
      jsonencode(aws_api_gateway_resource.analytics_api),
      jsonencode(aws_api_gateway_resource.analytics_api_measure),
      jsonencode(aws_api_gateway_resource.analytics_api_period),
      jsonencode(aws_api_gateway_resource.analytics_api_start),
      jsonencode(aws_api_gateway_method.analytics_api),
      jsonencode(aws_api_gateway_integration.analytics_api),
      jsonencode(aws_api_gateway_method.analytics_api_options),
      jsonencode(aws_api_gateway_integration.analytics_api_options),
      jsonencode(aws_api_gateway_method.analytics_api_options),
      jsonencode(aws_api_gateway_method_response.analytics_api_options),
      jsonencode(aws_api_gateway_integration_response.analytics_api_options)
    ]))
  }

  lifecycle {
    create_before_destroy = true
  }
}

resource "aws_api_gateway_stage" "analytics_api" {
  deployment_id = aws_api_gateway_deployment.analytics_api.id
  rest_api_id   = aws_api_gateway_rest_api.analytics_api.id
  stage_name    = "default"
}

resource "aws_api_gateway_usage_plan" "analytics_api" {
  name = "${var.namespace}-analytics-api-${var.environment}"

  api_stages {
    api_id = aws_api_gateway_rest_api.analytics_api.id
    stage  = aws_api_gateway_stage.analytics_api.stage_name
  }
}

resource "aws_api_gateway_api_key" "analytics_api" {
  name = "${var.namespace}-analytics-api-${var.environment}"
}

resource "aws_api_gateway_usage_plan_key" "analytics_api" {
  key_id        = aws_api_gateway_api_key.analytics_api.id
  key_type      = "API_KEY"
  usage_plan_id = aws_api_gateway_usage_plan.analytics_api.id
}

Whew! That's a lot of code. I could break it down, but that is the bare minimum required to deploy a secure api in aws using terraform.

The lambda API handler

To return results directly from Athena we need to split the code into 3 areas:

start the query
wait for the query to finish
process and return the results

Start the query

const client = new AthenaClient({ region: 'eu-west-2' })

// This is the simple part
const command = new StartQueryExecutionCommand({
  QueryString: 'select * from analytics limit 10',
})
const result = await client.send(command)

The above kicks off the Athena query and is an asynchronous operation. We need to wait for the query to finish before we can return the results. We can check the status of the query using the 'getQueryExecution' command. Here's a function that manages the response from the 'getQueryExecution' command:

const waitForQueryResults = async (id: string) => {
  let response
  let isQueryStillRunning = true
  while (isQueryStillRunning) {
    isQueryStillRunning = false
    const command = new GetQueryExecutionCommand({
      QueryExecutionId: id,
    })
    response = await client.send(command)
    const state = response.QueryExecution?.Status?.State
    if (state === 'QUEUED' || state === 'RUNNING') {
      isQueryStillRunning = true
    }
    if (state === 'FAILED') {
      logger.error({ message: response.QueryExecution?.Status })
      throw Error('query failed')
    }
  }
  logger.info({ message: 'results for query id returned', queryId: id })
}
//invoke like this:
await waitForQueryResults(result.QueryExecutionId)

Process and return the results

Once the query has finished we can return the results. We can use the getQueryResults command to get the results:

const command = new GetQueryResultsCommand({
  QueryExecutionId: id,
})
const queryResults = await client.send(command)

Conclusion

So there we have it! A full pipeline for recording analytics data, storing it cheaply and querying it quickly. We can now use the data to power our dashboards and make decisions based on the data. If you have any questions or comments please get in touch!