# athena datasource name: rds1_db1 rds1_db2 # Lambda environment: rds1_db1_connection_string: postgres://jdbc:postgresql://.../db1 rds1_db2_connection_string: postgres://jdbc:postgresql://.../db2
# SAM/CloudFormation excerpt: pass one JDBC connection string per database to the
# Athena Federated Query (JDBC connector) Lambda as environment variables, so a
# single connector serves both rds1_db1 and rds1_db2.
# NOTE(review): collapsed one-line excerpt; "..." marks elided template sections.
Parameters: ... ConnectionString1: Description: 'ConnectionString1' Type: String ConnectionString2: Description: 'ConnectionString2' Type: String ... Resources: JdbcConnectorConfig: Type: AWS::Serverless::Function Properties: Environment: Variables: ... rds1_db1_connection_string: Ref: ConnectionString1 rds1_db2_connection_string: Ref: ConnectionString2
問題:
解決:
-- SerDe column mappings: JSON keys containing ':' (e.g. SES fields such as
-- "ses:configuration-set") are not legal Hive column names, so map each one to
-- a snake_case column here.
WITH SERDEPROPERTIES ( "mapping.ses_configurationset"="ses:configuration-set", "mapping.ses_source_ip"="ses:source-ip", "mapping.ses_from_domain"="ses:from-domain", "mapping.ses_caller_identity"="ses:caller-identity" )
{ "aws_accounts": [ { "id": "123456789012", "name": "myaccount1", "profile": "account1" } ] }
-- External table over the aws_accounts JSON in S3 (see the sample document
-- above): the top-level "aws_accounts" key is modeled as array<struct<...>> so
-- each account object keeps its id/name/profile fields for UNNEST queries.
CREATE EXTERNAL TABLE IF NOT EXISTS aws_account ( aws_accounts array< struct< id: string, name: string, profile: string > > ) COMMENT 'aws_account table for s3://example bucket' ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' LOCATION 's3://athena-bucket/id=aws_accounts/json';
-- Flatten the aws_accounts array: one output row per account struct.
SELECT
    acct.id,
    acct.profile
FROM aws_account
CROSS JOIN UNNEST(aws_accounts) AS t (acct)
記事:
例: AWS billing (CUR)から、月別/サービス別のコストを抽出。サービスを行、月を列に変換。
-- Monthly cost per service from the CUR table, pivoted: one row per billing
-- month, one column per selected product plus an all-services 'Total'.
WITH per_product AS (
    -- Whole-account total per month, labeled 'Total' so it pivots like a product.
    SELECT
        bill_billing_period_start_date AS time,
        'Total' AS product,
        SUM(line_item_blended_cost) AS cost
    FROM cost.monthly
    -- Half-open range [2023-07-01, 2023-10-01): the original closed BETWEEN
    -- ending at '2023-09-30 23:59:59' silently drops any timestamp in the final
    -- second's fractional part; half-open ranges have no such gap.
    WHERE bill_billing_period_start_date >= TIMESTAMP '2023-07-01 00:00:00'
      AND bill_billing_period_start_date < TIMESTAMP '2023-10-01 00:00:00'
    GROUP BY bill_billing_period_start_date

    UNION ALL

    -- Per-service total per month.
    SELECT
        bill_billing_period_start_date AS time,
        line_item_product_code AS product,
        SUM(line_item_blended_cost) AS cost
    FROM cost.monthly
    WHERE bill_billing_period_start_date >= TIMESTAMP '2023-07-01 00:00:00'
      AND bill_billing_period_start_date < TIMESTAMP '2023-10-01 00:00:00'
    GROUP BY bill_billing_period_start_date, line_item_product_code
),
pivoted AS (
    -- Collapse each month's (product -> cost) pairs into a map for pivoting.
    SELECT
        time,
        map_agg(product, cost) AS kv_product
    FROM per_product
    GROUP BY time
)
SELECT
    pivoted.time,
    pivoted.kv_product['Total'] AS Total,
    pivoted.kv_product['AmazonEC2'] AS AmazonEC2,
    pivoted.kv_product['AmazonRDS'] AS AmazonRDS,
    pivoted.kv_product['AmazonS3'] AS AmazonS3
FROM pivoted
ORDER BY pivoted.time
-- Skip rows whose JSON cannot be parsed instead of failing the whole query.
WITH SERDEPROPERTIES ( 'ignore.malformed.json' = 'true')
空白行を返す場合:
記事:
# Register the CREATE TABLE DDL as an Athena named query, then execute it once
# via the AWS CLI from a null_resource (Terraform itself cannot run a named
# query). The nested replace() chain escapes backticks, double quotes and '$'
# for the double-quoted shell argument, then the final replace un-escapes the
# doubly-escaped quotes produced by the earlier passes.
# NOTE(review): null_resource re-runs only when its for_each key changes, so an
# edited query needs `terraform state rm` + apply (see the commands below).
resource "aws_athena_named_query" "create_table1" { name = "[example] select from table1" description = "terraform managed:" workgroup = aws_athena_workgroup.main.id database = aws_athena_database.main.id query = <<EOT CREATE EXTERNAL TABLE IF NOT EXISTS table1 ( ... ); EOT } locals { create_table_query = [ aws_athena_named_query.create_table1.query, ] } resource "null_resource" "exec_create_table_query" { for_each = toset(local.create_table_query) provisioner "local-exec" { command = <<-EOF aws athena start-query-execution \ --work-group "${aws_athena_workgroup.main.id}" \ --query-execution-context Database="${aws_athena_database.main.id}" \ --query-string "${replace(replace(replace(replace(each.value, "`", "\\`"), "\"", "\\\""), "$", "\\$"), "\\\\\"", "\\\"")}" \ --profile ${var.aws_profile} \ --region ${var.aws_region} EOF } }
replace(..., "\\\\\"", "\\\"")
terraform state rm null_resource.<name> terraform apply
記事:
記事:
記事:
regexp_extract(url, '/id/([\w]+)/', 1)
SELECT dataset.name, ...
SELECT json_extract(BLOB, '$.name') AS name
-- Minimal CTE example: name an intermediate result set and select from it.
WITH base AS (
    SELECT *
    FROM a
)
SELECT base.*
FROM base
記事:
-- "$path" is an Athena pseudo-column giving the S3 object each row came from;
-- useful for tracing which file produced a record.
SELECT "$path" FROM db WHERE DATE='2022/08/01'
記事:
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
-- Re-scan the table's S3 location and register any Hive partitions that exist
-- in S3 but are missing from the metastore.
MSCK REPAIR TABLE <db>.<table>;
# Stream the generated monthly CREATE TABLE SQL from S3 ("-" = stdout) and
# check whether it already contains resource_tags columns (grep's exit status).
aws --profile <example> s3 cp s3://<example>/athena/monthly/20231001-20231101/monthly-create-table.sql - | grep resource_tags
workgroupで設定した出力s3 bucketの形式はCSV固定。
任意のs3 bucketや指定形式で出力したい場合。
-- UNLOAD writes results to an arbitrary S3 prefix in the chosen format (here
-- tab-delimited TEXTFILE), bypassing the workgroup's CSV-only result location.
UNLOAD (SELECT * FROM "example"."user") TO 's3://example/user/tsv/' WITH (format = 'TEXTFILE', field_delimiter = '\t')
-- partitioned_by produces Hive-style year=/month=/day= output prefixes; note
-- the partition columns appear last in the SELECT list here, matching the
-- ARRAY order.
UNLOAD (SELECT id, name, YEAR, MONTH, DAY FROM "example"."user") TO 's3://example/user/tsv/' WITH (format = 'TEXTFILE', field_delimiter = '\t', partitioned_by = ARRAY['year','month','day'])
-- Inline literal rows: handy for ad-hoc experiments without a real table.
SELECT
    v.id,
    v.name
FROM (
    VALUES
        (1, 'a'),
        (2, 'b'),
        (3, 'c')
) AS v (id, name)
s3://bucket/prefix/id=<id>/dt=<YYYY>-<MM>-<DD>/file.csv.gz s3://bucket/prefix/id=<id>/dt=<YYYY>-<MM>-<DD>-<HH>/file.csv.gz s3://bucket/prefix/id=<id>/year=<YYYY>/month=<MM>/day=<DD>/file.csv.gz
-- Partition pruning by concatenating string partition columns; zero-padded
-- pieces compare correctly as strings ('2022080100' <= yyyymmddhh <= ...).
-- NOTE(review): assumes year/month/day/hour are zero-padded strings — confirm
-- against the table DDL.
SELECT * FROM t WHERE t.year || t.month || t.day || t.hour BETWEEN '2022080100' AND '2022080200'
-- Same idea with concat(); the window is "7 days ago through yesterday" in JST
-- (AT TIME ZONE 'Asia/Tokyo' shifts now() before formatting).
SELECT * FROM t WHERE concat(t.year, t.month, t.day) BETWEEN date_format(CURRENT_TIMESTAMP AT TIME ZONE 'Asia/Tokyo' - INTERVAL '7' DAY, '%Y%m%d') AND date_format(CURRENT_TIMESTAMP AT TIME ZONE 'Asia/Tokyo' - INTERVAL '1' DAY, '%Y%m%d')
2018-12-24T02:34:48.895006Z
-- Hourly request counts from yesterday's ALB logs, bucketing by substring of
-- the ISO8601 time column (chars 1-13 = 'YYYY-MM-DDTHH').
SELECT substr(TIME,1,13) || ':00:00Z' AS time_hour, COUNT(elb) AS request_count FROM example.alb_logs WHERE DATE = format_datetime(now() - INTERVAL '1' DAY,'YYYY/MM/dd') GROUP BY substr(TIME,1,13) ORDER BY time_hour ASC;
-- Per-minute variant: chars 1-16 = 'YYYY-MM-DDTHH:MM'.
SELECT substr(TIME,1,16) || ':00Z' AS time_min, COUNT(elb) AS request_count FROM example.alb_logs WHERE DATE = format_datetime(now() - INTERVAL '1' DAY,'YYYY/MM/dd') GROUP BY substr(TIME,1,16) ORDER BY time_min ASC;
-- Same hourly bucketing via date_trunc on a parsed timestamp instead of substr.
SELECT date_trunc('hour', from_iso8601_timestamp(TIME)) AS time_hour, COUNT(elb) AS request_count FROM example.alb_logs WHERE DATE = format_datetime(now() - INTERVAL '1' DAY,'YYYY/MM/dd') GROUP BY date_trunc('hour', from_iso8601_timestamp(TIME)) ORDER BY time_hour ASC;
-- Per-minute variant of the date_trunc approach.
SELECT date_trunc('minute', from_iso8601_timestamp(TIME)) AS time_min, COUNT(elb) AS request_count FROM example.alb_logs WHERE DATE = format_datetime(now() - INTERVAL '1' DAY,'YYYY/MM/dd') GROUP BY date_trunc('minute', from_iso8601_timestamp(TIME)) ORDER BY time_min ASC;
-- Parse a syslog-style date with '%b %e %T'; the REPLACE is meant to collapse
-- the double space before a single-digit day ('May  1') to a single space.
-- NOTE(review): both REPLACE arguments render as a single space here — confirm
-- the first argument is actually two spaces in the committed file.
SELECT date_parse(REPLACE('May 1 07:12:47', ' ', ' '), '%b %e %T')
-- Extracting each date/time part from a timestamp literal; expected outputs
-- are shown in the trailing comments.
SELECT now() AS _now, YEAR(TIMESTAMP '2018-12-24 02:34:48 UTC') AS _year, MONTH(TIMESTAMP '2018-12-24 02:34:48 UTC') AS _month, DAY(TIMESTAMP '2018-12-24 02:34:48 UTC') AS _day, week(TIMESTAMP '2018-12-24 02:34:48 UTC') AS _week, HOUR(TIMESTAMP '2018-12-24 02:34:48 UTC') AS _hour, MINUTE(TIMESTAMP '2018-12-24 02:34:48 UTC') AS _minute, SECOND(TIMESTAMP '2018-12-24 02:34:48 UTC') AS _second; -- _now: 2020-08-01 02:03:04.953 UTC -- _year: 2018 -- _month: 12 -- _day: 24 -- _week: 52 -- _hour: 2 -- _minute: 34 -- _second: 48
-- Session time zone is UTC; AT TIME ZONE converts an instant for display
-- without changing the underlying point in time ("結果" = "result").
SELECT current_timezone(); -- 結果: UTC SELECT now() AT TIME ZONE 'Asia/Tokyo'; -- 結果: 2020-07-31 12:56:13.373 Asia/Tokyo SELECT TIMESTAMP '2018-12-24 02:34:48 UTC' AT TIME ZONE 'Asia/Tokyo'; -- 結果: 2018-12-24 11:34:48.000 Asia/Tokyo
-- DATE literals must be ISO 'YYYY-MM-DD'; anything else needs date_parse.
SELECT DATE '2018-12-24' AS d; -- result: 2018-12-24

-- A slash-separated string is not a valid DATE literal (note: the original
-- error comment showed '2018-12-24'; the offending value is '2018/12/24'):
SELECT DATE '2018/12/24' AS d; -- result: INVALID_CAST_ARGUMENT: Value cannot be cast to date: 2018/12/24

-- Parse non-ISO strings with date_parse; the format must match the input.
-- (Bug fix: the original passed '2018-12-24' against '%Y/%m/%d', which fails.)
SELECT date_parse('2018/12/24', '%Y/%m/%d') AS ts; -- result: 2018-12-24 00:00:00.000

-- ISO 8601 has dedicated functions (from_iso8601_date() also exists):
SELECT from_iso8601_timestamp('2018-12-24T02:34:48Z') AS ts; -- result: 2018-12-24 02:34:48.000 UTC
-- format_datetime uses Joda-style patterns ('YYYY/MM/dd'), unlike date_parse's
-- MySQL-style %-patterns. Labels: 月だけ取得 = "month only", 今日 = "today",
-- 1ヶ月前 = "one month ago"; 結果 = "result".
-- 月だけ取得 SELECT format_datetime(now(), 'MM') -- 結果: 07 -- 今日 SELECT format_datetime(now(),'YYYY/MM/dd') AS DATE; -- 結果: 2020/07/31 -- 1ヶ月前 SELECT format_datetime(now() - INTERVAL '1' MONTH, 'YYYY/MM/dd') AS DATE; -- 結果: 2020/06/30
-- Reformat an access-log style timestamp ('06/Feb/2019:00:00:38 +0000') into
-- 'YYYY-MM-dd HH:mm:ss' by parsing then re-formatting.
SELECT format_datetime(date_parse('06/Feb/2019:00:00:38 +0000', '%d/%b/%Y:%H:%i:%s +0000'),'YYYY-MM-dd HH:mm:ss') -- 結果: 2019-02-06 00:00:38
-- date_trunc zeroes everything below the given unit (start of month / minute /
-- hour). 月初を取得 = "get the start of the month".
-- 月初を取得 SELECT date_trunc('month', now()); -- 結果: 2020-07-01 00:00:00.000 UTC SELECT date_trunc('minute', now()); -- 結果: 2020-09-07 04:17:00.000 UTC SELECT date_trunc('hour', now()); -- 結果: 2020-09-07 04:00:00.000 UTC
# Relative partition-projection ranges anchored at NOW (UTC).
# Labels: 3ヶ月前〜 = "from 3 months ago", 3年前〜 = "from 3 years ago".
# 3ヶ月前〜 "projection.date.range" = "NOW-3MONTH,NOW", # 3年前〜 "projection.date.range" = "NOW-3YEARS,NOW",
# NOW+9HOURS extends the upper bound so that JST "today" (UTC+9) is still
# covered even while NOW (UTC) is behind it.
"projection.date.range" = "NOW-3MONTH,NOW+9HOURS", "projection.date.format" = "yyyy/MM/dd/HH",
記事:
HIVE形式で「s3://bucket/year=${year}/month=${month}/day=${day}」とパーティションを分けているとする。
2019〜YYYYまでをパーティションとして認識させたい。
-- Partition projection for Hive-style year=/month=/day= prefixes, with the
-- year projected as a 'date' type. This is the variant that fails with the
-- "Unable to obtain YearMonth" error shown below; kept as-is for reference.
TBLPROPERTIES (
    ...
    'projection.enabled' = 'true',
    'projection.year.type' = 'date',
    'projection.year.range' = '2019,NOW',
    'projection.year.format' = 'yyyy',
    'projection.year.interval' = '1',
    'projection.year.interval.unit' = 'YEARS',
    'projection.month.type' = 'integer',
    'projection.month.range' = '1,12',
    'projection.month.interval' = '1',
    'projection.month.digits' = '2',
    'projection.day.type' = 'integer',
    'projection.day.range' = '1,31',
    'projection.day.interval' = '1',
    'projection.day.digits' = '2',
    'storage.location.template' = 's3://bucket/year=${year}/month=${month}/day=${day}'
);
GENERIC_INTERNAL_ERROR: Text '2019' could not be parsed: Unable to obtain YearMonth from TemporalAccessor: {Year=2020},ISO of type java.time.format.Parsed
-- Fix for the error above: project the year as an integer range instead of a
-- 'date' type, since a bare 'yyyy' date format cannot be resolved to a full
-- date by the projection engine.
TBLPROPERTIES ( ... 'projection.year.type'='integer', 'projection.year.range'='2019,2030', 'projection.year.interval'='1', 'projection.year.digits'='4', ...
-- Select yesterday's partition by computing year/month/day from now() - 1 day.
-- NOTE(review): now() is in UTC here (see current_timezone() above), so the
-- JST notion of "yesterday" may differ by up to 9 hours — confirm intent.
SELECT * FROM <table> WHERE YEAR=YEAR(now() - INTERVAL '1' DAY) AND MONTH=MONTH(now() - INTERVAL '1' DAY) AND DAY=DAY(now() - INTERVAL '1' DAY)
-- Alternative: a single 'date'-typed projection whose format spans the whole
-- year=/month=/day= path, instead of three separately projected columns.
TBLPROPERTIES ( ... "projection.date.type" = "date", "projection.date.range" = "2020/07/01,NOW", "projection.date.format" = "year=yyyy/month=MM/day=dd", "projection.date.interval" = "1" , "projection.date.interval.unit" = "DAYS",
実行順序:
SQL実行順序:
# Build the CREATE TABLE statement for the DNS log table inside an (unquoted)
# heredoc, so ${S3_BUCKET}/${prefix} are expanded by the shell.
# NOTE(review): quoteChar/escapeChar pass through two escaping layers (heredoc,
# then the SQL string) — confirm OpenCSVSerde ends up with '"' and '\'.
local sql=$(cat << EOD CREATE EXTERNAL TABLE IF NOT EXISTS dns ( Date string, QueryType string, Client string, SendReceive string, Protocol string, RecordType string, Query string, Results string ) PARTITIONED BY ( year int, month int, day int ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES ( 'separatorChar' = ',', 'quoteChar' = '\"', 'escapeChar' = '\\\\' ) STORED AS TEXTFILE LOCATION 's3://${S3_BUCKET}/${prefix}id=DNS/' TBLPROPERTIES ( 'skip.header.line.count'='1', 'has_encrypted_data'='false' ) ; EOD )
s3://bucket/prefix/id=<id>/dt=<YYYY>-<MM>-<DD>/file.csv.gz s3://bucket/prefix/id=<id>/dt=<YYYY>-<MM>-<DD>-<HH>/file.csv.gz s3://bucket/prefix/id=<id>/year=<YYYY>/month=<MM>/day=<DD>/file.csv.gz
Athenaはスキャン1TBあたり$5かかる。スキャンデータを減らす工夫をしないと、コストが非常に増え、クエリも遅くなる。
s3://bucket/prefix/id=<id>/year=<YYYY>/month=<MM>/day=<DD>/file.csv.gz
-- Load any Hive partitions present in S3 but missing from the metastore.
MSCK REPAIR TABLE TABLE_NAME;