2023.06.02 (Fri) ํ์ต์ ๋ฆฌ
#Hadoop #Hive #AWS #HQL
1. Hive ๋ฅผ ์ด์ฉํ ๋ฐ์ดํฐ ์ฒ๋ฆฌ
1-1 .ํ์ฉ ๋ฐ์ดํฐ
์์ธ์ ์ด๋ฆฐ๋ฐ์ดํฐ ๊ด์ฅ์ '์์ธ์ ๋ถ๋์ฐ ์ค๊ฑฐ๋๊ฐ ์ ๋ณด' ๋ฐ์ดํฐ๋ฅผ ํ์ฉํ์ฌ ์์ด๊ตฌ ๋ด ์ค๊ฑฐ๋๊ฐ ์์ฝ ์ ๋ณด, ๊ฑด๋ฌผ๋ณ ์ ๋ณด๋ฅผ ์ถ์ถํ๋ pipeline์ ์์ฑ
1-2. pipeline
- aws s3์์ ๋ฐ์ดํฐ๋ฅผ local์ ์ ์ฅ
- hive์ ํด๋น ๋ฐ์ดํฐ๊ฐ ์กด์ฌํ๋ฉด skip, ์กด์ฌํ์ง ์์ผ๋ฉด ์ ์ก
- ํด๋น ๋ฐ์ดํฐ๋ฅผ seoul table๋ก ์์ฑ
- seoul table์์ ์์ด๊ตฌ ์ ๋ณด ์ถ์ถ ํ ์กฐํ

1-3. hql ๋ฌธ
๐ seoul.hql > ์์ธ์ ๋ถ๋์ฐ ์ค๊ฑฐ๋์ ์ ๋ณด ํ ์ด๋ธ ์์ฑ
RAW ๋ฐ์ดํฐ์์ ์ฐ๋, ์์น๊ตฌ์ฝ๋, ์์น๊ตฌ๋ช , ๋ฒ์ ๋์ฝ๋, ๋ฒ์ ๋๋ช , ๊ฑด๋ฌผ๋ช , ๊ณ์ฝ์ผ, ๋ฌผ๊ฑด๊ธ์ก(๋ง์), ๊ฑด๋ฌผ์ฉ๋, ์ฃผ์. ํ์๋ง ์ถ์ถ ๋ฐ ๊ธฐํ ์ ์ฒ๋ฆฌ ํ ํ์ฉ
CREATE EXTERNAL TABLE IF NOT EXISTS tb_seoul(
idx INT,
year INT,
guCode INT,
guNm STRING,
dongCode INT,
dongNm STRING,
houseNm STRING,
contractDate DATE,
amount INT,
houseType STRING,
address STRING,
area INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/sub/hive/seoul/' --ํด๋น ๊ฒฝ๋ก์ ์๋ ํ์ผ ๊ฐ์ ธ์ค๊ธฐ
tblproperties ("skip.header.line.count"="1");
๐ seocho_desc.hql > ์์ด๊ตฌ ์์ฝ ์ ๋ณด ํ ์ด๋ธ ์์ฑ : 2023๋ ๋ ์์ด๊ตฌ ๋ด ๊ฑด๋ฌผ์ ํ๋ณ ์ค๊ฑฐ๋๊ฐ ๋ฐ ๋ฉด์ ํ๊ท
CREATE TABLE IF NOT EXISTS seocho_describe_2023 AS
SELECT
guNm,
dongNm,
houseType,
ROUND(AVG(amount),2) amount,
ROUND(AVG(area),2) area
FROM tb_seoul
WHERE guNm = '์์ด๊ตฌ' AND year = 2023
GROUP BY guNm, dongNm, houseType
ORDER BY guNm, dongNm, houseType;
๐ seocho_desc.hql > ์์ด๊ตฌ ๊ฑด๋ฌผ์ ํ๋ณ ์์ธ ํ ์ด๋ธ ์์ฑ : 2023๋ ๋ ์์ด๊ตฌ ๊ฑด๋ฌผ์ ํ ๋ด ๊ฑด๋ฌผ๋ณ ์ค๊ฑฐ๋๊ฐ ๋ฐ ๋ฉด์ ์ ๋ณด
-- ์์ด๊ตฌ ์ํํธ ์ ๋ณด
CREATE TABLE IF NOT EXISTS seocho_apt_2023 AS
SELECT
guNm,
dongNm,
houseType,
houseNm,
address,
AVG(amount) avg_amount,
MAX(amount) max_amount,
MIN(amount) min_amount,
AVG(area) avg_area
FROM tb_seoul
WHERE year = 2023 and guNm = '์์ด๊ตฌ' and houseType = '์ํํธ'
GROUP BY guNm, dongNm,houseType, address, houseNm
ORDER BY avg_amount desc;
-- ์์ด๊ตฌ ์คํผ์คํ
์ ๋ณด
CREATE TABLE IF NOT EXISTS seocho_effapt_2023 AS
SELECT
guNm,
dongNm,
houseType,
houseNm,
address,
AVG(amount) avg_amount,
MAX(amount) max_amount,
MIN(amount) min_amount,
AVG(area) avg_area
FROM tb_seoul
WHERE guNm = '์์ด๊ตฌ' AND year = 2023 AND houseType = '์คํผ์คํ
'
GROUP BY guNm, dongNm,houseType, address, houseNm
ORDER BY avg_amount desc;
-- ์์ด๊ตฌ ์ฐ๋ฆฝ๋ค์ธ๋ ์ ๋ณด
CREATE TABLE IF NOT EXISTS seocho_mul_2023 AS
SELECT
guNm,
dongNm,
houseType,
houseNm,
address,
AVG(amount) avg_amount,
MAX(amount) max_amount,
MIN(amount) min_amount,
AVG(area) avg_area
FROM tb_seoul
WHERE guNm = '์์ด๊ตฌ' AND year = 2023 AND houseType = '์ฐ๋ฆฝ๋ค์ธ๋'
GROUP BY guNm, dongNm,houseType, address, houseNm
ORDER BY houseType, avg_amount desc;
-- ์์ด๊ตฌ ๋จ๋
๋ค๊ฐ๊ตฌ ์ ๋ณด
CREATE TABLE IF NOT EXISTS seocho_det_2023 AS
SELECT
guNm,
dongNm,
houseType,
houseNm,
address,
AVG(amount) avg_amount,
MAX(amount) max_amount,
MIN(amount) min_amount,
AVG(area) avg_area
FROM tb_seoul
WHERE guNm = '์์ด๊ตฌ' AND year = 2023 AND houseType = '๋จ๋
๋ค๊ฐ๊ตฌ'
GROUP BY guNm, dongNm,houseType, address, houseNm
ORDER BY houseType, avg_amount desc;
๐ show_seocho_tb.hql > ์ ์ฒด ํ ์ด๋ธ ๋ด 10๊ฑด ์ถ๋ ฅ
SELECT * FROM seocho_describe_2023 limit 10;
SELECT * FROM seocho_apt_2023 limit 10;
SELECT * FROM seocho_effapt_2023 limit 10;
SELECT * FROM seocho_mul_2023 limit 10;
SELECT * FROM seocho_det_2023 limit 10;
1-4. ์ถ๋ ฅ ๊ฒฐ๊ณผ

2. ๋ณด์์
- ์์น๊ตฌ ์ ๋ณด๋ฅผ input ๊ฐ์ผ๋ก ๋ฐ์์ ํด๋น ๊ตฌ์ ์ ๋ณด๋ฅผ ํ
์ด๋ธ๋ก ๋ง๋ค๊ณ ์ถ๋ ฅํ ์ ์๋?
- airflow UI์์ input ๋ฐ๊ณ dag๋ก ์ ๋ฌ → input ๊ฐ์ hql๋ก ์ ๋ฌ
- local์ ๊ฑฐ์น์ง ์๊ณ aws s3 → hadoop ์ผ๋ก ๋ฐ๋ก ๋ณด๋ด๊ธฐ (์๋ํ์ง๋ง ๊ณ์ ์ค๋ฅ ๋ฐ์ ๐ฅ)
'๐ Data > Engineering' ์นดํ ๊ณ ๋ฆฌ์ ๋ค๋ฅธ ๊ธ
| [Sqoop] Sqoop ์ค์น ๋ฐ ๊ฐ์ (0) | 2023.06.08 |
|---|---|
| [Spark] Spark, Zeppelin Notebook ์ฌ์ฉํ๊ธฐ (0) | 2023.06.05 |
| [Hive] Hive ์ค์น ๋ฐ ๊ฐ์ (0) | 2023.05.31 |
| [Hadoop] Hadoop ์ค์น ๋ฐ ๊ฐ์ (0) | 2023.05.30 |
| [Airflow] Airflow Standalone ์ค์น ๋ฐ ํ ์คํธ (0) | 2023.05.26 |