访问hdfs
pxf访问hdfs源¶
PXF访问hdfs源支持的数据格式
-
text – 支持写入
-
csv –支持写入
-
text:multi
-
avro
-
parquet – 支持写入
-
json
选项说明
选项 | 说明 |
---|---|
delimiter | 同csv和text |
is_custom_write | 创建可写parquet表时设置为true |
formatter | 创建可写parqeut表时设置为pxfwritable_export |
file_as_row | bool类型, 整个文件作为一行,只有在'text:multi'格式才有用, 同时被加载的表也必须只有一列 |
- 对于text和csv,控制参数和本地文件导入所支持的参数一致,请参考COPY。
以下是访问文件相关说明
数据库中的表test,copy命令的目标表
create table test(a int, b varchar(20));
- 读取和写入text文件
$ hdfs dfs -cat /data/test.txt
100,string1
200,string2
300,string3
连接数据库执行以下语句:
seaboxsql=# create foreign table test_text(a int, b varchar(20))server hdfs_test options(resource '/data/test.txt', format 'text', delimiter ',');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_text;
a | b
-----+---------
100 | string1
200 | string2
300 | string3
(3 rows)
seaboxsql=# copy test from 'fdw://hdfs_test/data/test.txt' (format 'text', delimiter ',');
COPY 3
seaboxsql=# create foreign table txt_write(a int, b varchar(20))server hdfs_test options(resource '/data/write_text/', delimiter ',', format 'text');
CREATE FOREIGN TABLE
seaboxsql=# insert into txt_write values(100, 'string1'),(200, 'string2'),(300, 'string3');
INSERT 0 3
seaboxsql=# select * from txt_write;
a | b
-----+---------
200 | string2
100 | string1
300 | string3
(3 rows)
- 读取和写入CSV文件
$ hdfs dfs -cat /data/test.csv
100,"first
line"
200,"second line"
300,"third line"
seaboxsql=# create foreign table test_csv(a int, b varchar(20))server hdfs_test options(resource '/data/test.csv', format 'csv', delimiter ',');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_csv;
a | b
-----+---------
100 | string1
200 | string2
300 | string3
(3 rows)
seaboxsql=# copy test from 'fdw://hdfs_test/data/test.csv' (format 'csv', delimiter ',');
COPY 3
seaboxsql=# create foreign table csv_write(a int, b varchar(20))server hdfs_test options(resource '/data/csv_text/', format 'csv');
CREATE FOREIGN TABLE
seaboxsql=# insert into csv_write values(100,'string1'),(200,'string
2'),(300, 'string3');
INSERT 0 3
seaboxsql=# select * from csv_write;
a | b
-----+---------
300 | string3
100 | string1
200 | string +
| 2
(3 rows)
-
访问多行文本文件,每个文件当做一条数据
-
format设置为
test:multi
-
设置
file_as_row
为true
$ hdfs dfs -cat /data/test.txt
100,string1
200,string2
300,string3
seaboxsql=# create foreign table test_multi(a text)server hdfs_test options (resource '/data/test.txt', format 'text:multi', file_as_row 'true');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_multi;
a
-------------
100,string1+
200,string2+
300,string3
(1 row)
- 访问avro格式文件
seaboxsql=# create foreign table test_avro(a int, b varchar(20))server hdfs_test options(resource '/data/test.avro', format 'avro');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_avro;
a | b
------+---------
1000 | string1
2000 | string2
3000 | string3
(3 rows)
seaboxsql=# copy test from 'fdw://hdfs_test/data/test.avro' (format 'avro');
COPY 3
-
读取和写入parquet格式文件
-
创建parquet格式的可写表,需要设置参数
is_custom_write
和formatter
。
seaboxsql=# create foreign table test_parquet(a int, b varchar(20))server hdfs_test options(resource '/data/test.parquet', format 'parquet');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_parquet;
a | b
------+---------
1000 | string1
2000 | string2
3000 | string3
(3 rows)
seaboxsql=# copy test from 'fdw://hdfs_test/data/test.parquet' (format 'parquet');
COPY 3
seaboxsql=# create foreign table parquet_write(a int, b varchar(20))server hdfs_test options(resource '/data/parquet_write/', format 'parquet', is_custom_write 'true', formatter 'pxfwritable_export');
CREATE FOREIGN TABLE
seaboxsql=# insert into parquet_write values(100,'string1'),(200,'string2'),(300, 'string3');
INSERT 0 3
seaboxsql=# select * from parquet_write;
a | b
-----+---------
100 | string1
200 | string2
300 | string3
(3 rows)
- 读取json文件
$ hdfs dfs -cat /data/test.json
{"a":"1000","b":"string1","c":{"d":10000,"e":"string10"},"f":null},
{"a":"2000","b":"string2","c":{"d":20000,"e":"string20"},"f":{"g":"dummy","h": [100,200]}}
seaboxsql=# create foreign table test_json(a int, b varchar(20), "c.d" int, "f.h [0]" int) server hdfs_test options(resource '/data/test.json', format 'json');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_json;
a | b | c.d | f.h[0]
------+---------+-------+--------
1000 | string1 | 10000 |
2000 | string2 | 20000 | 100
(2 rows)
seaboxsql=# copy json from 'fdw://hdfs_test/data/test.json' (format 'json');
COPY 2