跳转至

访问hdfs

pxf访问hdfs源

PXF访问hdfs源支持的数据格式

  • text – 支持写入

  • csv –支持写入

  • text:multi

  • avro

  • parquet – 支持写入

  • json

选项说明

选项 说明
delimiter 同csv和text
is_custom_write 创建可写parquet表时设置为true
formatter 创建可写parqeut表时设置为pxfwritable_export
file_as_row bool类型, 整个文件作为一行,只有在'text:multi'格式才有用, 同时被加载的表也必须只有一列
  • 对于text和csv,控制参数和本地文件导入所支持的参数一致,请参考COPY。

以下是访问文件相关说明

数据库中的表test,copy命令的目标表

create table test(a int, b varchar(20));
  1. 读取和写入text文件
$ hdfs dfs -cat /data/test.txt
100,string1
200,string2
300,string3

连接数据库执行以下语句:

seaboxsql=# create foreign table test_text(a int, b varchar(20))server hdfs_test    options(resource '/data/test.txt', format 'text', delimiter ',');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_text;
  a  |    b    
-----+---------
 100 | string1
 200 | string2
 300 | string3
(3 rows)
seaboxsql=# copy test from 'fdw://hdfs_test/data/test.txt' (format 'text',    delimiter ',');
COPY 3
seaboxsql=# create foreign table txt_write(a int, b varchar(20))server hdfs_test    options(resource '/data/write_text/', delimiter ',', format 'text');
CREATE FOREIGN TABLE
seaboxsql=# insert into txt_write values(100, 'string1'),(200, 'string2'),(300,    'string3');
INSERT 0 3
seaboxsql=# select * from txt_write;
  a  |    b    
-----+---------
 200 | string2
 100 | string1
 300 | string3
(3 rows)
  1. 读取和写入CSV文件
$ hdfs dfs -cat /data/test.csv
100,"first 
line"
200,"second line"
300,"third line"
seaboxsql=# create foreign table test_csv(a int, b varchar(20))server hdfs_test    options(resource '/data/test.csv', format 'csv', delimiter ',');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_csv;
  a  |    b    
-----+---------
 100 | string1
 200 | string2
 300 | string3
(3 rows)
seaboxsql=# copy test from 'fdw://hdfs_test/data/test.csv' (format 'csv',    delimiter ',');
COPY 3
seaboxsql=# create foreign table csv_write(a int, b varchar(20))server hdfs_test    options(resource '/data/csv_text/', format 'csv');
CREATE FOREIGN TABLE
seaboxsql=# insert into csv_write values(100,'string1'),(200,'string
2'),(300, 'string3');
INSERT 0 3
seaboxsql=# select * from csv_write;
  a  |    b    
-----+---------
 300 | string3
 100 | string1
 200 | string +
     | 2
(3 rows)
  1. 访问多行文本文件,每个文件当做一条数据

  2. format设置为test:multi

  3. 设置file_as_rowtrue

$ hdfs dfs -cat /data/test.txt
100,string1
200,string2
300,string3
seaboxsql=# create foreign table test_multi(a text)server hdfs_test options   (resource '/data/test.txt', format 'text:multi', file_as_row 'true');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_multi;
      a      
-------------
 100,string1+
 200,string2+
 300,string3
(1 row)
  1. 访问avro格式文件
seaboxsql=# create foreign table test_avro(a int, b varchar(20))server hdfs_test    options(resource '/data/test.avro', format 'avro');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_avro;
  a   |    b    
------+---------
 1000 | string1
 2000 | string2
 3000 | string3
(3 rows)
seaboxsql=# copy test from 'fdw://hdfs_test/data/test.avro' (format 'avro');
COPY 3
  1. 读取和写入parquet格式文件

  2. 创建parquet格式的可写表,需要设置参数is_custom_writeformatter

seaboxsql=# create foreign table test_parquet(a int, b varchar(20))server    hdfs_test options(resource '/data/test.parquet', format 'parquet');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_parquet;
  a   |    b    
------+---------
 1000 | string1
 2000 | string2
 3000 | string3
(3 rows)
seaboxsql=# copy test from 'fdw://hdfs_test/data/test.parquet' (format 'parquet');
COPY 3
seaboxsql=# create foreign table parquet_write(a int, b varchar(20))server    hdfs_test options(resource '/data/parquet_write/', format 'parquet',    is_custom_write 'true', formatter 'pxfwritable_export');
CREATE FOREIGN TABLE
seaboxsql=# insert into parquet_write values(100,'string1'),(200,'string2'),(300,    'string3');
INSERT 0 3
seaboxsql=# select * from parquet_write;
  a  |    b    
-----+---------
 100 | string1
 200 | string2
 300 | string3
(3 rows)
  1. 读取json文件

$ hdfs dfs -cat /data/test.json
{"a":"1000","b":"string1","c":{"d":10000,"e":"string10"},"f":null},
{"a":"2000","b":"string2","c":{"d":20000,"e":"string20"},"f":{"g":"dummy","h":   [100,200]}}
seaboxsql=# create foreign table  test_json(a int, b varchar(20), "c.d" int, "f.h   [0]" int) server hdfs_test options(resource '/data/test.json', format 'json');
CREATE FOREIGN TABLE
seaboxsql=# select * from test_json;
  a   |    b    |  c.d  | f.h[0] 
------+---------+-------+--------
 1000 | string1 | 10000 |       
 2000 | string2 | 20000 |    100
(2 rows)
seaboxsql=# copy json from 'fdw://hdfs_test/data/test.json' (format 'json');
COPY 2