Monday, June 19, 2017

Hive-Json Procession[semi structured data]

A json is a



{
  "name":"manohar",
  "gender":"male",
  "color":"white",
  "age":34
}

[cloudera@quickstart ~]$ cat >  json1
{"name":"manohar","gender":"male","age":"34"}
{"name":"vidhatri","gender":"female","age":"1","city":"nellore"}
^C

hive> create database jsondatabase;
hive> use jsondatabase;
hive> create table person(line string);
hive> load data local inpath 'json1' into table person;
hive> select *from person;

hive:get_json_object
--------------------
This method is used to select a property/node from a json file/record in a hive table.

It takes 2  parameters
i)colname
ii)propertyname

syntax
------
get_json_object(colname,'propertyname');

$-->It is rootnode of json record.

hive> select get_json_object(line,'$.name') from person;

hive> select
    > get_json_object(line,'$.name'),
    > get_json_object(line,'$.age'),
    > get_json_object(line,'$.gender'),
    > get_json_object(line,'$.city')
    > from person;

converting semistructured to structured in hive
-----------------------------------------------
we can split the properties of a json into cols of a hive table,which makes json strcuctured.

hive> create table personal_details(name string,age int,gender string,city string);

hive> insert overwrite table personal_details
    > select
    > get_json_object(line,'$.name')
    > ,get_json_object(line,'$.age'),
    > get_json_object(line,'$.gender'),
    > get_json_object(line,'$.city')
    > from person;


hive:json_tuple
---------------
A json_tuple is used to split raw data into cols/properties.

hive> create table personal_details(name string,gender string,age int,city string);

hive>insert ....
   >select alisname.* from tablename
    lateral view
    json_tuple(colname,'prop1,'prop2',.....)
    aliasname as name1,name2,.....;
   

hive> insert overwrite table personal_details
    > select x.* from json_rawtable
    > lateral view json_tuple
    > (line,'name','gender','age','city')
    > x as n,g,a,c;


Json with child node
--------------------
[cloudera@quickstart ~]$ cat >  json2
{"name":"manohar","age":"34","address":{"state":"india","city":"hyd"},"gender":"male"}


hive> create table person_info(line string);
hive> load data local inpath 'json2' into table person_info;
hive> select *from person_info;

Loading data col wise
---------------------
hive> create table person_info_table(name string,age int,address string,gender string);

hive> insert overwrite table person_info_table
    > select x.* from person_info
    > lateral view
    > json_tuple(line,'name','age','address','gender')x
    > as n,a,addr,g;
hive> select *from person_info_table;

Loading child node by spitting col wise
---------------------------------------
hive> create table per_info(name string,age int,state string,city string,gender string);

hive> insert overwrite table json_strdata
    > select
    > get_json_object(line,'$.name'),
    > get_json_object(line,'$.age'),
    > get_json_object(line,'$.address.state'),
    > get_json_object(line,'$.address.city'),
    > get_json_object(line,'$.gender')
    > from json_rawtable;


hive> insert overwrite table per_info
    > select name,age,get_json_object(address,'$.state'),
    > get_json_object(address,'$.city'),
    > gender from person_info_table;

No comments: