Kettle (第一章 表对表数据抽取)
2023-12-13 17:37:01
前言
最近在研究学习kettle的api,通过java程序调用kettle的api,实现表对表的数据抽取
一、流程图
二、代码
1.表输入
代码如下(示例):
/**
* 表输入
* @param mySQLDatabaseMeta
* @return
*/
/**
 * Builds the "table input" step, which reads rows from the Student table.
 *
 * @param databaseMeta the source database connection metadata
 * @return the configured {@link StepMeta} wrapping a {@link TableInputMeta}
 */
private static StepMeta tableInputMeta(DatabaseMeta databaseMeta) {
    TableInputMeta inputMeta = new TableInputMeta();
    // Bind the step to the source connection and define the extraction query.
    inputMeta.setDatabaseMeta(databaseMeta);
    inputMeta.setSQL("select StuNo, StuName, StuAge, StuClass from Student");
    // Look up the registered plugin id for this step type; Kettle dispatches
    // work to the matching plugin by this id (strategy-pattern style).
    PluginRegistry registry = PluginRegistry.getInstance();
    String pluginId = registry.getPluginId(StepPluginType.class, inputMeta);
    return new StepMeta(pluginId, "表输入", inputMeta);
}
2.表输出
代码如下(示例):
/**
* 表输出
* @param mySQLDatabaseMeta
* @return
*/
/**
 * Builds the "table output" step, which writes rows into the Student_new table.
 *
 * @param databaseMeta the target database connection metadata
 * @return the configured {@link StepMeta} wrapping a {@link TableOutputMeta}
 */
private static StepMeta tableOutputMeta(DatabaseMeta databaseMeta) {
    // The target columns and the incoming stream fields share the same names,
    // mapped positionally one-to-one.
    String[] columns = {"StuNo", "StuName", "StuAge", "StuClass"};
    TableOutputMeta outputMeta = new TableOutputMeta();
    outputMeta.setDatabaseMeta(databaseMeta);
    outputMeta.setTableName("Student_new");
    // Empty the target table before loading.
    outputMeta.setTruncateTable(true);
    // Explicitly map stream fields onto database columns (instead of by order).
    outputMeta.setSpecifyFields(true);
    outputMeta.setFieldDatabase(columns);
    outputMeta.setFieldStream(columns.clone());
    String pluginId = PluginRegistry.getInstance().getPluginId(StepPluginType.class, outputMeta);
    return new StepMeta(pluginId, "表输出", outputMeta);
}
三、完整代码
代码如下(示例):
/**
 * Entry point: builds a transformation with a table-input step feeding a
 * table-output step, runs it, and then serializes it to a .ktr file.
 *
 * @throws KettleException if Kettle initialization or the transformation fails
 * @throws IOException     if writing the .ktr file fails
 */
public static void main(String[] args) throws KettleException, IOException {
    // Initialize the Kettle environment (loads step plugins, etc.).
    KettleEnvironment.init();
    // Build the database connection metadata.
    DatabaseMeta mySQLDatabaseMeta = dbconnect();
    // Table-input step: render it on the canvas at (100, 100).
    StepMeta tableInputStepMeta = tableInputMeta(mySQLDatabaseMeta);
    tableInputStepMeta.setDraw(true);
    tableInputStepMeta.setLocation(new Point(100, 100));
    // Table-output step: render it on the canvas at (200, 100).
    StepMeta tableOutputStepMeta = tableOutputMeta(mySQLDatabaseMeta);
    tableOutputStepMeta.setDraw(true);
    tableOutputStepMeta.setLocation(new Point(200, 100));
    // Assemble the transformation: name, data source, steps, and the hop
    // connecting input to output.
    TransMeta transMeta = new TransMeta();
    transMeta.setName("trans");
    transMeta.addDatabase(mySQLDatabaseMeta);
    transMeta.addStep(tableInputStepMeta);
    transMeta.addStep(tableOutputStepMeta);
    transMeta.addTransHop(new TransHopMeta(tableInputStepMeta, tableOutputStepMeta));
    // Execute the transformation and block until it finishes.
    Trans trans = new Trans(transMeta);
    trans.setLogLevel(LogLevel.BASIC);
    trans.execute(null);
    trans.waitUntilFinished();
    // FIX: surface step errors instead of silently ignoring them.
    if (trans.getErrors() > 0) {
        throw new KettleException("Transformation finished with " + trans.getErrors() + " error(s)");
    }
    // Serialize the transformation to a .ktr file.
    // FIX: try-with-resources guarantees the writer is closed (and buffered
    // content flushed) even if write() throws; the original leaked it on error.
    String xml = transMeta.getXML();
    String filePath = "F:\\str.ktr";
    try (BufferedWriter bw = new BufferedWriter(new FileWriter(filePath))) {
        bw.write(xml);
    }
}
/**
* 数据库连接
* @return
*/
/**
 * Builds the connection metadata for the source/target database.
 *
 * @return a {@link DatabaseMeta} configured for a SQL Server (MSSQL) instance
 */
private static DatabaseMeta dbconnect() {
    // Placeholder credentials — replace with real values before running.
    String host = "xxx.xxx.xxx.xxx";
    String port = "xxx";
    String db = "xxx";
    String user = "xxx";
    String password = "xxx";
    // FIX: renamed local from "mySQLDatabaseMeta" — this connection is
    // configured as MSSQL (SQL Server), so the old name was misleading.
    DatabaseMeta databaseMeta = new DatabaseMeta();
    // Logical connection name, referenced by the steps and the generated .ktr.
    databaseMeta.setName("sqlserver");
    databaseMeta.setDatabaseType("MSSQL");
    databaseMeta.setHostname(host);
    databaseMeta.setDBPort(port);
    databaseMeta.setDBName(db);
    databaseMeta.setUsername(user);
    databaseMeta.setPassword(password);
    return databaseMeta;
}
/**
* 表输入
* @param mySQLDatabaseMeta
* @return
*/
/**
 * Builds the "table input" step, which reads rows from the Student table.
 *
 * @param databaseMeta the source database connection metadata
 * @return the configured {@link StepMeta} wrapping a {@link TableInputMeta}
 */
private static StepMeta tableInputMeta(DatabaseMeta databaseMeta) {
    TableInputMeta inputMeta = new TableInputMeta();
    // Bind the step to the source connection and define the extraction query.
    inputMeta.setDatabaseMeta(databaseMeta);
    inputMeta.setSQL("select StuNo, StuName, StuAge, StuClass from Student");
    // Look up the registered plugin id for this step type; Kettle dispatches
    // work to the matching plugin by this id (strategy-pattern style).
    PluginRegistry registry = PluginRegistry.getInstance();
    String pluginId = registry.getPluginId(StepPluginType.class, inputMeta);
    return new StepMeta(pluginId, "表输入", inputMeta);
}
/**
* 表输出
* @param mySQLDatabaseMeta
* @return
*/
/**
 * Builds the "table output" step, which writes rows into the Student_new table.
 *
 * @param databaseMeta the target database connection metadata
 * @return the configured {@link StepMeta} wrapping a {@link TableOutputMeta}
 */
private static StepMeta tableOutputMeta(DatabaseMeta databaseMeta) {
    // The target columns and the incoming stream fields share the same names,
    // mapped positionally one-to-one.
    String[] columns = {"StuNo", "StuName", "StuAge", "StuClass"};
    TableOutputMeta outputMeta = new TableOutputMeta();
    outputMeta.setDatabaseMeta(databaseMeta);
    outputMeta.setTableName("Student_new");
    // Empty the target table before loading.
    outputMeta.setTruncateTable(true);
    // Explicitly map stream fields onto database columns (instead of by order).
    outputMeta.setSpecifyFields(true);
    outputMeta.setFieldDatabase(columns);
    outputMeta.setFieldStream(columns.clone());
    String pluginId = PluginRegistry.getInstance().getPluginId(StepPluginType.class, outputMeta);
    return new StepMeta(pluginId, "表输出", outputMeta);
}
ktr文件
<transformation>
<info>
<name>trans</name>
<description/>
<extended_description/>
<trans_version/>
<trans_type>Normal</trans_type>
<directory>/</directory>
<parameters>
</parameters>
<log>
<trans-log-table>
<connection/>
<schema/>
<table/>
<size_limit_lines/>
<interval/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STATUS</id>
<enabled>Y</enabled>
<name>STATUS</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
<subject/>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
<subject/>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
<subject/>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
<subject/>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
<subject/>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
<subject/>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>STARTDATE</id>
<enabled>Y</enabled>
<name>STARTDATE</name>
</field>
<field>
<id>ENDDATE</id>
<enabled>Y</enabled>
<name>ENDDATE</name>
</field>
<field>
<id>LOGDATE</id>
<enabled>Y</enabled>
<name>LOGDATE</name>
</field>
<field>
<id>DEPDATE</id>
<enabled>Y</enabled>
<name>DEPDATE</name>
</field>
<field>
<id>REPLAYDATE</id>
<enabled>Y</enabled>
<name>REPLAYDATE</name>
</field>
<field>
<id>LOG_FIELD</id>
<enabled>Y</enabled>
<name>LOG_FIELD</name>
</field>
<field>
<id>EXECUTING_SERVER</id>
<enabled>N</enabled>
<name>EXECUTING_SERVER</name>
</field>
<field>
<id>EXECUTING_USER</id>
<enabled>N</enabled>
<name>EXECUTING_USER</name>
</field>
<field>
<id>CLIENT</id>
<enabled>N</enabled>
<name>CLIENT</name>
</field>
</trans-log-table>
<perf-log-table>
<connection/>
<schema/>
<table/>
<interval/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>SEQ_NR</id>
<enabled>Y</enabled>
<name>SEQ_NR</name>
</field>
<field>
<id>LOGDATE</id>
<enabled>Y</enabled>
<name>LOGDATE</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STEPNAME</id>
<enabled>Y</enabled>
<name>STEPNAME</name>
</field>
<field>
<id>STEP_COPY</id>
<enabled>Y</enabled>
<name>STEP_COPY</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>INPUT_BUFFER_ROWS</id>
<enabled>Y</enabled>
<name>INPUT_BUFFER_ROWS</name>
</field>
<field>
<id>OUTPUT_BUFFER_ROWS</id>
<enabled>Y</enabled>
<name>OUTPUT_BUFFER_ROWS</name>
</field>
</perf-log-table>
<channel-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>LOGGING_OBJECT_TYPE</id>
<enabled>Y</enabled>
<name>LOGGING_OBJECT_TYPE</name>
</field>
<field>
<id>OBJECT_NAME</id>
<enabled>Y</enabled>
<name>OBJECT_NAME</name>
</field>
<field>
<id>OBJECT_COPY</id>
<enabled>Y</enabled>
<name>OBJECT_COPY</name>
</field>
<field>
<id>REPOSITORY_DIRECTORY</id>
<enabled>Y</enabled>
<name>REPOSITORY_DIRECTORY</name>
</field>
<field>
<id>FILENAME</id>
<enabled>Y</enabled>
<name>FILENAME</name>
</field>
<field>
<id>OBJECT_ID</id>
<enabled>Y</enabled>
<name>OBJECT_ID</name>
</field>
<field>
<id>OBJECT_REVISION</id>
<enabled>Y</enabled>
<name>OBJECT_REVISION</name>
</field>
<field>
<id>PARENT_CHANNEL_ID</id>
<enabled>Y</enabled>
<name>PARENT_CHANNEL_ID</name>
</field>
<field>
<id>ROOT_CHANNEL_ID</id>
<enabled>Y</enabled>
<name>ROOT_CHANNEL_ID</name>
</field>
</channel-log-table>
<step-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>TRANSNAME</id>
<enabled>Y</enabled>
<name>TRANSNAME</name>
</field>
<field>
<id>STEPNAME</id>
<enabled>Y</enabled>
<name>STEPNAME</name>
</field>
<field>
<id>STEP_COPY</id>
<enabled>Y</enabled>
<name>STEP_COPY</name>
</field>
<field>
<id>LINES_READ</id>
<enabled>Y</enabled>
<name>LINES_READ</name>
</field>
<field>
<id>LINES_WRITTEN</id>
<enabled>Y</enabled>
<name>LINES_WRITTEN</name>
</field>
<field>
<id>LINES_UPDATED</id>
<enabled>Y</enabled>
<name>LINES_UPDATED</name>
</field>
<field>
<id>LINES_INPUT</id>
<enabled>Y</enabled>
<name>LINES_INPUT</name>
</field>
<field>
<id>LINES_OUTPUT</id>
<enabled>Y</enabled>
<name>LINES_OUTPUT</name>
</field>
<field>
<id>LINES_REJECTED</id>
<enabled>Y</enabled>
<name>LINES_REJECTED</name>
</field>
<field>
<id>ERRORS</id>
<enabled>Y</enabled>
<name>ERRORS</name>
</field>
<field>
<id>LOG_FIELD</id>
<enabled>N</enabled>
<name>LOG_FIELD</name>
</field>
</step-log-table>
<metrics-log-table>
<connection/>
<schema/>
<table/>
<timeout_days/>
<field>
<id>ID_BATCH</id>
<enabled>Y</enabled>
<name>ID_BATCH</name>
</field>
<field>
<id>CHANNEL_ID</id>
<enabled>Y</enabled>
<name>CHANNEL_ID</name>
</field>
<field>
<id>LOG_DATE</id>
<enabled>Y</enabled>
<name>LOG_DATE</name>
</field>
<field>
<id>METRICS_DATE</id>
<enabled>Y</enabled>
<name>METRICS_DATE</name>
</field>
<field>
<id>METRICS_CODE</id>
<enabled>Y</enabled>
<name>METRICS_CODE</name>
</field>
<field>
<id>METRICS_DESCRIPTION</id>
<enabled>Y</enabled>
<name>METRICS_DESCRIPTION</name>
</field>
<field>
<id>METRICS_SUBJECT</id>
<enabled>Y</enabled>
<name>METRICS_SUBJECT</name>
</field>
<field>
<id>METRICS_TYPE</id>
<enabled>Y</enabled>
<name>METRICS_TYPE</name>
</field>
<field>
<id>METRICS_VALUE</id>
<enabled>Y</enabled>
<name>METRICS_VALUE</name>
</field>
</metrics-log-table>
</log>
<maxdate>
<connection/>
<table/>
<field/>
<offset>0.0</offset>
<maxdiff>0.0</maxdiff>
</maxdate>
<size_rowset>10000</size_rowset>
<sleep_time_empty>50</sleep_time_empty>
<sleep_time_full>50</sleep_time_full>
<unique_connections>N</unique_connections>
<feedback_shown>Y</feedback_shown>
<feedback_size>50000</feedback_size>
<using_thread_priorities>Y</using_thread_priorities>
<shared_objects_file/>
<capture_step_performance>N</capture_step_performance>
<step_performance_capturing_delay>1000</step_performance_capturing_delay>
<step_performance_capturing_size_limit>100</step_performance_capturing_size_limit>
<dependencies>
</dependencies>
<partitionschemas>
</partitionschemas>
<slaveservers>
</slaveservers>
<clusterschemas>
</clusterschemas>
<created_user>-</created_user>
<created_date>2023/09/07 15:49:41.910</created_date>
<modified_user>-</modified_user>
<modified_date>2023/09/07 15:49:41.910</modified_date>
<key_for_session_key/>
<is_key_private>N</is_key_private>
</info>
<notepads>
</notepads>
<connection>
<name>sqlserver</name>
<server>xxx</server>
<type>xxx</type>
<access>Native</access>
<database>xxx</database>
<port>xxx</port>
<username>xxx</username>
<password>xxx</password>
<servername/>
<data_tablespace/>
<index_tablespace/>
<attributes>
<attribute>
<code>PORT_NUMBER</code>
<attribute>xxx</attribute>
</attribute>
</attributes>
</connection>
<order>
<hop>
<from>表输入</from>
<to>表输出</to>
<enabled>Y</enabled>
</hop>
</order>
<step>
<name>表输入</name>
<type>TableInput</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<connection>sqlserver</connection>
<sql>select StuNo, StuName, StuAge, StuClass from Student</sql>
<limit/>
<lookup/>
<execute_each_row>N</execute_each_row>
<variables_active>N</variables_active>
<lazy_conversion_active>N</lazy_conversion_active>
<cached_row_meta_active>N</cached_row_meta_active>
<attributes/>
<cluster_schema/>
<remotesteps>
<input>
</input>
<output>
</output>
</remotesteps>
<GUI>
<xloc>100</xloc>
<yloc>100</yloc>
<draw>Y</draw>
</GUI>
</step>
<step>
<name>表输出</name>
<type>TableOutput</type>
<description/>
<distribute>Y</distribute>
<custom_distribution/>
<copies>1</copies>
<partitioning>
<method>none</method>
<schema_name/>
</partitioning>
<connection>sqlserver</connection>
<schema/>
<table>Student_new</table>
<commit>1000</commit>
<truncate>Y</truncate>
<ignore_errors>N</ignore_errors>
<use_batch>Y</use_batch>
<specify_fields>Y</specify_fields>
<partitioning_enabled>N</partitioning_enabled>
<partitioning_field/>
<partitioning_daily>N</partitioning_daily>
<partitioning_monthly>N</partitioning_monthly>
<tablename_in_field>N</tablename_in_field>
<tablename_field/>
<tablename_in_table>N</tablename_in_table>
<return_keys>N</return_keys>
<return_field/>
<fields>
<field>
<column_name>StuNo</column_name>
<stream_name>StuNo</stream_name>
</field>
<field>
<column_name>StuName</column_name>
<stream_name>StuName</stream_name>
</field>
<field>
<column_name>StuAge</column_name>
<stream_name>StuAge</stream_name>
</field>
<field>
<column_name>StuClass</column_name>
<stream_name>StuClass</stream_name>
</field>
</fields>
<attributes/>
<cluster_schema/>
<remotesteps>
<input>
</input>
<output>
</output>
</remotesteps>
<GUI>
<xloc>200</xloc>
<yloc>100</yloc>
<draw>Y</draw>
</GUI>
</step>
<step_error_handling>
</step_error_handling>
<slave-step-copy-partition-distribution>
</slave-step-copy-partition-distribution>
<slave_transformation>N</slave_transformation>
<attributes/>
</transformation>
总结
以上就是今天要讲的内容,本文仅仅简单介绍了如何通过java程序调用kettle的api,来实现表对表的数据抽取,而kettle其他插件的api的使用,将在下一章介绍。
文章来源:https://blog.csdn.net/qq_34165124/article/details/132735835
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。 如若内容造成侵权/违法违规/事实不符,请联系我的编程经验分享网邮箱:veading@qq.com进行投诉反馈,一经查实,立即删除!