App-Oozie
view release on metacpan or search on metacpan
eg/workflows/cpan-sample-workflow/workflow.xml view on Meta::CPAN
<!-- flagDirForToday is populated on coordinator.xml -->
<mkdir path='${flagDirForToday}' />
<touchz path='${flagDirForToday}/OK' />
</fs>
<ok to="join_example" />
<error to="kill" />
</action>
<!-- Sqoop action, part 1: a shell step that runs get-config.pl and captures what it prints -->
<action name="sqoop_example" retry-max="3" retry-interval="10">
    <shell xmlns="uri:oozie:shell-action:0.3">
        <exec>get-config.pl</exec>
        <file>get-config.pl</file>
        <!--
            The captured output is what the sqoop_example_sqoop action
            reads back through wf:actionData('sqoop_example')
        -->
        <capture-output/>
    </shell>
    <!-- On success run the import; on error skip it and go to the join -->
    <ok to="sqoop_example_sqoop"/>
    <error to="join_example"/>
</action>
<!--
    Sqoop import step. The connection settings (db_host, db_schema,
    db_user, db_password_file) are read from the captured output of the
    "sqoop_example" action via wf:actionData().
-->
<action name="sqoop_example_sqoop" retry-max="3" retry-interval="10">
    <sqoop xmlns="uri:oozie:sqoop-action:0.4">
        <!--
            Accepts delete and mkdir; for hive imports, remove the
            leftovers of a potentially failed previous import
        -->
        <prepare>
            <delete path="hdfs:///user/hive/warehouse/my_table_name"/>
        </prepare>
        <configuration>
            <!-- Put the import in the proper pool for throttling -->
            <property>
                <name>mapred.fairscheduler.pool</name>
                <value>my-big-fat-pool</value>
            </property>
            <!-- Other properties, like job name, pool name, etc -->
        </configuration>
        <!--
            Needs some variables, put them in global for instance.
            For the DB passwords, instead of passing a password
            as a bare string, you need to use an HDFS secret (see below).
            The secret may not be available for new databases, if
            this is the case please contact Team BigData.
        -->
        <arg>import</arg>
        <!--
            Notice that the configuration is accessed by the
            action name: "sqoop_example"
        -->
        <arg>--connect</arg>
        <!--
            Each EL expression must be closed with "}" before the literal
            part of the URL continues, and the ampersand separating the
            JDBC parameters must be written as an entity to keep this
            file well-formed; the parser hands a plain ampersand to Sqoop.
        -->
        <arg>jdbc:mysql://${wf:actionData('sqoop_example')['db_host']}/${wf:actionData('sqoop_example')['db_schema']}?connectTimeout=300000&amp;socketTimeout=7200000</arg>
        <arg>--username</arg>
        <arg>${wf:actionData('sqoop_example')['db_user']}</arg>
        <!-- The password is read from a file (the HDFS secret mentioned above), never passed inline -->
        <arg>--password-file</arg>
        <arg>${wf:actionData('sqoop_example')['db_password_file']}</arg>
        <arg>--num-mappers</arg>
        <arg>64</arg>
        <!-- other sqoop options can be added at this point -->
    </sqoop>
    <ok to="join_example"/>
    <error to="kill"/>
</action>
<!-- Shell action: can run any executable, here myprogram.pl -->
<action name="shell_example" retry-max="3" retry-interval="10">
    <shell xmlns="uri:oozie:shell-action:0.3">
        <!-- A configuration section placed inside an action is scoped to that action -->
        <configuration>
            <property>
                <name>mapreduce.job.name</name>
                <value>example-thingie</value>
            </property>
        </configuration>
        <exec>myprogram.pl</exec>
        <argument>--dryrun</argument>
        <argument>--whatever=foo</argument>
        <file>myprogram.pl</file>
        <!-- The program's stdout (foo=bar pairs) is captured as the action's output -->
        <capture-output/>
    </shell>
    <ok to="join_example"/>
    <error to="kill"/>
</action>
<!-- End of the fork: the paths that transition to join_example meet here, then mail_example runs -->
<join name="join_example" to="mail_example"/>
<!-- Send an email -->
<action name = "mail_example"
retry-max = "3"
retry-interval = "10"
>
<email xmlns="uri:oozie:email-action:0.1">
<!--
Remember the emailTo variable in global section?
Also note that ${clusterName} is populated by default when a
job is deployed using oozie-deploy
( run in 0.771 second using v1.01-cache-2.11-cpan-97f6503c9c8 )