
Apache Airflow(或简称 Airflow)是一个以编程方式创作、安排和监控工作流的平台。
当工作流被定义为代码时,它们变得更易于维护、可版本化、可测试和协作。
使用 Airflow 可以将工作流编写为任务的有向无环图(DAG)。Airflow 调度器会在遵循指定依赖关系的前提下,把任务分发到一组 worker 节点上执行。丰富的命令行工具让对 DAG 执行各种复杂操作变得轻而易举;丰富的用户界面则可以方便地可视化生产环境中运行的流水线、监控进度,并在需要时排查问题。
1、 安装python3.6.8
2、 在线安装airflow
pip3 install apache-airflow
报错:
Attempting uninstall: PyYAML
Found existing installation: PyYAML 3.13
ERROR: Cannot uninstall 'PyYAML'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.
错误:无法卸载 'PyYAML'。它是一个由 distutils 安装的项目,因此无法准确确定哪些文件属于该项目,强行卸载只会导致部分卸载。
解决方法:忽略某些依赖,此处为PyYAML:
pip3 install apache-airflow --ignore-installed PyYAML
3、执行airflow命令,在~/airflow/生成配置文件airflow.cfg
mysql要创建一个airflow库,同时 my.cnf 设置 explicit_defaults_for_timestamp=1
修改airflow.cfg配置文件
# url路径
base_url = http://localhost:8080/airflow
报错:
https://www.cnblogs.com/jhno1/p/14980599.html https://zhuanlan.zhihu.com/p/42239028
airflow.exceptions.AirflowConfigException: error: sqlite C library version too old (< 3.15.0). See https://airflow.apache.org/docs/apache-airflow/2.2.4/howto/set-up-database.html#setting-up-a-sqlite-database
解决方法:升级sqlite3:
1)下载源码
[root@stg-airflow001 ~]$ wget https://www.sqlite.org/2019/sqlite-autoconf-3290000.tar.gz
2) 编译
[root@stg-airflow001 ~]$ tar zxvf sqlite-autoconf-3290000.tar.gz
[root@stg-airflow001 ~]$ cd sqlite-autoconf-3290000/
[root@stg-airflow001 ~/sqlite-autoconf-3290000]$ ./configure --prefix=/usr/local
[root@stg-airflow001 ~/sqlite-autoconf-3290000]$ make && make install
3)替换系统低版本 sqlite3
[root@stg-airflow001 ~/sqlite-autoconf-3290000]$ cd
[root@stg-airflow001 ~]$ mv /usr/bin/sqlite3 /usr/bin/sqlite3.bak
4) 设置共享库
[root@stg-airflow001 ~]$ ln -s /usr/local/bin/sqlite3 /usr/bin/sqlite3
[root@stg-airflow001 ~]$ echo "/usr/local/lib" > /etc/ld.so.conf.d/sqlite3.conf
[root@stg-airflow001 ~]$ ldconfig
[root@stg-airflow001 ~]$ sqlite3 -version
3.29.0 2019-07-10 17:32:03 fc82b73eaac8b36950e527f12c4b5dc1e147e6f4ad2217ae43ad82882a88bfa6
4、初始化数据库配置
airflow db init
5、创建页面登录用户
airflow users create --lastname user --firstname airflow --username airflow --email xxx@qq.com --role Admin --password airflow
6、启动服务
airflow scheduler -D
airflow webserver -D
7、页面访问
http://localhost:8080
http://localhost:8080/airflow/
8、结束airflow
# Stop Airflow: ask the scheduler/webserver to exit with SIGTERM first so
# they can shut down cleanly; only escalate to SIGKILL for stragglers.
# (pgrep/pkill -f matches the full command line, replacing ps|grep|awk.)
pkill -TERM -f 'airflow scheduler|airflow-webserver' || true
sleep 5
pkill -KILL -f 'airflow scheduler|airflow-webserver' 2>/dev/null || true
# Remove the stale pid files that daemon mode (-D) leaves behind —
# the webserver writes its own pid files too, not just the scheduler.
rm -f airflow/airflow-scheduler.pid \
      airflow/airflow-webserver.pid \
      airflow/airflow-webserver-monitor.pid
官方的Airflow镜像没有JAVA、Hadoop、Hive等组件,需要自己制作镜像
# Custom Airflow image: the official image ships without JDK/Hadoop/Hive,
# so this build layers them on top of the upstream 2.2.1 image.
FROM apache/airflow:2.2.1-python3.7
# root is required for apt operations and for unpacking into /opt.
USER root
# Single layer: (1) blank out the *-security apt entries and drop the MySQL
# repo list, (2) switch the Debian mirror to USTC (faster inside China),
# (3) install common debugging/ops tools, (4) purge apt caches to keep the
# layer small, (5) download and unpack JDK 8u202, Hadoop 3.2.1 and Hive
# 3.1.2 from a Tencent COS bucket, then move them under /opt.
# NOTE(review): the tarballs come from a private bucket — verify it is
# still reachable before rebuilding.
RUN sed -i 's/.*security.*//g' /etc/apt/sources.list \
&& rm /etc/apt/sources.list.d/mysql.list && sed -i 's/deb.debian.org/mirrors.ustc.edu.cn/g' /etc/apt/sources.list \
&& apt-get update && apt-get install wget procps telnet net-tools dnsutils iputils-ping curl vim git -y \
&& apt-get autoremove -yqq --purge \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& for x in jdk-8u202-linux-x64.tar.gz hadoop-3.2.1.tar.gz apache-hive-3.1.2-bin.tar.gz; do curl -SLf https://cos-1252366911.cos.ap-shanghai.myqcloud.com/public/packages/${x} | tar -zx ;done \
&& mv jdk1.8.0_202 hadoop-3.2.1 apache-hive-3.1.2-bin /opt
# Expose the Java/Hadoop/Hive toolchain to every later layer and at runtime.
ENV JAVA_HOME=/opt/jdk1.8.0_202 HADOOP_HOME=/opt/hadoop-3.2.1 HIVE_HOME=/opt/apache-hive-3.1.2-bin
ENV PATH=${JAVA_HOME}/bin:${HADOOP_HOME}/bin:${HIVE_HOME}/bin:${PATH}
# Drop back to the unprivileged airflow user before installing Python
# extras, so pip installs into the airflow user's site-packages.
USER airflow
RUN pip3 install --upgrade pip && pip3 install -i https://pypi.douban.com/simple/ --no-cache-dir airflow-gitlab-webhook airflow-code-editor apache-airflow-providers-apache-livy flask-admin
apache-airflow v2.2.1 依赖 docker-compose >= 1.29.0
curl -LfO 'https://airflow.apache.org/docs/apache-airflow/2.2.1/docker-compose.yaml'
# 设置环境参数
echo -e "AIRFLOW_UID=$(id -u)" > .env
## postgres持久化
# 创建目录
mkdir -p ./postgres
# 配置postgres-db-volume,默认为空
# docker-compose override: back the named volume postgres-db-volume with a
# bind mount to ./postgres so the database survives `docker-compose down`.
# (Indentation restored — the snippet was pasted flat and is invalid YAML
# without it.)
volumes:
  postgres-db-volume:
    driver: local
    driver_opts:
      type: none
      o: bind
      # NOTE(review): the local driver generally requires an absolute path
      # here (e.g. ${PWD}/postgres) — confirm before deploying.
      device: ./postgres
# 上传airflow.cfg
# 启动airflow
docker-compose up -d
docker-compose down
git仓库用户名密码:git-credentials.yaml
# Kubernetes Secret holding git-sync credentials for the DAG repository.
# Values under `data` must be base64-encoded.
# WARNING(review): base64 is encoding, not encryption — these credentials
# are effectively published in this document; rotate them and source the
# Secret from a secret manager instead of committing it.
---
apiVersion: v1
kind: Secret
metadata:
  name: git-credentials
data:
  GIT_SYNC_USERNAME: c2hhbmhhaS56aHVAY29udmVydGxhYi5jb20=
  GIT_SYNC_PASSWORD: MTIzNDU2QHF3ZQ==
airflow访问Hadoop集群
在./airflow/files/pod-template-file.kubernetes-helm-yaml末尾添加hostAliases:
# Static /etc/hosts entries injected into Airflow pods so they can resolve
# the EMR Hadoop master nodes without relying on cluster DNS.
# (Indentation restored — the pasted snippet had lost its list structure.)
hostAliases:
  - ip: "10.254.136.15"
    hostnames:
      - "emr-header-3.cluster-330678"
      - "emr-header-3"
      - "iz2zed1lfyebocp9r0cortz"
  - ip: "10.254.136.1"
    hostnames:
      - "emr-header-2.cluster-330678"
      - "emr-header-2"
      - "iz2zed1lfyebocp9r0corsz"
修改values.yaml
# Scattered excerpts from the Helm chart's values.yaml — these keys live at
# different nesting levels in the real file (indentation is flattened here,
# so this is not a valid document on its own).
defaultAirflowRepository: ccr.ccs.tencentyun.com/pretool-hub/airflow
# presumably dags.gitSync.repository — TODO confirm against the chart
repository: convertlab.tencentcloudcr.com/devops-public/git-sync
executor: "KubernetesExecutor"
# Pod environment: pin the container timezone.
env:
- name: "TZ"
value: "Asia/Shanghai"
# Metadata-database connection (data.metadataConnection in the chart —
# verify). WARNING(review): plaintext DB credentials are exposed below;
# rotate this password and supply it via a Secret instead.
user: ma_uat
pass: z4Cy^rzGqId9*tNV
protocol: mysql
host: rm-2ze27jxae3legwsrm.mysql.rds.aliyuncs.com
port: 3306
db: airflow
# DAG folder sub-path inside the synced git repository.
subPath: "dags"
Ingress:
# Ingress exposing the Airflow webserver through the in-cluster nginx
# controller, with TLS for ma-airflow.ain-test.digitalyili.com.
# (Indentation restored — the pasted manifest was flattened and would not
# parse as networking.k8s.io/v1 Ingress.)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: airflow-airflow-ingress
  labels:
    tier: airflow
    component: airflow-ingress
    release: airflow
    chart: "airflow-1.3.0"
    heritage: Helm
spec:
  tls:
    - hosts:
        - ma-airflow.ain-test.digitalyili.com
      secretName: ain-test.digitalyili.com
  rules:
    - http:
        paths:
          - backend:
              service:
                name: airflow-webserver
                port:
                  name: airflow-ui
            path: /
            pathType: ImplementationSpecific
      host: "ma-airflow.ain-test.digitalyili.com"
  ingressClassName: nginx-in
发布
helm pull apache-airflow/airflow --version 1.3.0
helm upgrade --install --namespace xxx airflow apache-airflow/airflow --version 1.3.0 -f values.yaml --debug --timeout 10m0s
helm upgrade --install --no-hooks --namespace xxx airflow apache-airflow/airflow --version 1.3.0 -f values.yaml --debug --timeout 10m0s
helm template airflow apache-airflow/airflow --version 1.3.0 -f values.yaml --debug --timeout 10m0s