DPVS traffic is divided by direction of flow into inbound and outbound.
An internal server needs to reach some interface on the public network, and its request packet arrives at the DPVS server. The packet enters through some NIC queue queuex, is received by cpux, and the related processing begins there. Roughly, the receive path runs from the NIC queue through the netif receive loop on that lcore and the IPv4 receive function down to the PRE_ROUTING hook, where dp_vs_in is registered.
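As a rough illustration of why queue x ends up on cpu x: each forwarding lcore polls only its own RX queue, so whatever RSS hashes into that queue is always handled by the same core. The snippet below is only a minimal sketch of that idea, not DPVS's actual netif code; process_packet, the port id, queue id and burst size are placeholders.

/* Minimal sketch: one lcore polling its own RX queue (not DPVS's netif code). */
#include <rte_ethdev.h>
#include <rte_mbuf.h>

#define BURST_SIZE 32

/* placeholder handler; DPVS would hand the mbuf to its L3 receive path instead */
static void process_packet(struct rte_mbuf *m)
{
    rte_pktmbuf_free(m);
}

static void rx_loop(uint16_t port_id, uint16_t queue_id)
{
    struct rte_mbuf *pkts[BURST_SIZE];

    for (;;) {
        /* only this lcore reads 'queue_id', so every flow RSS hashes into that
         * queue is processed by the same core, without locks */
        uint16_t nb = rte_eth_rx_burst(port_id, queue_id, pkts, BURST_SIZE);
        for (uint16_t i = 0; i < nb; i++)
            process_packet(pkts[i]);
    }
}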
dp_vs_in itself only calls __dp_vs_in, which is where the main logic lives; the annotated listing is given further below.
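For orientation before the full listing, the wrapper and its hook registration look roughly like the following. This is a simplified sketch based on my reading of the DPVS source; the priority value and the exact shape of the registration may differ between versions.

/* Sketch: dp_vs_in is a thin AF_INET wrapper, registered at the IPv4
 * PRE_ROUTING hook (simplified; values are illustrative). */
static int dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                    const struct inet_hook_state *state)
{
    return __dp_vs_in(priv, mbuf, state, AF_INET);
}

static struct inet_hook_ops dp_vs_ops[] = {
    {
        .hook     = dp_vs_in,
        .hooknum  = INET_HOOK_PRE_ROUTING,
        .priority = 100,               /* illustrative value */
    },
};

/* called during dp_vs initialization: */
ipv4_register_hooks(dp_vs_ops, sizeof(dp_vs_ops) / sizeof(dp_vs_ops[0]));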
That covers the receiving, processing, and forwarding of the SNAT packet, i.e. the whole flow of SNAT in the outbound direction.
After the internal server sends its request, the external server does its work and sends back a response packet, which arrives at the DPVS server. At this point the DPDK driver matches the packet against FDIR (flow director) rules on the key fields, so the packet enters through the same NIC queue queuex and is received and processed by the same cpux; in this way the entire connection is handled by one and the same cpux. A minimal sketch of this kind of steering rule follows.
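To make the steering concrete, here is a minimal rte_flow sketch that pins one TCP flow, matched on destination address and port (the SNAT address and port chosen for the connection), to a fixed RX queue. It only illustrates the idea; DPVS programs its flow-director rules through its own netif/sapool layer, and the function name, addresses, port and queue index below are placeholders.

/* Minimal rte_flow sketch: steer return traffic for one SNAT'ed flow to a
 * fixed RX queue, so the lcore that created the session also sees the
 * replies. Not DPVS code; all parameters are placeholders. */
#include <rte_flow.h>
#include <rte_ethdev.h>

static struct rte_flow *steer_flow_to_queue(uint16_t port_id, uint16_t queue_id,
                                            uint32_t snat_ip_be, uint16_t snat_port_be)
{
    struct rte_flow_attr attr = { .ingress = 1 };

    struct rte_flow_item_ipv4 ip_spec = { .hdr.dst_addr = snat_ip_be };
    struct rte_flow_item_ipv4 ip_mask = { .hdr.dst_addr = 0xffffffff };
    struct rte_flow_item_tcp  tcp_spec = { .hdr.dst_port = snat_port_be };
    struct rte_flow_item_tcp  tcp_mask = { .hdr.dst_port = 0xffff };

    struct rte_flow_item pattern[] = {
        { .type = RTE_FLOW_ITEM_TYPE_ETH },
        { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &ip_spec, .mask = &ip_mask },
        { .type = RTE_FLOW_ITEM_TYPE_TCP,  .spec = &tcp_spec, .mask = &tcp_mask },
        { .type = RTE_FLOW_ITEM_TYPE_END },
    };

    struct rte_flow_action_queue queue = { .index = queue_id };
    struct rte_flow_action actions[] = {
        { .type = RTE_FLOW_ACTION_TYPE_QUEUE, .conf = &queue },
        { .type = RTE_FLOW_ACTION_TYPE_END },
    };

    struct rte_flow_error err;
    return rte_flow_create(port_id, &attr, pattern, actions, &err);
}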
The receive processing before dp_vs_in is basically the same as in the outbound direction and is not analyzed again here.
By repeating the outbound and inbound flows over and over, the connection carries its payload across multiple packet exchanges, and this is how the SNAT function is realized.
One point to note:
For traffic in the inbound direction, what is actually done is DNAT: the destination IP is translated from the LB VIP to the real RS IP, so the backend RS does see the client IP. Outbound traffic does SNAT: the source address is rewritten back to the LB VIP. A concrete example follows.
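A concrete, made-up example of the two rewrites, assuming a VIP of 10.0.0.100 and an RS of 192.168.1.10 (all addresses and ports are placeholders for illustration only):

/* Purely illustrative: how the 4-tuple is rewritten in each direction for
 * NAT-style forwarding. Addresses are made-up examples, not DPVS code. */
#include <stdio.h>

struct tuple { const char *src, *dst; };

int main(void)
{
    /* inbound (client -> LB -> RS): DNAT, the dst VIP is replaced by the RS
     * IP; the client IP stays as the source, so the RS sees the real client */
    struct tuple in_before  = { "203.0.113.7:51000", "10.0.0.100:80" };   /* client -> VIP */
    struct tuple in_after   = { "203.0.113.7:51000", "192.168.1.10:80" }; /* client -> RS  */

    /* outbound (RS -> LB -> client): SNAT, the RS source is replaced by the
     * VIP, so the reply appears to come from the service address */
    struct tuple out_before = { "192.168.1.10:80", "203.0.113.7:51000" }; /* RS  -> client */
    struct tuple out_after  = { "10.0.0.100:80",   "203.0.113.7:51000" }; /* VIP -> client */

    printf("inbound : %s -> %s   becomes   %s -> %s\n",
           in_before.src, in_before.dst, in_after.src, in_after.dst);
    printf("outbound: %s -> %s   becomes   %s -> %s\n",
           out_before.src, out_before.dst, out_after.src, out_after.dst);
    return 0;
}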
/* return verdict INET_XXX
 * af from mbuf->l3_type? No! The field is rewritten by netif and conflicts with
 * m.packet_type (a union), so a wrapper is used to pass in af. */
static int __dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                      const struct inet_hook_state *state, int af)
{
    struct dp_vs_iphdr iph;
    struct dp_vs_proto *prot;
    struct dp_vs_conn *conn;
    int dir, verdict, err, related;
    bool drop = false;
    lcoreid_t cid, peer_cid;
    eth_type_t etype = mbuf->packet_type; /* FIXME: use other field ? */
    assert(mbuf && state);

    /* get the id of the lcore we are currently running on */
    cid = peer_cid = rte_lcore_id();

    /* the packet is not addressed to this host: return ACCEPT directly,
     * ipv4_rcv_fin runs afterwards */
    if (unlikely(etype != ETH_PKT_HOST))
        return INET_ACCEPT;

    /* fill the internal dp_vs_iphdr; on error (mainly an unexpected address
     * family) return ACCEPT directly */
    if (dp_vs_fill_iphdr(af, mbuf, &iph) != EDPVS_OK)
        return INET_ACCEPT;

    /* handle ICMP messages, similar to the icmp_error handling in the Linux kernel */
    if (unlikely(iph.proto == IPPROTO_ICMP ||
                 iph.proto == IPPROTO_ICMPV6)) {
        /* handle related ICMP error to existing conn */
        verdict = dp_vs_in_icmp(af, mbuf, &related);
        if (related || verdict != INET_ACCEPT)
            return verdict;
        /* let unrelated and valid ICMP go down,
         * may implement ICMP fwd in the future. */
    }

    /* look up the L4 protocol handler; currently tcp, udp and icmp are implemented */
    prot = dp_vs_proto_lookup(iph.proto);
    if (unlikely(!prot))
        return INET_ACCEPT;

    /*
     * Defrag ipvs-forwarding TCP/UDP is not supported for some reasons,
     *
     * - RSS/flow-director do not support TCP/UDP fragments, means it's
     *   not able to direct frags to same lcore as original TCP/UDP packets.
     * - per-lcore conn table will miss if frags reach the wrong lcore.
     *
     * If we redirect frags to the "correct" lcore, it may cause performance
     * issues. It also requires understanding the RSS algorithm. Moreover, for
     * the case where frags of the same flow do not arrive on the same lcore,
     * a global lock would be needed, which is not a good idea.
     */
    /* IP fragments are not supported for now; this is tied to the flow director */
    if (af == AF_INET && ip4_is_frag(ip4_hdr(mbuf))) {
        RTE_LOG(DEBUG, IPVS, "%s: frag not support.\n", __func__);
        return INET_DROP;
    }
    /* call the protocol's conn_lookup function to find the session; for tcp this is
     * tcp_conn_lookup. The packet may be dropped here. 'dir' is set to the direction
     * of the flow (client -> LB, or real server -> LB); 'peer_cid' is the id of the
     * lcore that the lookup decides should handle this connection. */
    /* packet belongs to existing connection ? */
    conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false, &drop, &peer_cid);

    if (unlikely(drop)) {
        RTE_LOG(DEBUG, IPVS, "%s: deny ip try to visit.\n", __func__);
        return INET_DROP;
    }

    /*
     * The connection is not locally found, however the redirect is found so
     * forward the packet to the remote redirect owner core.
     */
    /* if the connection is not handled on this lcore, restore mbuf->data_off to
     * point at the L2 header and hand the packet to the owning lcore; if the ring
     * enqueue succeeds this returns INET_STOLEN, otherwise INET_DROP and the
     * packet is discarded */
    if (cid != peer_cid) {
        /* recover mbuf.data_off to outer Ether header */
        rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct rte_ether_hdr));

        return dp_vs_redirect_pkt(mbuf, peer_cid);
    }

    /* a new connection naturally has no session yet; conn_sched picks a backend
     * real server for the request and creates the connection */
    if (unlikely(!conn)) {
        /* try schedule RS and create new connection */
        /* call the protocol's conn_sched interface to choose a backend rs and create
         * the connection; if creation fails, return the verdict */
        if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) {
            /* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\n", __func__); */
            return verdict;
        }

        /* in SNAT mode an internal server is visiting an external service
         * (internal server ---> dpvs ---> external server, e.g. baidu),
         * so set dir = DPVS_CONN_DIR_OUTBOUND */
        /* only SNAT triggers connection by inside-outside traffic. */
        if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
            dir = DPVS_CONN_DIR_OUTBOUND;
        else /* all other modes set dir = DPVS_CONN_DIR_INBOUND */
            dir = DPVS_CONN_DIR_INBOUND;
    } else {
        /* assert(conn->dest->svc != NULL); */
        if (conn->dest && conn->dest->svc &&
            prot->conn_expire_quiescent &&
            (conn->dest->svc->flags & DPVS_CONN_F_EXPIRE_QUIESCENT)) {
            if (rte_atomic16_read(&conn->dest->weight) == 0) {
                RTE_LOG(INFO, IPVS, "%s: the conn is quiescent, expire it right now,"
                        " and drop the packet!\n", __func__);
                prot->conn_expire_quiescent(conn);
                dp_vs_conn_put(conn);
                return INET_DROP;
            }
        }
    }
    /* special handling: syn_proxy */
    if (conn->flags & DPVS_CONN_F_SYNPROXY) {
        if (dir == DPVS_CONN_DIR_INBOUND) {
            /* Filter out-in ack packet when cp is at SYN_SENT state.
             * Drop it if not a valid packet, store it otherwise */
            if (0 == dp_vs_synproxy_filter_ack(mbuf, conn, prot,
                                               &iph, &verdict)) {
                dp_vs_stats_in(conn, mbuf);
                dp_vs_conn_put(conn);
                return verdict;
            }

            /* "Reuse" synproxy sessions.
             * "Reuse" means update syn_proxy_seq struct
             * and clean ack_mbuf etc. */
            if (0 != dp_vs_synproxy_ctrl_conn_reuse) {
                if (0 == dp_vs_synproxy_reuse_conn(af, mbuf, conn, prot,
                                                   &iph, &verdict)) {
                    dp_vs_stats_in(conn, mbuf);
                    dp_vs_conn_put(conn);
                    return verdict;
                }
            }
        } else {
            /* Syn-proxy 3 logic: receive syn-ack from rs */
            if (dp_vs_synproxy_synack_rcv(mbuf, conn, prot,
                                          iph.len, &verdict) == 0) {
                dp_vs_stats_out(conn, mbuf);
                dp_vs_conn_put(conn);
                return verdict;
            }
        }
    }

    /* state transition; for tcp this is tcp_state_trans */
    if (prot->state_trans) {
        err = prot->state_trans(prot, conn, mbuf, dir);
        if (err != EDPVS_OK)
            RTE_LOG(WARNING, IPVS, "%s: fail to trans state.", __func__);
    }
    conn->old_state = conn->state;

    /* holding the conn, need a "put" later. */
    /* choose how to transmit the packet according to the flow direction 'dir' */
    if (dir == DPVS_CONN_DIR_INBOUND)
        return xmit_inbound(mbuf, prot, conn);
    else
        return xmit_outbound(mbuf, prot, conn);
}
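To illustrate the cid != peer_cid branch above: conceptually, the packet is handed to the owning lcore through a per-lcore ring and the current lcore reports INET_STOLEN, so the caller stops touching the mbuf. The sketch below only shows that idea with a plain rte_ring; redirect_rings and redirect_pkt_sketch are hypothetical names, not the actual dp_vs_redirect_pkt implementation.

/* Conceptual sketch of redirecting an mbuf to its owner lcore via a ring.
 * 'redirect_rings' is a hypothetical per-lcore ring array; lcoreid_t,
 * INET_DROP and INET_STOLEN come from the DPVS headers. */
#include <rte_ring.h>
#include <rte_mbuf.h>
#include <rte_lcore.h>

extern struct rte_ring *redirect_rings[RTE_MAX_LCORE];  /* hypothetical */

static int redirect_pkt_sketch(struct rte_mbuf *mbuf, lcoreid_t peer_cid)
{
    /* hand the packet to the lcore that owns the connection */
    if (rte_ring_enqueue(redirect_rings[peer_cid], mbuf) != 0) {
        rte_pktmbuf_free(mbuf);
        return INET_DROP;      /* ring full: drop the packet */
    }
    /* the mbuf now belongs to the peer lcore, which dequeues it in its own
     * loop and runs it through dp_vs_in again */
    return INET_STOLEN;
}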
Original article: https://www.cnblogs.com/codestack/p/15717792.html