diff mbox

IPv4 IPv6 parallel dns lookup in combination with nfqueue is problematic

Message ID CAMxdDZBwqRxZjywAfHUm-bbe-0veLPqPwAfFpw90cb0As80Dmg@mail.gmail.com
State Deferred
Delegated to: Pablo Neira
Headers show

Commit Message

Tarik Demirci July 24, 2015, 10:34 a.m. UTC
Hi Everyone,

Problem:
I have a simple daemon listening for packets coming from nfqueue. When
a client issues  parallel dns requests for IPv4 and IPv6 addresses
(since glibc 2.9 this is default behaviour), IPv6 request is dropped
on its way in gateway. Client, after 5 seconds timeout, sends these
requests sequentially and there is no problem in this case.

Workaround:
I applied a kernel patch from an earlier mail (
http://www.spinics.net/lists/netfilter-devel/msg15860.html ) to kernel
version 3.16. This patch solves the problem but I'm unaware of the
performance and security implications of this solution. I hope to find
a better solution that doesn't require patching kernel.


Regards,
Tarik.

Related links to the problem:
https://bbs.archlinux.org/viewtopic.php?id=75770
https://www.astaro.org/gateway-products/management-networking-logging-reporting/51569-slow-dns-queries-parallel-requests-ipv6.html
http://www.spinics.net/lists/netfilter-devel/msg15860.html

Extra info:
I insert packets to nfqueue in mangle table (rather than raw) because
the daemon will need to process connection marks in the future.
Currently, it reads packets from queue, marks them and allows them to
pass (NF_ACCEPT).

Network topology:
In my topology, a client (10.21.0.100) sends dns requests to 8.8.4.4
via gateway (10.21.0.1). Gateway performs snat (to 10.100.0.21) and
sends packets. The daemon runs on gateway.
10.21.0.100 (client)  ---->  10.21.0.1 (gw internal interface) ------>
(snat) 10.100.0.21 (gw external interface) -----> 8.8.4.4

Iptables rule:
iptables -t mangle -A FORWARD -m mark --mark 0x0/0x3000000 -j NFQUEUE
--queue-num 10 --queue-bypass

--------

tcpdump output (unpatched kernel):
11:08:13.580903 IP 10.21.0.100.40004 > 8.8.4.4.53:  34824+ A? httpbin.org. (29)
11:08:13.580958 IP 10.21.0.100.40004 > 8.8.4.4.53:  17101+ AAAA?
httpbin.org. (29)
11:08:13.581084 IP 10.100.0.21.40004 > 8.8.4.4.53:  34824+ A? httpbin.org. (29)
11:08:13.604559 IP 8.8.4.4.53 > 10.100.0.21.40004:  34824 1/0/0 A
54.175.222.246 (45)
11:08:13.604607 IP 8.8.4.4.53 > 10.21.0.100.40004:  34824 1/0/0 A
54.175.222.246 (45)
11:08:18.585022 IP 10.21.0.100.40004 > 8.8.4.4.53:  34824+ A? httpbin.org. (29)
11:08:18.585097 IP 10.100.0.21.40004 > 8.8.4.4.53:  34824+ A? httpbin.org. (29)
11:08:18.606474 IP 8.8.4.4.53 > 10.100.0.21.40004:  34824 1/0/0 A
54.175.222.246 (45)
11:08:18.606563 IP 8.8.4.4.53 > 10.21.0.100.40004:  34824 1/0/0 A
54.175.222.246 (45)
11:08:18.607175 IP 10.21.0.100.40004 > 8.8.4.4.53:  17101+ AAAA?
httpbin.org. (29)
11:08:18.607246 IP 10.100.0.21.40004 > 8.8.4.4.53:  17101+ AAAA?
httpbin.org. (29)
11:08:18.664119 IP 8.8.4.4.53 > 10.100.0.21.40004:  17101 0/1/0 (110)
11:08:18.664201 IP 8.8.4.4.53 > 10.21.0.100.40004:  17101 0/1/0 (110)

----

tcpdump output (patched kernel):

15:39:53.141114 IP 10.21.0.100.58891 > 8.8.4.4.53:  43314+ A? httpbin.org. (29)
15:39:53.141247 IP 10.21.0.100.58891 > 8.8.4.4.53:  25492+ AAAA?
httpbin.org. (29)
15:39:53.141362 IP 10.100.0.21.58891 > 8.8.4.4.53:  43314+ A? httpbin.org. (29)
15:39:53.141672 IP 10.100.0.21.58891 > 8.8.4.4.53:  25492+ AAAA?
httpbin.org. (29)
15:39:53.166438 IP 8.8.4.4.53 > 10.100.0.21.58891:  25492 0/1/0 (110)
15:39:53.166507 IP 8.8.4.4.53 > 10.21.0.100.58891:  25492 0/1/0 (110)
15:39:53.167052 IP 8.8.4.4.53 > 10.100.0.21.58891:  43314 1/0/0 A
54.175.219.8 (45)
15:39:53.167095 IP 8.8.4.4.53 > 10.21.0.100.58891:  43314 1/0/0 A
54.175.219.8 (45)

-------


Kernel patch(3.16.3):

+               }
+
                /* Seen it before?  This can happen for loopback, retrans,
                 * or local packets.
                 */


-------

Source code of daemon:

#include <stdlib.h>
#include <signal.h>
#include <poll.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <arpa/inet.h>
#include <linux/netfilter.h>
#include <libnetfilter_queue/libnetfilter_queue.h>
#include <syslog.h>


#define CUSTOM_MARK 0x2000000

/* how long to wait for a new packet */
#define POLL_TIME 10

/* Shutdown flag: set to non-zero by the signal handler and polled by the
 * receive loop.  It is declared volatile sig_atomic_t because that is the
 * object type the C standard guarantees a signal handler may write safely. */
volatile sig_atomic_t g_shutdown = 0;

/**
 * Netfilter queue callback: accept the packet and stamp it with CUSTOM_MARK.
 *
 * A verdict is issued for every packet handed to the callback, including
 * packets that carry no payload.  The previous version skipped the verdict
 * when nfq_get_payload() reported no data, which left such packets without
 * any verdict at all.
 *
 * @param queue_handler queue the packet was delivered on
 * @param nfmsg         netlink message header (unused)
 * @param tb            attributes of the queued packet
 * @param arg           user pointer given at queue creation (unused)
 * @return always 0
 */
int nfq_callback_handler(struct nfq_q_handle *queue_handler,
                         struct nfgenmsg *nfmsg, struct nfq_data *tb, void *arg){
    struct nfqnl_msg_packet_hdr *hdr = nfq_get_msg_packet_hdr(tb);
    /* As before, fall back to id 0 when the packet header is missing. */
    uint32_t packet_id = hdr ? ntohl(hdr->packet_id) : 0;

    (void)nfmsg;
    (void)arg;

    if (nfq_set_verdict2(queue_handler,
                         packet_id,
                         NF_ACCEPT,
                         CUSTOM_MARK,
                         0,
                         NULL) < 0)
    {
        /* Report the failure instead of dropping it silently. */
        syslog(LOG_WARNING, "nfq_set_verdict2 failed for packet %u.",
               (unsigned int)packet_id);
    }
    return 0;
}

/**
 * Receive netlink messages from the queue socket and dispatch them to the
 * registered callback until g_shutdown becomes non-zero, the socket is
 * closed, or an unrecoverable poll/recv error occurs.
 *
 * @param nfqh library handle the messages are dispatched through
 * @param fd   netlink socket file descriptor belonging to nfqh
 */
static void nfq_receive_loop(struct nfq_handle *nfqh, int fd)
{
    /* The queue is configured with a copy range of 0xffff bytes, so the
     * receive buffer must hold a full-size packet plus the netlink headers
     * and attributes; a smaller buffer truncates large messages. */
    char buf[0xffff + 4096];
    struct pollfd pollinfo;
    ssize_t ret;

    while (!g_shutdown)
    {
        pollinfo.fd = fd;
        pollinfo.events = POLLIN;

        ret = poll(&pollinfo, 1, POLL_TIME);
        if ((ret < 0) && (errno != EINTR))
        {
            int err = errno;
            syslog(LOG_ERR,"poll error nfq fd %d (%d/%s)", fd,
                   err, strerror(err));
            break;
        }

        /* Drain everything that is currently readable without blocking. */
        while ((ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT)) > 0) {
            nfq_handle_packet(nfqh, buf, (int)ret);
        }

        if (ret == -1)
        {
            int err = errno;
            /* EAGAIN: nothing left to read; EINTR: interrupted by a signal;
             * ENOBUFS: deliberately tolerated, keep receiving. */
            if (err != EAGAIN && err != EINTR && err != ENOBUFS)
            {
                syslog(LOG_ERR, "recv error nfq fd %d (%d/%s)", fd, err,
                       strerror(err));
                break;
            }
        }
        else if (ret == 0)
        {
            syslog(LOG_ERR,"nfq socket closed");
            break;
        }
    }
}

/**
 * Open the netfilter queue library, bind queue 10 for AF_INET in
 * packet-copy mode, and process packets until shutdown is requested.
 *
 * Every failure is reported through syslog.  The library handle (and the
 * queue, once it has been created) is released on every exit path,
 * including the setup-failure paths that previously leaked them.
 */
void initialize_queue() {
    struct nfq_handle *nfqh = NULL;
    struct nfq_q_handle *queue_handler = NULL;
    unsigned int queue_num = 10;
    unsigned int queue_size = 10000;
    int rcvbuf_size = 1024 * 1024;
    int fd;

    nfqh = nfq_open();
    if (nfqh == NULL) {
        syslog(LOG_ERR, "nfq_open failed.");
        return;
    }

    /* ignore return code for this since it's inconsistent between kernel versions */
    /* see http://www.spinics.net/lists/netfilter/msg42063.html */
    nfq_unbind_pf(nfqh, AF_INET);

    if (nfq_bind_pf(nfqh, AF_INET) < 0) {
        syslog(LOG_ERR,"nfq_bind_pf failed.");
        goto out_close;
    }

    queue_handler = nfq_create_queue(nfqh, queue_num,
                                     &nfq_callback_handler, NULL);
    if (queue_handler == NULL) {
        syslog(LOG_ERR,"nfq_create_queue on %u failed.", queue_num);
        goto out_close;
    }

    if (nfq_set_mode(queue_handler, NFQNL_COPY_PACKET, 0xffff) < 0) {
        syslog(LOG_ERR,"failed to set NFQNL_COPY_PACKET.");
        goto out_destroy;
    }

    /* get the file descriptor for netlink queue */
    fd = nfnl_fd(nfq_nfnlh(nfqh));

    /* Enlarge the socket receive buffer; failure is only a warning. */
    if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size,
                   sizeof(rcvbuf_size)) == -1) {
        syslog(LOG_WARNING,"Buffer size could not be set");
    }

    /* Raise the maximum queue length; failure is only a warning. */
    if (nfq_set_queue_maxlen(queue_handler, queue_size) == -1) {
        syslog(LOG_WARNING,"Queue size could not be set.");
    }

    nfq_receive_loop(nfqh, fd);

out_destroy:
    nfq_destroy_queue(queue_handler);
out_close:
    nfq_close(nfqh);
}

/* Number of the signal that requested shutdown; 0 while none has arrived.
 * Written only by sig_handler() and read by main() after the queue loop
 * has returned. */
static volatile sig_atomic_t g_shutdown_signal = 0;

static void sig_handler(int signum){
    /**
     * This function handles caught signals.
     *
     * SIGINT and SIGTERM request shutdown; any other signal routed here is
     * ignored.  Only flags are set: syslog() is not async-signal-safe, so
     * the log message is emitted later from main().
     *
     * g_shutdown is defined elsewhere in the file; writing it from a
     * handler is only well-defined if it is a volatile sig_atomic_t.
     *
     * @param signum : Signal that was delivered
     * @return void
     */

    if (signum == SIGINT || signum == SIGTERM) {
        g_shutdown_signal = signum;
        g_shutdown = 1;
    }
}

/**
 * Program entry point: install the signal handlers, open syslog, run the
 * queue loop until it returns, then report which signal (if any) requested
 * the shutdown.
 *
 * @param argc : unused
 * @param argv : unused
 * @return always 0
 */
int main(int argc, char *argv[]){
    int logOpt = LOG_PID;

    (void)argc;
    (void)argv;

    signal(SIGINT, sig_handler);  //sig number 2
    signal(SIGTERM, sig_handler); //sig number 15
    /* SIGHUP and SIGUSR1 are routed to the handler as well, which takes no
     * action for them. */
    signal(SIGHUP, sig_handler);  //sig number 1
    signal(SIGUSR1, sig_handler);  //sig number 10
    openlog("sniffer", logOpt, LOG_USER);

    syslog(LOG_INFO, "Program is  started.");
    initialize_queue();

    /* Emit the shutdown message here, outside of signal context. */
    if (g_shutdown_signal == SIGINT) {
        syslog(LOG_INFO,"Interrupted.");
    }
    else if (g_shutdown_signal == SIGTERM) {
        syslog(LOG_INFO,"Killed.");
    }
    closelog();

    return 0;
}

Comments

Pablo Neira Ayuso July 30, 2015, 12:21 p.m. UTC | #1
On Fri, Jul 24, 2015 at 01:34:19PM +0300, Tarik Demirci wrote:
> Hi Everyone,
> 
> Problem:
> I have a simple daemon listening for packets coming from nfqueue. When
> a client issues  parallel dns requests for IPv4 and IPv6 addresses
> (since glibc 2.9 this is default behaviour), IPv6 request is dropped
> on its way in gateway. Client, after 5 seconds timeout, sends these
> requests sequentially and there is no problem in this case.
> 
> Workaround:
> I applied a kernel patch from an earlier mail (
> http://www.spinics.net/lists/netfilter-devel/msg15860.html ) to kernel
> version 3.16. This patch solves the problem but I'm unaware of the
> performance and security implications of this solution. I hope to find
> a better solution that doesn't require patching kernel.

I think we can resolve this from nf_reinject() which is slow path,
with something that looks like this:

{
        struct nf_conntrack_tuple_hash *h;
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct;

        ct = nf_ct_get(skb, &ctinfo);
        if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
                return;

        h = nf_conntrack_find_get(nf_ct_net(ct), nf_ct_zone(ct),
                                  &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        if (h) {
                nf_conntrack_put(ct);
                ct = nf_ct_tuplehash_to_ctrack(h);
                skb->nfct = &ct->ct_general;
                skb->nfctinfo = ctinfo;
        }
}

But to avoid a dependency on conntrack, we have to add an RCU-protected hook
function pointer, so that this code is only invoked if conntrack is loaded.

I'll try to find some spare time to send a patch, otherwise if there
is anyone else willing to work on this, just drop me a line privately.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index f1787c0..b9f282a 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -18,6 +18,11 @@ 
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_l3proto.h>

+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+#include <net/netfilter/nf_conntrack_zones.h>
 static const struct xt_table nf_nat_ipv4_table = {
        .name           = "nat",
        .valid_hooks    = (1 << NF_INET_PRE_ROUTING) |
@@ -107,6 +112,20 @@  nf_nat_ipv4_fn(const struct nf_hook_ops *ops,
                }
                /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
        case IP_CT_NEW:
+               /* Nasty asynchronous DNS hack: Avoid NAT and
conntrack_confirm race */
+               if (!nf_ct_is_confirmed(ct) && CTINFO2DIR(ctinfo) ==
IP_CT_DIR_ORIGINAL &&
+
ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == IPPROTO_UDP) {
+                       struct nf_conntrack_tuple_hash *h =
nf_conntrack_find_get(
+                                       nf_ct_net(ct),
+                                       NF_CT_DEFAULT_ZONE,
+
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+                       if (h) {
+                               ct = nf_ct_tuplehash_to_ctrack(h);
+                               nf_conntrack_put(skb->nfct);
+                               skb->nfct = &ct->ct_general;
+                       }