Switch the receive path to memory-mapped I/O with PACKET_RX_RING.

Change-Id: I93231de8f108db782eaa43aceb6cc5281d3739c4
diff --git a/Android.mk b/Android.mk
index 5079082..5a35e22 100644
--- a/Android.mk
+++ b/Android.mk
@@ -1,7 +1,7 @@
 LOCAL_PATH:= $(call my-dir)
 include $(CLEAR_VARS)
 
-LOCAL_SRC_FILES:=clatd.c dump.c checksum.c translate.c icmp.c ipv4.c ipv6.c config.c dns64.c logging.c getaddr.c netlink_callbacks.c netlink_msg.c setif.c mtu.c tun.c
+LOCAL_SRC_FILES:=clatd.c dump.c checksum.c translate.c icmp.c ipv4.c ipv6.c config.c dns64.c logging.c getaddr.c netlink_callbacks.c netlink_msg.c setif.c mtu.c tun.c ring.c
 
 LOCAL_CFLAGS := -Wall -Werror -Wunused-parameter
 LOCAL_C_INCLUDES := external/libnl/include bionic/libc/dns/include
diff --git a/clatd.c b/clatd.c
index e2c96e6..5cbcc1e 100644
--- a/clatd.c
+++ b/clatd.c
@@ -51,6 +51,7 @@
 #include "getaddr.h"
 #include "dump.h"
 #include "tun.h"
+#include "ring.h"
 
 #define DEVICENAME4 "clat4"
 
@@ -229,13 +230,10 @@
 
   tunnel->write_fd6 = rawsock;
 
-  int packetsock = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IPV6));
-  if (packetsock < 0) {
-    logmsg(ANDROID_LOG_FATAL, "packet socket failed: %s", strerror(errno));
+  tunnel->read_fd6 = ring_create(tunnel);
+  if (tunnel->read_fd6 < 0) {
     exit(1);
   }
-
-  tunnel->read_fd6 = packetsock;
 }
 
 /* function: configure_interface
@@ -282,16 +280,16 @@
 }
 
 /* function: read_packet
- * reads a packet from the tunnel fd and passes it down the stack
- * active_fd - tun file descriptor marked ready for reading
- * tunnel    - tun device data
+ * reads a packet from the tunnel fd and translates it
+ * read_fd  - file descriptor to read original packet from
+ * write_fd - file descriptor to write translated packet to
+ * to_ipv6  - whether the packet is to be translated to ipv6 or ipv4
  */
-void read_packet(int active_fd, const struct tun_data *tunnel) {
+void read_packet(int read_fd, int write_fd, int to_ipv6) {
   ssize_t readlen;
   uint8_t buf[PACKETLEN], *packet;
-  int fd;
 
-  readlen = read(active_fd, buf, PACKETLEN);
+  readlen = read(read_fd, buf, PACKETLEN);
 
   if(readlen < 0) {
     logmsg(ANDROID_LOG_WARN,"read_packet/read error: %s", strerror(errno));
@@ -302,41 +300,32 @@
     return;
   }
 
-  if (active_fd == tunnel->fd4) {
-    ssize_t header_size = sizeof(struct tun_pi);
-
-    if (readlen < header_size) {
-      logmsg(ANDROID_LOG_WARN,"read_packet/short read: got %ld bytes", readlen);
-      return;
-    }
-
-    struct tun_pi *tun_header = (struct tun_pi *) buf;
-    uint16_t proto = ntohs(tun_header->proto);
-    if (proto != ETH_P_IP) {
-      logmsg(ANDROID_LOG_WARN, "%s: unknown packet type = 0x%x", __func__, proto);
-      return;
-    }
-
-    if(tun_header->flags != 0) {
-      logmsg(ANDROID_LOG_WARN, "%s: unexpected flags = %d", __func__, tun_header->flags);
-    }
-
-    fd = tunnel->write_fd6;
-    packet = buf + header_size;
-    readlen -= header_size;
-  } else {
-    fd = tunnel->fd4;
-    packet = buf;
+  struct tun_pi *tun_header = (struct tun_pi *) buf;
+  if (readlen < (ssize_t) sizeof(*tun_header)) {
+    logmsg(ANDROID_LOG_WARN,"read_packet/short read: got %ld bytes", readlen);
+    return;
   }
 
-  translate_packet(fd, (fd == tunnel->write_fd6), packet, readlen);
+  uint16_t proto = ntohs(tun_header->proto);
+  if (proto != ETH_P_IP) {
+    logmsg(ANDROID_LOG_WARN, "%s: unknown packet type = 0x%x", __func__, proto);
+    return;
+  }
+
+  if(tun_header->flags != 0) {
+    logmsg(ANDROID_LOG_WARN, "%s: unexpected flags = %d", __func__, tun_header->flags);
+  }
+
+  packet = (uint8_t *) (tun_header + 1);
+  readlen -= sizeof(*tun_header);
+  translate_packet(write_fd, to_ipv6, packet, readlen);
 }
 
 /* function: event_loop
  * reads packets from the tun network interface and passes them down the stack
  * tunnel - tun device data
  */
-void event_loop(const struct tun_data *tunnel) {
+void event_loop(struct tun_data *tunnel) {
   time_t last_interface_poll;
   struct pollfd wait_fd[] = {
     { tunnel->read_fd6, POLLIN, 0 },
@@ -352,16 +341,16 @@
         logmsg(ANDROID_LOG_WARN,"event_loop/poll returned an error: %s",strerror(errno));
       }
     } else {
-      size_t i;
-      for(i = 0; i < ARRAY_SIZE(wait_fd); i++) {
-        // Call read_packet if the socket has data to be read, but also if an
-        // error is waiting. If we don't call read() after getting POLLERR, a
-        // subsequent poll() will return immediately with POLLERR again,
-        // causing this code to spin in a loop. Calling read() will clear the
-        // socket error flag instead.
-        if(wait_fd[i].revents != 0) {
-          read_packet(wait_fd[i].fd,tunnel);
-        }
+      // Call read_packet if the socket has data to be read, but also if an
+      // error is waiting. If we don't call read() after getting POLLERR, a
+      // subsequent poll() will return immediately with POLLERR again,
+      // causing this code to spin in a loop. Calling read() will clear the
+      // socket error flag instead.
+      if (wait_fd[0].revents) {
+        ring_read(&tunnel->ring, tunnel->fd4, 0 /* to_ipv6 */);
+      }
+      if (wait_fd[1].revents) {
+        read_packet(tunnel->fd4, tunnel->write_fd6, 1 /* to_ipv6 */);
       }
     }
 
diff --git a/ring.c b/ring.c
new file mode 100644
index 0000000..5e99fd5
--- /dev/null
+++ b/ring.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * ring.c - packet ring buffer functions
+ */
+
+#include <errno.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <linux/if.h>
+#include <linux/if_packet.h>
+
+#include "logging.h"
+#include "ring.h"
+#include "translate.h"
+#include "tun.h"
+
+int ring_create(struct tun_data *tunnel) {
+  int packetsock = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IPV6));
+  if (packetsock < 0) {
+    logmsg(ANDROID_LOG_FATAL, "packet socket failed: %s", strerror(errno));
+    return -1;
+  }
+
+  int ver = TPACKET_V2;
+  if (setsockopt(packetsock, SOL_PACKET, PACKET_VERSION, (void *) &ver, sizeof(ver))) {
+    logmsg(ANDROID_LOG_FATAL, "setsockopt(PACKET_VERSION, %d) failed: %s", ver, strerror(errno));
+    return -1;
+  }
+
+  int on = 1;
+  if (setsockopt(packetsock, SOL_PACKET, PACKET_LOSS, (void *) &on, sizeof(on))) {
+    logmsg(ANDROID_LOG_WARN, "PACKET_LOSS failed: %s", strerror(errno));
+  }
+
+  struct packet_ring *ring = &tunnel->ring;
+  ring->numblocks = TP_NUM_BLOCKS;
+
+  int total_frames = TP_FRAMES * ring->numblocks;
+
+  struct tpacket_req req = {
+      .tp_frame_size = TP_FRAME_SIZE,  // Frame size.
+      .tp_block_size = TP_BLOCK_SIZE,  // Frames per block.
+      .tp_block_nr = ring->numblocks,  // Number of blocks.
+      .tp_frame_nr = total_frames,     // Total frames.
+  };
+
+  if (setsockopt(packetsock, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
+    logmsg(ANDROID_LOG_FATAL, "PACKET_RX_RING failed: %s", strerror(errno));
+    return -1;
+  }
+
+  size_t buflen = TP_BLOCK_SIZE * ring->numblocks;
+  ring->base = mmap(NULL, buflen, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_LOCKED|MAP_POPULATE,
+                    packetsock, 0);
+  if (ring->base == MAP_FAILED) {
+    logmsg(ANDROID_LOG_FATAL, "mmap %lu failed: %s", buflen, strerror(errno));
+    return -1;
+  }
+
+  ring->block = 0;
+  ring->slot = 0;
+  ring->numslots = TP_BLOCK_SIZE / TP_FRAME_SIZE;
+  ring->next = (struct tpacket2_hdr *) ring->base;
+
+  logmsg(ANDROID_LOG_INFO, "Using ring buffer with %d frames (%d bytes) at %p",
+         total_frames, buflen, ring->base);
+
+  return packetsock;
+}
+
+/* function: ring_advance
+ * advances to the next position in the packet ring
+ * ring - packet ring buffer
+ */
+static struct tpacket2_hdr* ring_advance(struct packet_ring *ring) {
+  uint8_t *next = (uint8_t *) ring->next;
+
+  ring->slot++;
+  next += TP_FRAME_SIZE;
+
+  if (ring->slot == ring->numslots) {
+    ring->slot = 0;
+    ring->block++;
+
+    if (ring->block < ring->numblocks) {
+      next += TP_FRAME_GAP;
+    } else {
+      ring->block = 0;
+      next = (uint8_t *) ring->base;
+    }
+  }
+
+  ring->next = (struct tpacket2_hdr *) next;
+  return ring->next;
+}
+
+/* function: ring_read
+ * reads a packet from the ring buffer and translates it
+ * read_fd  - file descriptor to read original packet from
+ * write_fd - file descriptor to write translated packet to
+ * to_ipv6  - whether the packet is to be translated to ipv6 or ipv4
+ */
+void ring_read(struct packet_ring *ring, int write_fd, int to_ipv6) {
+  struct tpacket2_hdr *tp = ring->next;
+  if (tp->tp_status & TP_STATUS_USER) {
+    uint8_t *packet = ((uint8_t *) tp) + tp->tp_net;
+    translate_packet(write_fd, to_ipv6, packet, tp->tp_len);
+    tp->tp_status = TP_STATUS_KERNEL;
+    tp = ring_advance(ring);
+  }
+}
diff --git a/ring.h b/ring.h
new file mode 100644
index 0000000..b9b8c11
--- /dev/null
+++ b/ring.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * ring.c - packet ring buffer functions
+ */
+#ifndef __RING_H__
+#define __RING_H__
+
+#include <linux/if.h>
+#include <linux/if_packet.h>
+
+#include "clatd.h"
+
+struct tun_data;
+
+// Frame size. Must be a multiple of TPACKET_ALIGNMENT (=16)
+// Why the 16? http://lxr.free-electrons.com/source/net/packet/af_packet.c?v=3.4#L1764
+#define TP_FRAME_SIZE (TPACKET_ALIGN(MAXMTU) + TPACKET_ALIGN(TPACKET2_HDRLEN) + 16)
+
+// Block size. Must be a multiple of the page size, and a power of two for efficient memory use.
+#define TP_BLOCK_SIZE 65536
+
+// In order to save memory, our frames are not an exact divider of the block size. Therefore, the
+// mmaped region will have gaps corresponding to the empty space at the end of each block.
+#define TP_FRAMES (TP_BLOCK_SIZE / TP_FRAME_SIZE)
+#define TP_FRAME_GAP (TP_BLOCK_SIZE % TP_FRAME_SIZE)
+
+// TODO: Make this configurable. This requires some refactoring because the packet socket is
+// opened before we drop privileges, but the configuration file is read after. A value of 16
+// results in 656 frames (1048576 bytes).
+#define TP_NUM_BLOCKS 16
+
+struct packet_ring {
+  uint8_t *base;
+  struct tpacket2_hdr *next;
+  int slot, numslots;
+  int block, numblocks;
+};
+
+int ring_create(struct tun_data *tunnel);
+void ring_read(struct packet_ring *ring, int write_fd, int to_ipv6);
+
+#endif
diff --git a/tun.h b/tun.h
index 946ab47..bcdd10e 100644
--- a/tun.h
+++ b/tun.h
@@ -21,10 +21,12 @@
 #include <linux/if.h>
 
 #include "clatd.h"
+#include "ring.h"
 
 struct tun_data {
   char device4[IFNAMSIZ];
   int read_fd6, write_fd6, fd4;
+  struct packet_ring ring;
 };
 
 int tun_open();