文章首发于安全客 ,本文由安全客原创发布
转载,请参考转载声明,注明出处: https://www.anquanke.com/post/id/202988
安全客 – 有思想的安全新媒体

Kernel Pwn 学习之路(五)

0x01 前言

由于关于Kernel安全的文章实在过于繁杂,本文有部分内容大篇幅或全文引用了参考文献,若出现此情况的,将在相关内容的开头予以说明,部分引用参考文献的将在文件结尾的参考链接中注明。

Kernel的相关知识以及一些实例在Kernel中的利用已经在Kernel Pwn 学习之路(一)(二)给予了说明

Kernel中内存管理的相关知识已经在Kernel Pwn 学习之路(三)给予了说明

本文主要接续Kernel Pwn 学习之路(四),继续研究内核中断的相关机制。本文涉及到的所有Linux Kernel相关代码均基于5.6.2版本。

限于篇幅的原因,本文仅介绍了异常中断前处理,下一篇文章将深入中断服务函数,介绍其内部实现~

【传送门】:Kernel Pwn 学习之路(一)

【传送门】:Kernel Pwn 学习之路(二)

【传送门】:Kernel Pwn 学习之路(三)

【传送门】:Kernel Pwn 学习之路(四)

0x02 通用内核代码中的IDT相关处理

在上一篇文章的分析中,处理机进入了保护模式以及长模式,并在平台相关代码中完成了IDT的初始化。在那之后,流程将转移到通用内核代码,接下来我们分析通用内核代码中与IDT相关的处理代码。

入口函数在/source/init/main.c中实现(这里省略不分析的函数),这个函数将完成内核以 PID 1 运行第一个 init 进程之前的所有初始化工作。

/*
 * Abridged excerpt of the generic kernel entry point; every "......" line
 * stands for code omitted from this listing. The "// Line N" markers are
 * the line offsets that the surrounding text uses to refer to each call.
 *
 * Visible flow: local interrupts are disabled and early_boot_irqs_disabled
 * is set, setup_arch() and boot_init_stack_canary() are called, then the
 * flag is cleared and local interrupts are enabled again.
 */
asmlinkage __visible void __init start_kernel(void)
{
    char *command_line;
    char *after_dashes;

    ......

    local_irq_disable(); // Line 12
    /* Record that interrupts are off during early boot. */
    early_boot_irqs_disabled = true;

    /*
     * Interrupts are still disabled. Do necessary setups, then
     * enable them.
     */

    ......

    setup_arch(&command_line);  // Line 23

    ......

    boot_init_stack_canary();  // Line 123

    ......

    /* Early-boot window is over: clear the flag before re-enabling IRQs. */
    early_boot_irqs_disabled = false;
    local_irq_enable();   // Line 133

    ......

}

为中断栈设置Stack Canary

start_kernel()line 123调用了boot_init_stack_canary()来设置canary值来缓解中断栈溢出。

此函数在/source/arch/x86/include/asm/stackprotector.h#L61处实现

/* SPDX-License-Identifier: GPL-2.0 */
/*
 * GCC stack protector support.
 *
 * Stack protector works by putting predefined pattern at the start of
 * the stack frame and verifying that it hasn't been overwritten when
 * returning from the function.  The pattern is called stack canary
 * and unfortunately gcc requires it to be at a fixed offset from %gs.
 * On x86_64, the offset is 40 bytes and on x86_32 20 bytes.  x86_64
 * and x86_32 use segment registers differently and thus handles this
 * requirement differently.
 *
 * On x86_64, %gs is shared by percpu area and stack canary.  All
 * percpu symbols are zero based and %gs points to the base of percpu
 * area.  The first occupant of the percpu area is always
 * fixed_percpu_data which contains stack_canary at offset 40.  Userland
 * %gs is always saved and restored on kernel entry and exit using
 * swapgs, so stack protector doesn't add any complexity there.
 *
 * On x86_32, it's slightly more complicated.  As in x86_64, %gs is
 * used for userland TLS.  Unfortunately, some processors are much
 * slower at loading segment registers with different value when
 * entering and leaving the kernel, so the kernel uses %fs for percpu
 * area and manages %gs lazily so that %gs is switched only when
 * necessary, usually during task switch.
 *
 * As gcc requires the stack canary at %gs:20, %gs can't be managed
 * lazily if stack protector is enabled, so the kernel saves and
 * restores userland %gs on kernel entry and exit.  This behavior is
 * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in
 * system.h to hide the details.
 */

#ifndef _ASM_STACKPROTECTOR_H
#define _ASM_STACKPROTECTOR_H 1

#ifdef CONFIG_STACKPROTECTOR

#include <asm/tsc.h>
#include <asm/processor.h>
#include <asm/percpu.h>
#include <asm/desc.h>

#include <linux/random.h>
#include <linux/sched.h>

/*
 * 24 byte read-only segment initializer for stack canary.  Linker
 * can't handle the address bit shifting.  Address will be set in
 * head_32 for boot CPU and setup_per_cpu_areas() for others.
 *
 * The last argument, 0x18, is 24 in decimal, matching the size stated
 * above. GDT_ENTRY_INIT is defined outside this excerpt; its arguments
 * are presumably (flags, base, limit) - TODO confirm against asm/desc.h.
 * The expansion ends with a comma so it can be dropped into a designated
 * initializer list.
 */
#define GDT_STACK_CANARY_INIT                      \
    [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18),

/*
 * Initialize the stackprotector canary value.
 *
 * Builds a canary from random bytes mixed with the TSC, stores it in the
 * current task and in the per-CPU location for the architecture.
 *
 * NOTE: this must only be called from functions that never return,
 * and it must always be inlined.
 */
static __always_inline void boot_init_stack_canary(void)
{
    u64 canary;
    u64 tsc;

/*
 * When CONFIG_X86_64 is set, assert at build time that stack_canary sits
 * at byte offset 40 inside struct fixed_percpu_data. The header comment of
 * this file states that gcc requires the canary at that fixed offset
 * from %gs on x86_64.
 */
#ifdef CONFIG_X86_64
    BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
#endif
    /*
     * We both use the random pool and the current TSC as a source
     * of randomness. The TSC only matters for very early init,
     * there it already has some randomness on most systems. Later
     * on during the bootup the random pool has true entropy too.
     * The new canary is computed from the random bytes plus the
     * timestamp counter value.
     */
    get_random_bytes(&canary, sizeof(canary));
    tsc = rdtsc();
    /* Fold the TSC into both the low and the high 32 bits. */
    canary += tsc + (tsc << 32UL);
    /* CANARY_MASK is defined outside this excerpt. */
    canary &= CANARY_MASK;

    /* Store the canary in the current task. */
    current->stack_canary = canary;
#ifdef CONFIG_X86_64
    /* Write the canary into fixed_percpu_data with this_cpu_write(). */
    this_cpu_write(fixed_percpu_data.stack_canary, canary);
#else
    /* Other configurations keep it in the per-CPU stack_canary variable. */
    this_cpu_write(stack_canary.canary, canary);
#endif
}
......
#else  /* STACKPROTECTOR */
......
#endif /* _ASM_STACKPROTECTOR_H */

它的实现取决于 CONFIG_STACKPROTECTOR 这个内核配置选项。如果该选项没有置位,那该函数将是一个空函数。

禁用/启用本地中断

start_kernel()line 12调用了local_irq_disable()来禁用本地中断。

start_kernel()line 133调用了local_irq_enable()来启用本地中断。

local_irq_enable()是一个宏定义,它定义在/source/include/linux/irqflags.h#L109

local_irq_disable()是一个宏定义,它定义在/source/include/linux/irqflags.h#L111

/*
 * The local_irq_*() APIs are equal to the raw_local_irq*()
 * if !TRACE_IRQFLAGS.
 *
 * With CONFIG_TRACE_IRQFLAGS the tracing hook is ordered so that it is
 * called on the "interrupts off" side of the transition: before
 * raw_local_irq_enable() when enabling, after raw_local_irq_disable()
 * when disabling.
 *
 * Lines of "......" mark definitions omitted from this excerpt.
 */
#ifdef CONFIG_TRACE_IRQFLAGS
/* Notify the tracer first, then enable interrupts. */
#define local_irq_enable() \
    do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0)
/* Disable interrupts first, then notify the tracer. */
#define local_irq_disable() \
    do { raw_local_irq_disable(); trace_hardirqs_off(); } while (0)

......

#else /* !CONFIG_TRACE_IRQFLAGS */

/* Without tracing these are plain wrappers around the raw variants. */
#define local_irq_enable() do { raw_local_irq_enable(); } while (0)
#define local_irq_disable()    do { raw_local_irq_disable(); } while (0)

......

#endif /* CONFIG_TRACE_IRQFLAGS */

CONFIG_TRACE_IRQFLAGS 选项置位时, local_irq_* 宏将同时调用 trace_hardirqs_* 函数。在Linux死锁检测模块lockdep中有一项功能 irq-flags tracing,它可以追踪 hardirq 和 softirq 的状态。在这种情况下, lockdep 死锁检测模块可以提供系统中关于硬/软中断的开/关事件的相关信息。

函数 trace_hardirqs_* 的定义位于/source/kernel/trace/trace_preemptirq.c#L22

/*
 * Tracing hook called by local_irq_enable() before interrupts are
 * switched on.
 *
 * If the per-CPU flag tracing_irq_cpu is set, it calls
 * trace_irq_enable_rcuidle() (skipped when in_nmi() is true), then
 * tracer_hardirqs_on(), and finally clears the flag. Regardless of the
 * flag, lockdep_hardirqs_on() is always called at the end.
 */
void trace_hardirqs_on(void)
{
    if (this_cpu_read(tracing_irq_cpu)) {
        if (!in_nmi())
            trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
        tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1);
        /* Clear the flag; trace_hardirqs_off() sets it again. */
        this_cpu_write(tracing_irq_cpu, 0);
    }

    lockdep_hardirqs_on(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_on);
NOKPROBE_SYMBOL(trace_hardirqs_on);

/*
 * Tracing hook called by local_irq_disable() after interrupts have been
 * switched off.
 *
 * If the per-CPU flag tracing_irq_cpu is not yet set, it sets the flag,
 * calls tracer_hardirqs_off() and then trace_irq_disable_rcuidle() (the
 * latter skipped when in_nmi() is true). Regardless of the flag,
 * lockdep_hardirqs_off() is always called at the end.
 */
void trace_hardirqs_off(void)
{
    if (!this_cpu_read(tracing_irq_cpu)) {
        /* Set the flag so a repeated "off" does not trace again. */
        this_cpu_write(tracing_irq_cpu, 1);
        tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
        if (!in_nmi())
            trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
    }

    lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);

可见它们在处理完 tracer 相关的逻辑之后,最终都会调用对应的 lockdep_hardirqs_* 函数。以 lockdep_hardirqs_off 为例,该函数检查了当前进程的 hardirqs_enabled 域,如果本次 local_irq_disable 调用是冗余的话,便使 redundant_hardirqs_off 域的值增长,否则便使 hardirqs_off_events 域的值增加。这两个域以及其它与死锁检测模块 lockdep 统计相关的域定义在/source/kernel/locking/lockdep_internals.h#L168处的 lockdep_stats 结构体中:

/*
 * Various lockdep statistics.
 * We want them per cpu as they are often accessed in fast path
 * and we want to avoid too much cache bouncing.
 */
struct lockdep_stats {
    unsigned long  chain_lookup_hits;
    unsigned int   chain_lookup_misses;
    /*
     * Hard-IRQ on/off event counters, plus counters for on/off calls
     * that lockdep treated as redundant.
     */
    unsigned long  hardirqs_on_events;
    unsigned long  hardirqs_off_events;
    unsigned long  redundant_hardirqs_on;
    unsigned long  redundant_hardirqs_off;
    /* Soft-IRQ counterparts, going by the field names. */
    unsigned long  softirqs_on_events;
    unsigned long  softirqs_off_events;
    unsigned long  redundant_softirqs_on;
    unsigned long  redundant_softirqs_off;
    int            nr_unused_locks;
    unsigned int   nr_redundant_checks;
    unsigned int   nr_redundant;
    unsigned int   nr_cyclic_checks;
    unsigned int   nr_find_usage_forwards_checks;
    unsigned int   nr_find_usage_backwards_checks;

    /*
     * Per lock class locking operation stat counts
     * (one slot per possible key, sized by MAX_LOCKDEP_KEYS).
     */
    unsigned long lock_class_ops[MAX_LOCKDEP_KEYS];
};

如果开启了 CONFIG_DEBUG_LOCKDEP 内核配置选项,lockdep_stats_debug_show 函数会将这些调试统计信息写入 /proc/lockdep_stats 文件中。

接下来来分析 raw_local_irq_disable ,这个宏定义在/source/include/linux/irqflags.h#L79处实现,其展开后的样子是:

/*
 * Wrap the arch provided IRQ routines to provide appropriate checks.
 *
 * As shown here, each raw_local_irq_*() macro expands directly to the
 * matching arch_local_irq_*() routine.
 */
#define raw_local_irq_disable()        arch_local_irq_disable()
#define raw_local_irq_enable()     arch_local_irq_enable()

// In /source/arch/x86/include/asm/irqflags.h#L87

/* x86 implementation of arch_local_irq_disable(): calls native_irq_disable(). */
static inline notrace void arch_local_irq_disable(void)
{
    native_irq_disable();
}

/* x86 implementation of arch_local_irq_enable(): calls native_irq_enable(). */
static inline notrace void arch_local_irq_enable(void)
{
    native_irq_enable();
}

// In /source/arch/x86/include/asm/irqflags.h#L47

/*
 * Execute the x86 "cli" instruction.
 *
 * The asm has no operands; the "memory" clobber tells the compiler that
 * the statement may affect memory, so memory accesses are not cached in
 * registers or reordered across it.
 */
static inline void native_irq_disable(void)
{
    asm volatile("cli": : :"memory");
}

/*
 * Execute the x86 "sti" instruction.
 *
 * The asm has no operands; the "memory" clobber tells the compiler that
 * the statement may affect memory, so memory accesses are not cached in
 * registers or reordered across it.
 */
static inline void native_irq_enable(void)
{
    asm volatile("sti": : :"memory");
}

cli/sti 指令将清除/设置 IF 标志位,这个标志位控制着处理器是否响应可屏蔽的外部中断(异常和不可屏蔽中断 NMI 不受该标志位影响)。

早期版本的内核中提供了一个叫做 cli 的函数来禁用所有处理器的中断,该函数已经被移除,替代它的是 local_irq_{enable,disable} 宏,用于启用或禁用当前处理器的中断。我们在调用 local_irq_disable 宏禁用中断以后,接着设置了变量值:

early_boot_irqs_disabled = true;

变量 early_boot_irqs_disabled 声明在文件/source/include/linux/kernel.h中:

extern bool early_boot_irqs_disabled;

并在另外的地方使用。例如在/source/kernel/smp.c中的 smp_call_function_many 函数中,通过这个变量来检查当前是否由于中断禁用而处于死锁状态:

WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
                     && !oops_in_progress && !early_boot_irqs_disabled);
分类: CTF

0 条评论

发表评论

电子邮件地址不会被公开。 必填项已用*标注