分析开源项目DPT抽取壳源码实现 - 学习

# 分析开源项目 DPT 抽取壳源码实现

为了自实现抽取壳，对开源项目 dpt 进行分析

开始先创建一个代理类

package com.luoyesiqiu.shell;

import android.app.Application;
import android.content.Context;
import android.content.pm.ApplicationInfo;
import android.content.pm.PackageManager;
import android.text.TextUtils;
import android.util.Log;

import com.luoyesiqiu.shell.util.FileUtils;

/**
 * Shell entry point that stands in for the protected app's real {@link Application}.
 *
 * In {@link #attachBaseContext(Context)} it unpacks and loads the shell's native
 * libraries, lets the native side initialise, patches the class loader and reads
 * the real Application class name. In {@link #onCreate()} (or earlier, through
 * {@link #createPackageContext(String, int)}) it hands control to the real Application.
 *
 * Created by luoyesiqiu
 */
public class ProxyApplication extends Application {
    private static final String TAG = ProxyApplication.class.getSimpleName();
    /** Class name of the real Application, as reported by the native side. */
    private String realApplicationName = "";
    /** Instance returned by the native replace call; null until the swap happened. */
    private Application realApplication = null;

    /**
     * Swaps in the real Application and drives its lifecycle callbacks.
     * Runs only while {@code Global.sNeedCalledApplication} is set and a real
     * Application name is known; the flag is cleared afterwards.
     */
    private void replaceApplication() {
        if (!Global.sNeedCalledApplication || TextUtils.isEmpty(realApplicationName)) {
            return;
        }

        // ra = replaceApplication: replaces the application object.
        realApplication = (Application) JniBridge.ra(realApplicationName);
        Log.d(TAG, "applicationExchange: " + realApplicationName + ", realApplication: " + realApplication.getClass().getName());

        // craa = callRealApplicationAttach: invokes attachBaseContext on the real application.
        JniBridge.craa(getApplicationContext(), realApplicationName);
        // craoc = callRealApplicationOnCreate: invokes onCreate on the real application.
        JniBridge.craoc(realApplicationName);

        Global.sNeedCalledApplication = false;
    }

    @Override
    public void onCreate() {
        super.onCreate();
        Log.d(TAG, "dpt onCreate");
        replaceApplication();
    }

    /**
     * When a real Application name is known, performs the swap (if still pending)
     * and returns the real Application instead of a package context.
     */
    @Override
    public Context createPackageContext(String packageName, int flags) throws PackageManager.NameNotFoundException {
        Log.d(TAG, "createPackageContext: " + realApplicationName);
        if (TextUtils.isEmpty(realApplicationName)) {
            return super.createPackageContext(packageName, flags);
        }
        replaceApplication();
        return realApplication;
    }

    /**
     * Returns an empty package name while a real Application name is known,
     * otherwise defers to the framework implementation.
     */
    @Override
    public String getPackageName() {
        return TextUtils.isEmpty(realApplicationName) ? super.getPackageName() : "";
    }

    @Override
    protected void attachBaseContext(Context base) {
        super.attachBaseContext(base);
        Log.d(TAG,"dpt attachBaseContext classloader = " + base.getClassLoader());

        // One-time shell setup, skipped once the class loader has been patched.
        if (!Global.sIsReplacedClassLoader) {
            prepareShell(base);
        }

        // rapn = readApplicationName: fetches the real application class name.
        realApplicationName = JniBridge.rapn();
    }

    /**
     * Unpacks and loads the shell libraries, initialises the native side and
     * patches the class loader of {@code base}.
     */
    private void prepareShell(Context base) {
        // ApplicationInfo carries the app's install metadata.
        ApplicationInfo info = base.getApplicationInfo();
        if (info == null) {
            throw new NullPointerException("application info is null");
        }

        // sourceDir: install path (usually the APK); dataDir: private data directory.
        // Unpack the libraries from the APK into the data directory.
        FileUtils.unzipLibs(info.sourceDir, info.dataDir);
        // Load the .so files found under the data directory.
        JniBridge.loadShellLibs(info.dataDir);
        Log.d(TAG,"ProxyApplication init");

        // ia = init_app: native initialisation (loads the extracted code items).
        JniBridge.ia();

        ClassLoader targetClassLoader = base.getClassLoader();
        // cbde: merges the shell's dex elements into the class loader's path list.
        JniBridge.cbde(targetClassLoader);
        Global.sIsReplacedClassLoader = true;
    }

}

先从 attachBaseContext 方法看起

在 Global.java 中定义了全局变量来控制事件，这个方法做了以下几件事：

1. 解压并加载程序 so 文件

FileUtils.unzipLibs(applicationInfo.sourceDir, applicationInfo.dataDir);
JniBridge.loadShellLibs(applicationInfo.dataDir);

2. 将原始的 code 指令保存在自定义 Map 中，这里在 native 方法中实现

JniBridge.ia();

3. 替换 pathList 的 Elements，在 native 方法中实现

JniBridge.cbde(targetClassLoader);

4. 获取真实的 application 类名，准备在 onCreate 方法中替换，同样是在 native 方法中实现

realApplicationName = JniBridge.rapn();

接下来是 onCreate 方法，对 application 做了替换，并调用真实 application 的 attachBaseContext 方法和 onCreate 方法，具体实现在 native 方法中

接下来看具体的 native 实现

# init_app

// Cached (pointer, size) of the code-item entry read from the package zip.
std::optional<std::tuple<uint8_t *,size_t>> g_codeItemFileData;

/**
 * Native initialisation: loads the extracted code items and the dex files.
 *
 * Maps the app package, reads the code-item entry from it (only if it has not
 * been cached in g_codeItemFileData yet), parses the entry via readCodeItem(),
 * then extracts the dex files under g_write_dexes_mutex and releases the
 * package mapping.
 *
 * If the code-item entry cannot be read the function logs an error, releases
 * the package and returns instead of dereferencing an empty optional.
 *
 * DPT_ENCRYPT marks the function for obfuscation by the shell tooling.
 */
DPT_ENCRYPT void init_app(JNIEnv *env, jclass __unused) {
    DLOGD("called!");
    clock_t start = clock();

    void *package_addr = nullptr;
    size_t package_size = 0;
    // Obtain the address and size of this app's package.
    load_package(env, &package_addr, &package_size);

    // Fetch the extracted code items unless they are already cached.
    if(!g_codeItemFileData.has_value()) {
        // AY_OBFUSCATE is a compile-time string obfuscation macro; it hides the
        // name of the zip entry that stores the code items.
        auto entry_data = read_zip_file_entry(package_addr, package_size, AY_OBFUSCATE(CODE_ITEM_NAME_IN_ZIP));
        if(entry_data.has_value()) {
            g_codeItemFileData = std::move(entry_data);
        }
        printTime("read codeitem data took =" , start);

    }
    else {
        DLOGD("no need read codeitem from zip");
    }

    // The read above may have failed; value() on an empty optional would throw
    // through the JNI boundary and leave the package mapped.
    if(!g_codeItemFileData.has_value()) {
        DLOGE("cannot read codeitem data from package");
        unload_package(package_addr, package_size);
        return;
    }

    // Parse and store the code items.
    auto [entry_data, entry_size] = g_codeItemFileData.value();
    readCodeItem((uint8_t *)entry_data, entry_size);

    pthread_mutex_lock(&g_write_dexes_mutex);
    extractDexesInNeeded(env, package_addr, package_size);
    pthread_mutex_unlock(&g_write_dexes_mutex);

    unload_package(package_addr, package_size);
    printTime("read package data took =" , start);
}

在 apk 中拿到自己的 code 指令后，将其解析并保存，主要在 readCodeItem 方法中实现

// dex index -> table of code items, indexed by method index.
std::unordered_map<int,std::vector<data::CodeItem *> *> dexMap;

/**
 * Parses the code-item blob and fills dexMap.
 *
 * Layout as read here: version (2 bytes), dex count (2 bytes), one 32-bit
 * offset per dex, then per dex a 2-byte method count followed by that many
 * code items.
 *
 * @param data      start of the blob; ignored when null
 * @param data_len  blob size in bytes; blobs shorter than the 4-byte header
 *                  are ignored
 */
DPT_ENCRYPT void readCodeItem(uint8_t *data,size_t data_len) {
    // size_t is unsigned, so a ">= 0" test can never reject anything; require
    // at least the fixed header before reading from the buffer.
    constexpr size_t kHeaderSize = 4;
    if (data == nullptr || data_len < kHeaderSize) {
        return;
    }

    data::MultiDexCode *dexCode = data::MultiDexCode::getInst();

    // Point the parser at the blob.
    dexCode->init(data, data_len);
    DLOGI("version = %d, dexCount = %d", dexCode->readVersion(),
          dexCode->readDexCount());
    // indexCount receives the number of dex files.
    int indexCount = 0;
    // dexCodeIndex holds the start offset of each dex's data.
    uint32_t *dexCodeIndex = dexCode->readDexCodeIndex(&indexCount);
    dexMap.reserve(indexCount);
    for (int i = 0; i < indexCount; i++) {
        // emplace() never replaces an existing key, so re-parsing a dex that is
        // already present would only leak the new table and its code items.
        if (dexMap.find(i) != dexMap.end()) {
            continue;
        }

        DLOGI("dexCodeIndex[%d] = %d", i, *(dexCodeIndex + i));
        // Offset at which this dex's data begins.
        uint32_t dexCodeOffset = *(dexCodeIndex + i);
        // The first two bytes hold the number of extracted methods.
        uint16_t methodCount = dexCode->readUInt16(dexCodeOffset);

        DLOGD("dexCodeOffset[%d] = %d, methodCount[%d] = %d", i, dexCodeOffset, i,
              methodCount);
        // One slot per possible 16-bit method index.
        auto codeItemVec = new std::vector<data::CodeItem *>(65536);
        // Skip the method count; codeItemIndex is the running code-item offset.
        uint32_t codeItemIndex = dexCodeOffset + 2;
        for (int k = 0; k < methodCount; k++) {
            data::CodeItem *codeItem = dexCode->nextCodeItem(&codeItemIndex);
            uint32_t methodIdx = codeItem->getMethodIdx();
            codeItemVec->at(methodIdx) = codeItem;
        }
        dexMap.emplace(i, codeItemVec);

    }
    DLOGD("map size = %lu", (unsigned long)dexMap.size());
}

单看这个方法可能有点不明白，实际应该结合作者自定义的一个结构来分析

//
// Created by luoyesiqiu
//

#include "MultiDexCode.h"

// Returns the shared MultiDexCode instance, allocating it on the first call.
dpt::data::MultiDexCode* dpt::data::MultiDexCode::getInst(){
    static MultiDexCode* instance = new MultiDexCode();
    return instance;
}

// 初始化
void dpt::data::MultiDexCode::init(uint8_t* buffer, size_t size){
    this->m_buffer = buffer;
    this->m_size = size;
}

// Reads the 2-byte version field stored at offset 0.
uint16_t dpt::data::MultiDexCode::readVersion(){
    const uint32_t versionOffset = 0;
    return readUInt16(versionOffset);
}

// Reads the 2-byte dex count stored at offset 2.
uint16_t dpt::data::MultiDexCode::readDexCount(){
    const uint32_t dexCountOffset = 2;
    return readUInt16(dexCountOffset);
}

// 将count赋值为dex数量 返回数据 + 4偏移处的一个四字节类型的指针数组
uint32_t* dpt::data::MultiDexCode::readDexCodeIndex(int* count){
    uint16_t dexCount = readDexCount();
    *count = dexCount;
    return (uint32_t*)(m_buffer + 4);
}

// Reads the code item that starts at *offset and advances *offset past it.
// Entry layout: method index (4 bytes) + instruction length (4 bytes) +
// the original instructions.
// The returned CodeItem is allocated with new; its instruction pointer refers
// into the parser's buffer.
dpt::data::CodeItem* dpt::data::MultiDexCode::nextCodeItem(uint32_t* offset) {
    const uint32_t entryStart = *offset;

    // 8-byte entry header: method index, then instruction length.
    const uint32_t methodIdx = readUInt32(entryStart);
    const uint32_t insnsSize = readUInt32(entryStart + 4);

    // The instructions follow the header directly.
    auto* insns = (uint8_t*)(m_buffer + entryStart + 8);

    // Next entry = this entry's start + header + instruction bytes.
    *offset = (entryStart + 8 + insnsSize);

    return new CodeItem(methodIdx, insnsSize, insns);
}

// Buffer read helpers.
// Reads one byte at `offset`.
uint8_t dpt::data::MultiDexCode::readUInt8(uint32_t offset){
    uint8_t value = 0;
    memcpy(&value, &m_buffer[offset], sizeof(value));
    return value;
}

// Reads two bytes at `offset` in host byte order (memcpy copes with unaligned offsets).
uint16_t dpt::data::MultiDexCode::readUInt16(uint32_t offset){
    uint16_t value = 0;
    memcpy(&value, &m_buffer[offset], sizeof(value));
    return value;
}

// Reads four bytes at `offset` in host byte order (memcpy copes with unaligned offsets).
uint32_t dpt::data::MultiDexCode::readUInt32(uint32_t offset){
    uint32_t value = 0;
    memcpy(&value, &m_buffer[offset], sizeof(value));
    return value;
}

根据 readCodeItem 方法中的 log

DLOGI("version = %d, dexCount = %d", dexCode->readVersion(),
      dexCode->readDexCount());

得知原始文件头为版本号 (2 字节) + dex 数量 (2 字节)

接下来是一个按 dex 数量进行的循环

for (int i = 0; i < indexCount; i++) {
            DLOGI("dexCodeIndex[%d] = %d", i, *(dexCodeIndex + i));
            // dexCodeIndex最开始为数据 + 4偏移处的地址 即跳过文件头的地址 后面紧接着是每个dex的位置偏移
    	   // dexCodeOffset为当前dex数据的起始地址
            uint32_t dexCodeOffset = *(dexCodeIndex + i);
            // 数据前两个字节为抽空方法的数量
            uint16_t methodCount = dexCode->readUInt16(dexCodeOffset);

            DLOGD("dexCodeOffset[%d] = %d, methodCount[%d] = %d", i, dexCodeOffset, i,
                  methodCount);
            // 一个dex的方法索引范围为0~65535（最多65536个方法） 所以这里直接创建了大小为65536的vector
            auto codeItemVec = new std::vector<data::CodeItem *>(65536);
            // 跳过方法数量 codeItemIndex为codeItem的偏移
            uint32_t codeItemIndex = dexCodeOffset + 2;
            for (int k = 0; k < methodCount; k++) {
                // 将当前codeItem数据保存至自定义类
                data::CodeItem *codeItem = dexCode->nextCodeItem(&codeItemIndex);
                // 获取方法id
                uint32_t methodIdx = codeItem->getMethodIdx();
                // 将codeItem保存至codeItemVec
                codeItemVec->at(methodIdx) = codeItem;
            }
    	    // 第i个dex替换成此dex的 codeItemVec保存到dexMap中
            dexMap.emplace(i, codeItemVec);

文件头之后是每个 dex 在文件中的偏移，偏移之后是 dexcode

根据代码可以得到每个 dexCode 的结构：

抽空方法数（2 字节）+ codeItem

现在 init_app 函数功能就分析完了：读取 apk 中抽出的指令，并将其保存到 dexMap 中

# combineDexElements

/**
 * Appends the dex elements built from `pathChs` to the dexElements array of
 * `targetClassLoader`'s DexPathList, so classes in the shell's dex files
 * become resolvable through that class loader.
 *
 * The existing elements keep their positions; the extra elements follow them.
 * If any array cannot be obtained or created, an error is logged and the
 * class loader is left untouched.
 *
 * @param env                JNI environment of the calling thread
 * @param targetClassLoader  class loader whose path list is extended
 * @param pathChs            path handed to makePathElements()
 */
DPT_ENCRYPT void combineDexElement(JNIEnv* env, jclass __unused, jobject targetClassLoader, const char* pathChs) {
    // Elements for the dex files to inject.
    jobjectArray extraDexElements = makePathElements(env,pathChs);
    if(extraDexElements == nullptr) {
        DLOGE("cannot make path elements");
        return;
    }

    // Fetch dexElements from the DexPathList of the target class loader.
    dalvik_system_BaseDexClassLoader targetBaseDexClassLoader(env,targetClassLoader);

    jobject originDexPathListObj = targetBaseDexClassLoader.getPathList();

    dalvik_system_DexPathList targetDexPathList(env,originDexPathListObj);

    jobjectArray originDexElements = targetDexPathList.getDexElements();
    if(originDexElements == nullptr) {
        DLOGE("cannot get origin dexElements");
        return;
    }

    jsize extraSize = env->GetArrayLength(extraDexElements);
    jsize originSize = env->GetArrayLength(originDexElements);

    dalvik_system_DexPathList::Element element(env, nullptr);
    jclass ElementClass = element.getClass();
    // Array large enough for both element lists.
    jobjectArray  newDexElements = env->NewObjectArray(originSize + extraSize,ElementClass, nullptr);
    if(newDexElements == nullptr) {
        DLOGE("cannot create new dexElements array");
        return;
    }

    // Copy the original elements first, then the injected ones. Each fetched
    // element is a JNI local reference; release it once stored so the local
    // reference table does not grow with the array size.
    for(jsize i = 0;i < originSize;i++) {
        jobject elementObj = env->GetObjectArrayElement(originDexElements, i);
        env->SetObjectArrayElement(newDexElements,i,elementObj);
        env->DeleteLocalRef(elementObj);
    }

    for(jsize i = 0;i < extraSize;i++) {
        jobject elementObj = env->GetObjectArrayElement(extraDexElements, i);
        env->SetObjectArrayElement(newDexElements,originSize + i,elementObj);
        env->DeleteLocalRef(elementObj);
    }

    // Install the merged array back into the path list.
    targetDexPathList.setDexElements(newDexElements);

    DLOGD("success");
}

/**
 * JNI entry: resolves the path of the shell's dex zip and merges its elements
 * into `targetClassLoader` via combineDexElement().
 */
DPT_ENCRYPT void combineDexElements(JNIEnv* env, jclass klass, jobject targetClassLoader) {
    // Path of the zip holding the dex files to inject.
    char dexZipPath[256] = {0};
    getCompressedDexesPath(env, dexZipPath, ARRAY_LENGTH(dexZipPath));

    combineDexElement(env, klass, targetClassLoader, dexZipPath);

// Junk code used for obfuscation; only present in non-DEBUG builds.
#ifndef DEBUG
    junkCodeDexProtect(env);
#endif
    DLOGD("success");
}

//dpt_util.cpp
// Writes "<dataDir>/<CACHE_DIR>/<DEXES_ZIP_NAME>" into outDexZipPath, truncated
// by snprintf to at most max_len bytes including the terminator.
void getCompressedDexesPath(JNIEnv *env, char *outDexZipPath, size_t max_len) {
    // Private data directory of the app.
    const std::string appDataDir = getDataDir(env);
    const char *baseDir = appDataDir.c_str();
    snprintf(outDexZipPath, max_len, "%s/%s/%s", baseDir, CACHE_DIR, DEXES_ZIP_NAME);
}

这个是壳里非常经典的操作了，PathList 字段里存在两个成员变量，其中 dexElements 用来存放 dex 和资源列表，所以要把原始 dex 放到当前 dexElements 中，不然 classLoader 找不到这个 dex

被抽空的指令已经保存至 map 中，那么什么时候还原呢

// dpt_macro.h
// Expands to the GCC/Clang `constructor` function attribute: a function marked
// with it is run automatically when the library is loaded.
#define INIT_ARRAY_SECTION __attribute__ ((constructor))
// dpt.h
// Shell initialiser, declared as a load-time constructor.
INIT_ARRAY_SECTION void init_dpt();

在 so 文件被加载时，就会调用 init_dpt 这个函数

init_dpt

// Shell initialiser: optionally decrypts the bitcode (DECRYPT_BITCODE builds),
// installs the hooks and then calls createAntiRiskProcess().
void init_dpt() {
#ifdef DECRYPT_BITCODE
    decrypt_bitcode();
#endif
    DLOGI("call!");

    // Install the hooks that restore the extracted instructions.
    dpt_hook();
    // Detection step (presumably anti-tamper checks in a separate process — confirm in createAntiRiskProcess).
    createAntiRiskProcess();
}

其中 dpt_hook 为还原指令过程

# dpt_hook

// Installs every hook the shell relies on. The class-loading hook prefers
// DefineClass and falls back to LoadClass when that cannot be hooked.
void dpt_hook() {
    // bytehook provides the PLT hooks used below.
    bytehook_init(BYTEHOOK_MODE_AUTOMATIC,false);

    // Remember the API level of the running system.
    g_sdkLevel = android_get_device_api_level();

    // Block AOT pre-compilation.
    hook_execve();

    // Make dex files writable while they are being loaded.
    hook_mmap();
    hook_write();

    // Intercept class loading: DefineClass first, LoadClass as the fallback.
    if(!hook_DefineClass()) {
        hook_LoadClass();
    }
}

先看是如何 hook defineClass 的

/**
 * Inline-hooks ClassLinker::DefineClass with Dobby.
 *
 * @return true when the hook was installed; false when the symbol cannot be
 *         found, cannot be resolved to an address, or DobbyHook fails.
 */
DPT_ENCRYPT bool hook_DefineClass() {
    // Look up the mangled symbol of ClassLinker::DefineClass in the library
    // reported by GetClassLinkerDefineClassLibPath().
    char symbolName[256] = {0};
    find_symbol_in_elf_file(GetClassLinkerDefineClassLibPath(), symbolName, ARRAY_LENGTH(symbolName), 2, "ClassLinker", "DefineClass");
    if(symbolName[0] == '\0') {
        DLOGW("cannot find symbol: DefineClass");
        return false;
    }

    // Resolve the symbol to its address in memory.
    void* targetAddress = DobbySymbolResolver(GetClassLinkerDefineClassLibPath(), symbolName);
    if(targetAddress == nullptr) {
        DLOGE("defineClass address is null, sym: %s", symbolName);
        return false;
    }

    // The replacement depends on the API level: V22 from L MR1 on, V21 below.
    const bool useV22 = g_sdkLevel >= __ANDROID_API_L_MR1__;
    const int hookResult = useV22
            ? DobbyHook(targetAddress, (dobby_dummy_func_t) DefineClassV22, (dobby_dummy_func_t *) &g_originDefineClassV22)
            : DobbyHook(targetAddress, (dobby_dummy_func_t) DefineClassV21, (dobby_dummy_func_t *) &g_originDefineClassV21);

    if(hookResult != 0) {
        DLOGE("hook fail!");
        return false;
    }

    DLOGD("hook success.");
    return true;
}

// Replacement for ClassLinker::DefineClass (API 22+ signature): restores the
// class's instructions, then forwards to the original function.
// Returns nullptr when the original function pointer has not been set.
DPT_ENCRYPT void *DefineClassV22(void* thiz,void* self,
                 const char* descriptor,
                 size_t hash,
                 void* class_loader,
                 const void* dex_file,
                 const void* dex_class_def) {

    if(UNLIKELY(g_originDefineClassV22 == nullptr)) {
        return nullptr;
    }

    // Fill the instructions back in before the class is defined.
    patchClass(descriptor,dex_file,dex_class_def);

    // Continue with the system's own DefineClass.
    return g_originDefineClassV22( thiz,self,descriptor,hash,class_loader, dex_file, dex_class_def);
}

可以看到，在调用原始的 DefineClass 之前先将方法指令回填。

为什么是选择 defineClass 来 hook 呢，作者是这样说的：

在 Hook DefineClass 函数之前，我们需要了解 DefineClass 函数流程。为什么是 DefineClass 函数，其他函数是否可行？

当一个类被加载的时候，它的调用顺序是这样的 (部分流程已省略)：

ClassLoader.java::loadClass

DexFile.java::defineClass

class_linker.cc::DefineClass

class_linker.cc::LoadClass

class_linker.cc::LoadClassMembers

class_linker.cc::LoadMethod

也就是说，当一个类被加载，它是会去调用 DefineClass 函数的，我们看一下它的函数原型：
mirror::Class* ClassLinker::DefineClass(Thread* self,
                                        const char* descriptor,
                                        size_t hash,
                                        Handle<mirror::ClassLoader> class_loader,
                                        const DexFile& dex_file,
                                        const DexFile::ClassDef& dex_class_def);
DefineClass 函数的参数很巧，有 DexFile 结构，还有 ClassDef 结构，我们通过 Hook 这个函数就知道以下信息：

加载的类来自哪个 dex 文件

加载类的数据的偏移

第一条可以帮助我们大致定位到存储的 CodeItem 的位置；第二条可以帮助我们找到 CodeItem 具体存储的位置以及填充到的位置。

来看一下 ClassDef 的定义：
// One class definition entry of a dex file; eight 32-bit fields in file order.
struct ClassDef {
    uint32_t class_idx_;  // index into type_ids_ array for this class
    uint32_t access_flags_;
    uint32_t superclass_idx_;  // index into type_ids_ array for superclass
    uint32_t interfaces_off_;  // file offset to TypeList
    uint32_t source_file_idx_;  // index into string_ids_ for source file name
    uint32_t annotations_off_;  // file offset to annotations_directory_item
    uint32_t class_data_off_;  // file offset to class_data_item
    uint32_t static_values_off_;  // file offset to EncodedArray
};
其中最重要的字段就是 class_data_off_ 它的值是当前加载的类的具体数据在 dex 文件中的偏移，通过这个字段就可以顺藤摸瓜定位到当前加载类的所有函数的在内存中 CodeItem 的具体位置。

ClassDef 这个结构还有一个特点，它是 dex 文件的结构，也就是说 dex 文件格式不变，它一般就不会变。

还有，DefineClass 函数的参数会改变吗？目前来看从 Android M 到现在没有变过。

所以使用它不用太担心随着 Android 版本的升级而导致字段偏移的变化，也就是兼容性较强。

这就是为什么用 DefineClass 作为 Hook 点。

总结一下就是 DefineClass 函数稳定且方便，在调用时还原也可以起到动态还原的效果

接下来具体分析是如何将指令回填的

# patchClass

/**
 * Restores the extracted instructions of every method of the class that is
 * about to be defined.
 *
 * Only classes whose dex location contains DEXES_ZIP_NAME are processed. The
 * class's class_data_item is walked (four ULEB128 counts, the field lists,
 * then the direct and virtual method lists) and patchMethod() is called for
 * each method.
 *
 * A descriptor that contains the junk class name and has a digit as its
 * second-to-last character is treated as an illegal call and ends in dpt_crash().
 *
 * @param descriptor     class descriptor, may be null
 * @param dex_file       runtime DexFile object, may be null
 * @param dex_class_def  ClassDef of the class inside that dex, may be null
 */
DPT_ENCRYPT void patchClass(__unused const char* descriptor,
                 const void* dex_file,
                 const void* dex_class_def) {

    // Junk-class check: loading a junk class suggests someone is walking the
    // classes to dump them.
    const char *junkClassName = AY_OBFUSCATE(JUNK_CLASS_FULL_NAME);
    if(descriptor != nullptr && UNLIKELY(dpt_strstr(descriptor, junkClassName) != nullptr)) {
        const size_t descriptorLength = dpt_strlen(descriptor);
        const char marker = descriptor[descriptorLength - 2];
        DLOGD("Attempt patch junk class %s ,char is '%c'",descriptor,marker);
        if(isdigit(marker)) {
            DLOGE("Find illegal call, desc: %s!", descriptor);
            dpt_crash();
            return;
        }
    }

    if(UNLIKELY(dex_file == nullptr)) {
        return;
    }

    // Pull location, base address and size out of the version-specific
    // DexFile layout.
    std::string location;      // dex path name
    uint8_t *begin = nullptr;  // base address of the dex in memory
    uint64_t dexSize = 0;      // dex size in bytes
    if(g_sdkLevel >= 35) {
        // API 35 (Android 15) and above
        auto* dexFileV35 = (V35::DexFile *)dex_file;
        location = dexFileV35->location_;
        begin = (uint8_t *)dexFileV35->begin_;
        dexSize = dexFileV35->header_->file_size_;
    }
    else if(g_sdkLevel >= __ANDROID_API_P__){
        // API 28 - 34 (Android 9 - 14)
        auto* dexFileV28 = (V28::DexFile *)dex_file;
        location = dexFileV28->location_;
        begin = (uint8_t *)dexFileV28->begin_;
        dexSize = dexFileV28->size_;
        if(dexSize == 0) {
            // Fall back to the size recorded in the dex header.
            dexSize = dexFileV28->header_->file_size_;
        }
    }
    else {
        // API 21 - 27 (Android 5 - 8)
        auto* dexFileV21 = (V21::DexFile *)dex_file;
        location = dexFileV21->location_;
        begin = (uint8_t *)dexFileV21->begin_;
        dexSize = dexFileV21->size_;
        if(dexSize == 0) {
            dexSize = dexFileV21->header_->file_size_;
        }
    }

    // Only dex files from the shell's zip are of interest, and a ClassDef is required.
    const bool fromShellZip = location.rfind(DEXES_ZIP_NAME) != std::string::npos;
    if(!fromShellZip || !dex_class_def) {
        return;
    }

    // The dex index is encoded in the location string.
    int dexIndex = parse_dex_number(location);

    auto* class_def = (dex::ClassDef *)dex_class_def;
    NLOG("class_desc = '%s', class_idx_ = 0x%x, class data off = 0x%x",descriptor,class_def->class_idx_,class_def->class_data_off_);

    if(UNLIKELY(class_def->class_data_off_ == 0)) {
        NLOG("class_def->class_data_off_ is zero");
        return;
    }

    // cursor counts the bytes of class_data consumed so far.
    size_t cursor = 0;
    auto *class_data = (uint8_t *) ((uint8_t *) begin + class_def->class_data_off_);

    // Header: four ULEB128 counts in this order — static fields, instance
    // fields, direct methods, virtual methods.
    uint64_t counts[4] = {0, 0, 0, 0};
    for(uint64_t &count : counts) {
        cursor += DexFileUtils::readUleb128(class_data + cursor, &count);
    }
    const uint64_t static_fields_size = counts[0];
    const uint64_t instance_fields_size = counts[1];
    const uint64_t direct_methods_size = counts[2];
    const uint64_t virtual_methods_size = counts[3];

    // Skip both field lists; only their encoded size matters here.
    cursor += DexFileUtils::getFieldsSize(class_data + cursor, static_fields_size);
    cursor += DexFileUtils::getFieldsSize(class_data + cursor, instance_fields_size);

    // Decode the direct methods, then the virtual methods.
    auto *directMethods = new dex::ClassDataMethod[direct_methods_size];
    cursor += DexFileUtils::readMethods(class_data + cursor, directMethods,
                                        direct_methods_size);

    auto *virtualMethods = new dex::ClassDataMethod[virtual_methods_size];
    cursor += DexFileUtils::readMethods(class_data + cursor, virtualMethods,
                                        virtual_methods_size);

    // Restore the instructions of every direct method ...
    for (uint64_t m = 0; m < direct_methods_size; m++) {
        const auto &method = directMethods[m];
        patchMethod(begin, location.c_str(), dexSize, dexIndex,
                    method.method_idx_delta_, method.code_off_);
    }

    // ... and of every virtual method.
    for (uint64_t m = 0; m < virtual_methods_size; m++) {
        const auto &method = virtualMethods[m];
        patchMethod(begin, location.c_str(), dexSize, dexIndex,
                    method.method_idx_delta_, method.code_off_);
    }

    delete[] directMethods;
    delete[] virtualMethods;
}

在 Dex 格式中，为了极致压缩体积，Google 使用了 ULEB128 (Unsigned Little Endian Base 128) 变长编码来存储数字，一个数字可能占 1 个字节，也可能占 5 个字节。所以在上面代码中，添加了 read 这个游标来确定占用字节。

dex class_data 格式如下，上面代码解析了 dex 格式，并找到对应方法的地址，遍历方法将指令回填

这段代码作者使用了很多自定义函数或结构，大多在 dex_file.h/dex_file_cpp 里。为了方便理解，对它们也进行分析

首先是这段对于 dexFile 的解析，因为我的真机是 Android13，所以用这个版本的代码做示例

else if(g_sdkLevel >= __ANDROID_API_P__){
            auto* dexFileV28 = (V28::DexFile *)dex_file;
            location = dexFileV28->location_;
            begin = (uint8_t *)dexFileV28->begin_;
            dexSize = dexFileV28->size_ == 0 ? dexFileV28->header_->file_size_ : dexFileV28->size_;
        }

当在 defineClass 拦截到这个操作时，会传过来一个 dexFile 的指针，因为没有直接解析它的函数，所以作者自定义了一个

//dex_file.h
// Hand-written view of the runtime's DexFile object. patchClass() casts the
// dex_file pointer it receives to this type, so the member order and types
// must not be changed.
namespace V28 {
        class DexFile {
        public:
            //vtable pointer
            // Placeholder occupying the slot of the virtual table pointer.
            void *_;

            // The base address of the memory mapping.
            const uint8_t *const begin_;

            // The size of the underlying memory allocation in bytes.
            const size_t size_;

            // The base address of the data section (same as Begin() for standard dex).
            const uint8_t *const data_begin_;

            // The size of the data section.
            const size_t data_size_;

            // Typically the dex file name when available, alternatively some identifying string.
            //
            // The ClassLinker will use this to match DexFiles the boot class
            // path to DexCache::GetLocation when loading from an image.
            const std::string location_;

            const uint32_t location_checksum_;

            // Points to the header section.
            const dex::Header *const header_;

            // Points to the base of the string identifier list.
            const dex::StringId *const string_ids_;

            // Points to the base of the type identifier list.
            const dex::TypeId *const type_ids_;

            // Points to the base of the field identifier list.
            const dex::FieldId *const field_ids_;

            // Points to the base of the method identifier list.
            const dex::MethodId *const method_ids_;

            // Points to the base of the prototype identifier list.
            const dex::ProtoId *const proto_ids_;

            // Points to the base of the class definition list.
            const dex::ClassDef *const class_defs_;

        };
    } //namespace V28

这就是 dexFile 在内存中的管理对象，当然还有在物理内存中的 dexFile 对象

下面是一些工具类函数

/**
 * Decodes one ULEB128-encoded number.
 *
 * Each byte contributes its low 7 bits, least significant group first; a set
 * high bit means another byte follows. At most 5 bytes are consumed.
 *
 * @param data  first byte of the encoded number
 * @param val   receives the decoded value
 * @return number of bytes consumed (1 to 5)
 */
size_t dpt::DexFileUtils::readUleb128(uint8_t const * const data, uint64_t * const val) {
    uint64_t result = 0; // decoded value
    size_t read = 0;     // bytes consumed
    for(int i = 0;i < 5;i++){
        const uint8_t b = *(data + i);
        // Widen before shifting: an 8-bit operand is promoted to int, and
        // shifting that by 28 on the fifth byte overflows and sign-extends
        // into the upper half of the 64-bit result.
        const uint64_t payload = static_cast<uint64_t>(b & 0x7f);
        result |= (payload << (i * 7));
        read++;
        // A clear high bit marks the last byte of the number.
        if((b & 0x80) != 0x80){
            break;
        }
    }
    *val = result;
    return read;
}

/**
 * Decodes `count` encoded fields (index difference + access flags, both
 * ULEB128) into `fields`.
 *
 * The stored indices are differences; they are summed so that each entry's
 * field_idx_delta_ holds the accumulated index.
 *
 * @return number of bytes consumed from `data`
 */
size_t dpt::DexFileUtils::readFields(uint8_t *data, dpt::dex::ClassDataField *fields, uint64_t count) {
    size_t consumed = 0;       // bytes read so far
    uint32_t runningIndex = 0; // sum of the index differences seen so far

    for (uint64_t n = 0; n < count; ++n) {
        uint64_t indexDiff = 0;
        consumed += readUleb128(data + consumed, &indexDiff);
        runningIndex += indexDiff;

        uint64_t flags = 0;
        consumed += readUleb128(data + consumed, &flags);

        auto &entry = fields[n];
        entry.field_idx_delta_ = runningIndex;
        entry.access_flags_ = flags;
    }

    return consumed;
}

/**
 * Measures how many bytes `count` encoded fields occupy without storing them,
 * so the caller can skip ahead to the method lists.
 *
 * @return number of bytes the fields occupy
 */
size_t dpt::DexFileUtils::getFieldsSize(uint8_t *data, uint64_t count) {
    size_t skipped = 0;
    uint64_t discarded = 0; // decoded values are not needed, only their length

    for (uint64_t remaining = count; remaining > 0; --remaining) {
        skipped += readUleb128(data + skipped, &discarded); // field index difference
        skipped += readUleb128(data + skipped, &discarded); // access flags
    }

    return skipped;
}

/**
 * Decodes `count` encoded methods (index difference, access flags and code
 * offset, all ULEB128) into `method`.
 *
 * The stored indices are differences; they are summed so that each entry's
 * method_idx_delta_ holds the accumulated index.
 *
 * @return number of bytes consumed from `data`
 */
size_t dpt::DexFileUtils::readMethods(uint8_t *data, dpt::dex::ClassDataMethod *method, uint64_t count){
    size_t consumed = 0;       // bytes read so far
    uint32_t runningIndex = 0; // sum of the index differences seen so far

    for (uint64_t n = 0; n < count; ++n) {
        uint64_t indexDiff = 0;
        consumed += readUleb128(data + consumed, &indexDiff);
        runningIndex += indexDiff;

        uint64_t flags = 0;
        consumed += readUleb128(data + consumed, &flags);

        uint64_t codeOffset = 0;
        consumed += readUleb128(data + consumed, &codeOffset);

        auto &entry = method[n];
        entry.method_idx_delta_ = runningIndex;
        entry.access_flags_ = flags;
        entry.code_off_ = codeOffset;
    }

    return consumed;
}

最后看一下 patchMethod 方法具体是如何回填指令的

# patchMethod

/**
 * Copies the saved instructions of one method back into the in-memory dex.
 *
 * @param begin      base address of the dex in memory
 * @param location   dex location, used for logging only
 * @param dexSize    dex size, forwarded to change_dex_protective()
 * @param dexIndex   key of the dex in dexMap / dexMemMap
 * @param methodIdx  method index used to look up the saved code item
 * @param codeOff    offset of the method's code item from `begin`; 0 means
 *                   there is nothing to patch
 */
void patchMethod(uint8_t *begin,
                             __unused const char *location,
                             uint32_t dexSize,
                             int dexIndex,
                             uint32_t methodIdx,
                             uint32_t codeOff) {

    // Saved code items of this dex.
    const auto dexEntry = dexMap.find(dexIndex);
    if (UNLIKELY(dexEntry == dexMap.end())) {
        DLOGW("cannot find dex: '%s' in dex map", location);
        return;
    }

    // A dex without an entry in dexMemMap has its protection changed first.
    if(UNLIKELY(dexMemMap.find(dexIndex) == dexMemMap.end())){
        change_dex_protective(begin, dexSize, dexIndex);
    }

    // Saved instructions of this method, if any were extracted.
    auto codeItem = dexEntry->second->at(methodIdx);
    if (UNLIKELY(codeItem == nullptr)) {
        NLOG("cannot find  methodId: %d in codeitem map, dex index: %d(%s)", methodIdx, dexIndex, location);
        return;
    }

    if(codeOff == 0) {
        NLOG("dex: %d methodIndex: %d no need patch!",dexIndex,methodIdx);
        return;
    }

    // Locate the method's code item in memory and its instruction array.
    auto *dexCodeItem = (dex::CodeItem *)(begin + codeOff);
    auto *realInsnsPtr = (uint8_t *)(dexCodeItem->insns_);

    NLOG("codeItem patch, methodIndex = %d, insnsSize = %d >>> %p(0x%x)",
         codeItem->getMethodIdx(),
         codeItem->getInsnsSize(),
         realInsnsPtr,
         (unsigned int)(realInsnsPtr - begin));

    // Write the original instructions back.
    memcpy(realInsnsPtr,codeItem->getInsns(),codeItem->getInsnsSize());
}

将之前存在 dexMap 中的指令取出，用 memcpy 拷贝回去，这样就完成最后方法指令回填了

在 dpt_hook 中，除了 hook DefineClass 实现指令回填，注意到还 hook 了几个函数

# hook_execve

// Hooks libc's execve as called from the ART library, routing it to fake_execve.
DPT_ENCRYPT void hook_execve(){
    const bytehook_stub_t stub = bytehook_hook_single(
            getArtLibName(),
            "libc.so",
            "execve",
            (void *) fake_execve,
            nullptr,
            nullptr);

    if (stub == nullptr) {
        DLOGE("execve hook fail!");
        return;
    }
    DLOGD("execve hook success!");
}

// execve replacement: refuses to start anything whose path contains "dex2oat"
// (fails with EACCES) and forwards every other call unchanged.
DPT_ENCRYPT int fake_execve(const char *pathname, char *const argv[], char *const envp[]) {
    BYTEHOOK_STACK_SCOPE();
    DLOGD("execve hooked: %s", pathname);

    const bool launchesDex2oat = strstr(pathname, "dex2oat") != nullptr;
    if (!launchesDex2oat) {
        return BYTEHOOK_CALL_PREV(fake_execve, pathname, argv, envp);
    }

    // Report "permission denied" so dex2oat does not run.
    DLOGD("execve blocked: %s", pathname);
    errno = EACCES;
    return -1;
}

hook execve 是为了防止系统将 dex 预编译

Dalvik 虚拟机主要依赖解释器，当 app 运行到一个方法，会实时去内存中读取 Dalvik 字节码，然后解释执行。

但是 ART 时代下，引入了 AOT 预编译机制，系统会通过 dex2oat 这个进程，将 dex 编译为本地机器码格式，虽然这样提升了效率，但是在抽取壳中，dex 是被我们抽空的，回填指令操作和 oat 文件没什么关系，预编译之后也是空方法。所以要禁用 dex2oat 预编译，让系统强制走解释执行

# hook_mmap

// Hooks libc's mmap as called from the ART library, routing it to fake_mmap.
DPT_ENCRYPT void hook_mmap(){
    const bytehook_stub_t stub = bytehook_hook_single(
            getArtLibName(),
            "libc.so",
            "mmap",
            (void*)fake_mmap,
            nullptr,
            nullptr);

    if(stub == nullptr){
        DLOGE("mmap hook fail!");
        return;
    }
    DLOGD("mmap hook success!");
}

/**
 * mmap replacement that adds PROT_WRITE to readable, non-writable mappings so
 * the mapped dex can be patched later.
 *
 * Mappings whose file path is recognised by checkWebViewInFilename() are
 * forwarded untouched. On API level 30 a mapping of "base.vdex" is forwarded
 * with its flags cleared to 0.
 */
DPT_ENCRYPT void* fake_mmap(void* __addr, size_t __size, int __prot, int __flags, int __fd, off_t __offset){
    BYTEHOOK_STACK_SCOPE();

    int prot = __prot;
    const int hasRead = (__prot & PROT_READ) == PROT_READ;
    const int hasWrite = (__prot & PROT_WRITE) == PROT_WRITE;

    // Resolve the file descriptor to a path.
    char fd_path[256] = {0};
    dpt_readlink(__fd,fd_path, ARRAY_LENGTH(fd_path));
    std::string fd_path_str = fd_path;

    if(checkWebViewInFilename(fd_path_str)) {
        // WebView files keep their requested protection.
        DLOGW("link path: %s, no need to change prot",fd_path);
    }
    else {
        // Readable but not writable: add write permission.
        if(hasRead && !hasWrite) {
            prot = prot | PROT_WRITE;
            DLOGD("append write flag fd = %d, size = %zu, prot = %d, flag = %d",__fd,__size, prot,__flags);
        }

        // API 30 (Android 11): keep base.vdex off its special path.
        if(g_sdkLevel == 30 && strstr(fd_path,"base.vdex") != nullptr){
            DLOGE("want to mmap base.vdex");
            __flags = 0;
        }
    }

    return BYTEHOOK_CALL_PREV(fake_mmap,__addr,  __size, prot,  __flags,  __fd,  __offset);
}

将物理文件映射至内存空间时都需要使用 mmap 函数，所以可以 hook mmap 函数更改 dex 权限

# 总结

简单来说，抽取壳其实就是抽取指令 -> 还原的过程。为了实现调用方法时才动态还原，需要 Hook DefineClass 等函数实现

Reverse