Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Virtual machine (instruction-level)

An LLVM pass that replaces arithmetic instructions with calls to a register-based VM. Instead of executing add, sub, mul, etc. directly, operands are stored into a global register file and a bytecode blob is created for each instruction. Then __vm_exec(bytecode_ptr) reads the bytecode [opcode, dst, src0, src1], executes the operation via the matching VM handler and writes the result to the destination register. This means that the inputs must be copied into the src0 and src1 registers before calling __vm_exec, and the result must be read back from the dst register after the call returns.

This is a simplified, instruction-level approach. Commercial tools usually virtualize entire functions or regions, use a single bytecode stream with a fetch-decode-execute (FDE) loop and hide control flow inside the VM. Here, we create separate bytecode blobs per instruction and keep branches and loops native.

Known limitations:

  • significantly increased code size
  • significant runtime overhead
  • control flow remains visible (not virtualized)
  • no bytecode encryption
  • the VM can be easily reversed

The source code is available here.

Generate the IR for our main() test code:

Note: we optimize the generated IR before applying our obfuscation pass.

$ clang test.c -O3 -fno-discard-value-names -S -emit-llvm -o test.ll

Check the output:

$ cat test.ll
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx15.0.0"

@.str = private unnamed_addr constant [12 x i8] c"Result: %d\0A\00", align 1

; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync)
define i32 @compute(i32 noundef %a, i32 noundef %b) local_unnamed_addr #0 {
entry:
  %add = add nsw i32 %b, %a
  %mul = shl nsw i32 %add, 1
  %xor = xor i32 %mul, 255
  %sub = sub nsw i32 %xor, %a
  ret i32 %sub
}

; Function Attrs: nofree nounwind ssp uwtable(sync)
define noundef i32 @main() local_unnamed_addr #1 {
entry:
  %call = tail call i32 @compute(i32 noundef 10, i32 noundef 20)
  %call1 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str, i32 noundef %call)
  ret i32 0
}

; Function Attrs: nofree nounwind
declare noundef i32 @printf(ptr noundef readonly captures(none), ...) local_unnamed_addr #2

attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
attributes #1 = { nofree nounwind ssp uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
attributes #2 = { nofree nounwind "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 15, i32 5]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 8, !"PIC Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 1}
!4 = !{i32 7, !"frame-pointer", i32 1}
!5 = !{!"Homebrew clang version 21.1.8"}

Run the pass:

$ opt -load-pass-plugin=./obf.dylib -passes="virtual-machine<compute>" -S test.ll -o obf.ll
VirtualMachinePass: instructions replaced in function 'compute'

Check the output. Note that the arithmetic instructions have been replaced with __vm_exec calls:

$ cat obf.ll
; ModuleID = 'test.ll'
source_filename = "test.c"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx15.0.0"

@.str = private unnamed_addr constant [12 x i8] c"Result: %d\0A\00", align 1
@__vm_regs = private global [256 x i64] zeroinitializer
@__vm_bc_0 = private constant [4 x i8] c"\01\02\00\01"
@__vm_bc_1 = private constant [4 x i8] c"\07\02\00\01"
@__vm_bc_2 = private constant [4 x i8] c"\06\02\00\01"
@__vm_bc_3 = private constant [4 x i8] c"\02\02\00\01"

; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync)
define i32 @compute(i32 noundef %a, i32 noundef %b) local_unnamed_addr #0 {
entry:
  %a_ext = sext i32 %b to i64
  %b_ext = sext i32 %a to i64
  store i64 %a_ext, ptr @__vm_regs, align 8
  store i64 %b_ext, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 1), align 8
  call void @__vm_exec(ptr @__vm_bc_0)
  %vm_result = load i64, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 2), align 8
  %vm_trunc = trunc i64 %vm_result to i32
  %a_ext1 = sext i32 %vm_trunc to i64
  store i64 %a_ext1, ptr @__vm_regs, align 8
  store i64 1, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 1), align 8
  call void @__vm_exec(ptr @__vm_bc_1)
  %vm_result2 = load i64, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 2), align 8
  %vm_trunc3 = trunc i64 %vm_result2 to i32
  %a_ext4 = sext i32 %vm_trunc3 to i64
  store i64 %a_ext4, ptr @__vm_regs, align 8
  store i64 255, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 1), align 8
  call void @__vm_exec(ptr @__vm_bc_2)
  %vm_result5 = load i64, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 2), align 8
  %vm_trunc6 = trunc i64 %vm_result5 to i32
  %a_ext7 = sext i32 %vm_trunc6 to i64
  %b_ext8 = sext i32 %a to i64
  store i64 %a_ext7, ptr @__vm_regs, align 8
  store i64 %b_ext8, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 1), align 8
  call void @__vm_exec(ptr @__vm_bc_3)
  %vm_result9 = load i64, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 2), align 8
  %vm_trunc10 = trunc i64 %vm_result9 to i32
  ret i32 %vm_trunc10
}

; Function Attrs: nofree nounwind ssp uwtable(sync)
define noundef i32 @main() local_unnamed_addr #1 {
entry:
  %call = tail call i32 @compute(i32 noundef 10, i32 noundef 20)
  %call1 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str, i32 noundef %call)
  ret i32 0
}

; Function Attrs: nofree nounwind
declare noundef i32 @printf(ptr noundef readonly captures(none), ...) local_unnamed_addr #2

; Function Attrs: noinline optnone
define private void @__vm_exec(ptr %bytecode) #3 {
entry:
  %op_ptr = getelementptr inbounds i8, ptr %bytecode, i64 0
  %dst_ptr = getelementptr inbounds i8, ptr %bytecode, i64 1
  %src0_ptr = getelementptr inbounds i8, ptr %bytecode, i64 2
  %src1_ptr = getelementptr inbounds i8, ptr %bytecode, i64 3
  %op = load i8, ptr %op_ptr, align 1
  %dst = load i8, ptr %dst_ptr, align 1
  %src0 = load i8, ptr %src0_ptr, align 1
  %src1 = load i8, ptr %src1_ptr, align 1
  %src0_ext = zext i8 %src0 to i64
  %src1_ext = zext i8 %src1 to i64
  %src0_reg_ptr = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %src0_ext
  %src1_reg_ptr = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %src1_ext
  %a = load i64, ptr %src0_reg_ptr, align 8
  %b = load i64, ptr %src1_reg_ptr, align 8
  switch i8 %op, label %default [
    i8 1, label %add
    i8 2, label %sub
    i8 3, label %mul
    i8 4, label %and
    i8 5, label %or
    i8 6, label %xor
    i8 7, label %shl
    i8 8, label %shr
  ]

add:                                              ; preds = %entry
  %add_res = add i64 %a, %b
  %dst_ext = zext i8 %dst to i64
  %dst_ptr1 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext
  store i64 %add_res, ptr %dst_ptr1, align 8
  ret void

sub:                                              ; preds = %entry
  %sub_res = sub i64 %a, %b
  %dst_ext4 = zext i8 %dst to i64
  %dst_ptr5 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext4
  store i64 %sub_res, ptr %dst_ptr5, align 8
  ret void

mul:                                              ; preds = %entry
  %mul_res = mul i64 %a, %b
  %dst_ext2 = zext i8 %dst to i64
  %dst_ptr3 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext2
  store i64 %mul_res, ptr %dst_ptr3, align 8
  ret void

and:                                              ; preds = %entry
  %and_res = and i64 %a, %b
  %dst_ext6 = zext i8 %dst to i64
  %dst_ptr7 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext6
  store i64 %and_res, ptr %dst_ptr7, align 8
  ret void

or:                                               ; preds = %entry
  %or_res = or i64 %a, %b
  %dst_ext8 = zext i8 %dst to i64
  %dst_ptr9 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext8
  store i64 %or_res, ptr %dst_ptr9, align 8
  ret void

xor:                                              ; preds = %entry
  %xor_res = xor i64 %a, %b
  %dst_ext10 = zext i8 %dst to i64
  %dst_ptr11 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext10
  store i64 %xor_res, ptr %dst_ptr11, align 8
  ret void

shl:                                              ; preds = %entry
  %shl_res = shl i64 %a, %b
  %dst_ext12 = zext i8 %dst to i64
  %dst_ptr13 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext12
  store i64 %shl_res, ptr %dst_ptr13, align 8
  ret void

shr:                                              ; preds = %entry
  %lshr_res = lshr i64 %a, %b
  %dst_ext14 = zext i8 %dst to i64
  %dst_ptr15 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext14
  store i64 %lshr_res, ptr %dst_ptr15, align 8
  ret void

default:                                          ; preds = %entry
  ret void
}

attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
attributes #1 = { nofree nounwind ssp uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
attributes #2 = { nofree nounwind "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
attributes #3 = { noinline optnone }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 15, i32 5]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 8, !"PIC Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 1}
!4 = !{i32 7, !"frame-pointer", i32 1}
!5 = !{!"Homebrew clang version 21.1.8"}

If we load the binaries into Ghidra, we can see that _compute is harder to understand with the VM than without it. Still, this is a very basic VM, so it can be reversed rather quickly.

Before:

int _compute(int param_1,int param_2)

{
  return ((param_2 + param_1) * 2 ^ 0xffU) - param_1;
}

After:

undefined8 _compute(int param_1,int param_2)

{
  DAT_100008000 = (long)param_2;
  DAT_100008008 = (long)param_1;
  FUN_100000668(&DAT_100000860);
  DAT_100008000 = (long)(int)DAT_100008010;
  DAT_100008008 = 1;
  FUN_100000668(&DAT_100000864);
  DAT_100008000 = (long)(int)DAT_100008010;
  DAT_100008008 = 0xff;
  FUN_100000668(&DAT_100000868);
  DAT_100008000 = (long)(int)DAT_100008010;
  DAT_100008008 = (long)param_1;
  FUN_100000668(&DAT_10000086c);
  return DAT_100008010;
}

void FUN_100000668(char *param_1)

{
  char cVar1;
  byte bVar2;
  ulong uVar3;
  ulong uVar4;
  
  cVar1 = *param_1;
  bVar2 = param_1[1];
  uVar4 = (&DAT_100008000)[(byte)param_1[2]];
  uVar3 = (&DAT_100008000)[(byte)param_1[3]];
  if (cVar1 == '\x01') {
    (&DAT_100008000)[bVar2] = uVar4 + uVar3;
    return;
  }
  if (cVar1 == '\x02') {
    (&DAT_100008000)[bVar2] = uVar4 - uVar3;
    return;
  }
  if (cVar1 == '\x03') {
    (&DAT_100008000)[bVar2] = uVar4 * uVar3;
    return;
  }
  if (cVar1 == '\x04') {
    (&DAT_100008000)[bVar2] = uVar4 & uVar3;
    return;
  }
  if (cVar1 == '\x05') {
    (&DAT_100008000)[bVar2] = uVar4 | uVar3;
    return;
  }
  if (cVar1 == '\x06') {
    (&DAT_100008000)[bVar2] = uVar4 ^ uVar3;
    return;
  }
  if (cVar1 == '\a') {
    (&DAT_100008000)[bVar2] = uVar4 << (uVar3 & 0x3f);
    return;
  }
  if (cVar1 != '\b') {
    return;
  }
  (&DAT_100008000)[bVar2] = uVar4 >> (uVar3 & 0x3f);
  return;
}

Build the modified IR and run the executable:

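Note: do not pass -O3 or other optimization-related options at this point as they might interfere with the applied obfuscation methods. For example, in the obf.ll listing above, compute still carries the memory(none) attribute from the original IR even though it now reads and writes @__vm_regs, so an optimizer could presumably treat those accesses as removable.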

$ clang obf.ll -o obf && ./obf
Result: 185