Virtual machine (instruction-level)

An LLVM pass that replaces arithmetic and bitwise instructions with calls to a register-based VM. Instead of executing add, sub, mul, etc. directly, the pass stores the operands into a global register file and creates a bytecode blob for each instruction. Then __vm_exec(bytecode_ptr) reads the bytecode [opcode, dst, src0, src1], executes the operation via the matching VM handler and writes the result back to the destination register. This means that the inputs must be copied into the src0 and src1 registers before calling __vm_exec, and the result must be read from the dst register after the call returns.

This is a simplified, instruction-level approach. Commercial tools usually virtualize entire functions or regions, use a single bytecode stream with a fetch-decode-execute (FDE) loop and hide control flow inside the VM. Here, we create separate bytecode blobs per instruction and keep branches and loops native.

Known limitations:

significantly increased code size
significant runtime overhead
control flow remains visible (not virtualized)
no bytecode encryption
the VM can be easily reversed

The source code is available here.

Generate the IR for our main() test code:

Note: we optimize the generated IR before applying our obfuscation pass.

$ clang test.c -O3 -fno-discard-value-names -S -emit-llvm -o test.ll

Check the output:

$ cat test.ll
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx15.0.0"

@.str = private unnamed_addr constant [12 x i8] c"Result: %d\0A\00", align 1

; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync)
define i32 @compute(i32 noundef %a, i32 noundef %b) local_unnamed_addr #0 {
entry:
  %add = add nsw i32 %b, %a
  %mul = shl nsw i32 %add, 1
  %xor = xor i32 %mul, 255
  %sub = sub nsw i32 %xor, %a
  ret i32 %sub
}

; Function Attrs: nofree nounwind ssp uwtable(sync)
define noundef i32 @main() local_unnamed_addr #1 {
entry:
  %call = tail call i32 @compute(i32 noundef 10, i32 noundef 20)
  %call1 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str, i32 noundef %call)
  ret i32 0
}

; Function Attrs: nofree nounwind
declare noundef i32 @printf(ptr noundef readonly captures(none), ...) local_unnamed_addr #2

attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
attributes #1 = { nofree nounwind ssp uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
attributes #2 = { nofree nounwind "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 15, i32 5]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 8, !"PIC Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 1}
!4 = !{i32 7, !"frame-pointer", i32 1}
!5 = !{!"Homebrew clang version 21.1.8"}

Run the pass:

$ opt -load-pass-plugin=./obf.dylib -passes="virtual-machine<compute>" -S test.ll -o obf.ll
VirtualMachinePass: instructions replaced in function 'compute'

Check the output. Note that the arithmetic and bitwise instructions have been replaced with __vm_exec calls:

$ cat obf.ll
; ModuleID = 'test.ll'
source_filename = "test.c"
target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx15.0.0"

@.str = private unnamed_addr constant [12 x i8] c"Result: %d\0A\00", align 1
@__vm_regs = private global [256 x i64] zeroinitializer
@__vm_bc_0 = private constant [4 x i8] c"\01\02\00\01"
@__vm_bc_1 = private constant [4 x i8] c"\07\02\00\01"
@__vm_bc_2 = private constant [4 x i8] c"\06\02\00\01"
@__vm_bc_3 = private constant [4 x i8] c"\02\02\00\01"

; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync)
define i32 @compute(i32 noundef %a, i32 noundef %b) local_unnamed_addr #0 {
entry:
  %a_ext = sext i32 %b to i64
  %b_ext = sext i32 %a to i64
  store i64 %a_ext, ptr @__vm_regs, align 8
  store i64 %b_ext, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 1), align 8
  call void @__vm_exec(ptr @__vm_bc_0)
  %vm_result = load i64, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 2), align 8
  %vm_trunc = trunc i64 %vm_result to i32
  %a_ext1 = sext i32 %vm_trunc to i64
  store i64 %a_ext1, ptr @__vm_regs, align 8
  store i64 1, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 1), align 8
  call void @__vm_exec(ptr @__vm_bc_1)
  %vm_result2 = load i64, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 2), align 8
  %vm_trunc3 = trunc i64 %vm_result2 to i32
  %a_ext4 = sext i32 %vm_trunc3 to i64
  store i64 %a_ext4, ptr @__vm_regs, align 8
  store i64 255, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 1), align 8
  call void @__vm_exec(ptr @__vm_bc_2)
  %vm_result5 = load i64, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 2), align 8
  %vm_trunc6 = trunc i64 %vm_result5 to i32
  %a_ext7 = sext i32 %vm_trunc6 to i64
  %b_ext8 = sext i32 %a to i64
  store i64 %a_ext7, ptr @__vm_regs, align 8
  store i64 %b_ext8, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 1), align 8
  call void @__vm_exec(ptr @__vm_bc_3)
  %vm_result9 = load i64, ptr getelementptr inbounds ([256 x i64], ptr @__vm_regs, i64 0, i64 2), align 8
  %vm_trunc10 = trunc i64 %vm_result9 to i32
  ret i32 %vm_trunc10
}

; Function Attrs: nofree nounwind ssp uwtable(sync)
define noundef i32 @main() local_unnamed_addr #1 {
entry:
  %call = tail call i32 @compute(i32 noundef 10, i32 noundef 20)
  %call1 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str, i32 noundef %call)
  ret i32 0
}

; Function Attrs: nofree nounwind
declare noundef i32 @printf(ptr noundef readonly captures(none), ...) local_unnamed_addr #2

; Function Attrs: noinline optnone
define private void @__vm_exec(ptr %bytecode) #3 {
entry:
  %op_ptr = getelementptr inbounds i8, ptr %bytecode, i64 0
  %dst_ptr = getelementptr inbounds i8, ptr %bytecode, i64 1
  %src0_ptr = getelementptr inbounds i8, ptr %bytecode, i64 2
  %src1_ptr = getelementptr inbounds i8, ptr %bytecode, i64 3
  %op = load i8, ptr %op_ptr, align 1
  %dst = load i8, ptr %dst_ptr, align 1
  %src0 = load i8, ptr %src0_ptr, align 1
  %src1 = load i8, ptr %src1_ptr, align 1
  %src0_ext = zext i8 %src0 to i64
  %src1_ext = zext i8 %src1 to i64
  %src0_reg_ptr = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %src0_ext
  %src1_reg_ptr = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %src1_ext
  %a = load i64, ptr %src0_reg_ptr, align 8
  %b = load i64, ptr %src1_reg_ptr, align 8
  switch i8 %op, label %default [
    i8 1, label %add
    i8 2, label %sub
    i8 3, label %mul
    i8 4, label %and
    i8 5, label %or
    i8 6, label %xor
    i8 7, label %shl
    i8 8, label %shr
  ]

add:                                              ; preds = %entry
  %add_res = add i64 %a, %b
  %dst_ext = zext i8 %dst to i64
  %dst_ptr1 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext
  store i64 %add_res, ptr %dst_ptr1, align 8
  ret void

sub:                                              ; preds = %entry
  %sub_res = sub i64 %a, %b
  %dst_ext4 = zext i8 %dst to i64
  %dst_ptr5 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext4
  store i64 %sub_res, ptr %dst_ptr5, align 8
  ret void

mul:                                              ; preds = %entry
  %mul_res = mul i64 %a, %b
  %dst_ext2 = zext i8 %dst to i64
  %dst_ptr3 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext2
  store i64 %mul_res, ptr %dst_ptr3, align 8
  ret void

and:                                              ; preds = %entry
  %and_res = and i64 %a, %b
  %dst_ext6 = zext i8 %dst to i64
  %dst_ptr7 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext6
  store i64 %and_res, ptr %dst_ptr7, align 8
  ret void

or:                                               ; preds = %entry
  %or_res = or i64 %a, %b
  %dst_ext8 = zext i8 %dst to i64
  %dst_ptr9 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext8
  store i64 %or_res, ptr %dst_ptr9, align 8
  ret void

xor:                                              ; preds = %entry
  %xor_res = xor i64 %a, %b
  %dst_ext10 = zext i8 %dst to i64
  %dst_ptr11 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext10
  store i64 %xor_res, ptr %dst_ptr11, align 8
  ret void

shl:                                              ; preds = %entry
  %shl_res = shl i64 %a, %b
  %dst_ext12 = zext i8 %dst to i64
  %dst_ptr13 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext12
  store i64 %shl_res, ptr %dst_ptr13, align 8
  ret void

shr:                                              ; preds = %entry
  %lshr_res = lshr i64 %a, %b
  %dst_ext14 = zext i8 %dst to i64
  %dst_ptr15 = getelementptr inbounds [256 x i64], ptr @__vm_regs, i64 0, i64 %dst_ext14
  store i64 %lshr_res, ptr %dst_ptr15, align 8
  ret void

default:                                          ; preds = %entry
  ret void
}

attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind ssp willreturn memory(none) uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
attributes #1 = { nofree nounwind ssp uwtable(sync) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
attributes #2 = { nofree nounwind "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="apple-m1" "target-features"="+aes,+altnzcv,+ccdp,+ccidx,+ccpp,+complxnum,+crc,+dit,+dotprod,+flagm,+fp-armv8,+fp16fml,+fptoint,+fullfp16,+jsconv,+lse,+neon,+pauth,+perfmon,+predres,+ras,+rcpc,+rdm,+sb,+sha2,+sha3,+specrestrict,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
attributes #3 = { noinline optnone }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 15, i32 5]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 8, !"PIC Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 1}
!4 = !{i32 7, !"frame-pointer", i32 1}
!5 = !{!"Homebrew clang version 21.1.8"}

If we load the original and the obfuscated binaries into Ghidra, we can see that the virtualized _compute is harder to understand than the original. Still, this is a very basic VM, so it can be reversed rather quickly.

Before:

int _compute(int param_1,int param_2)

{
  return ((param_2 + param_1) * 2 ^ 0xffU) - param_1;
}

After:

undefined8 _compute(int param_1,int param_2)

{
  DAT_100008000 = (long)param_2;
  DAT_100008008 = (long)param_1;
  FUN_100000668(&DAT_100000860);
  DAT_100008000 = (long)(int)DAT_100008010;
  DAT_100008008 = 1;
  FUN_100000668(&DAT_100000864);
  DAT_100008000 = (long)(int)DAT_100008010;
  DAT_100008008 = 0xff;
  FUN_100000668(&DAT_100000868);
  DAT_100008000 = (long)(int)DAT_100008010;
  DAT_100008008 = (long)param_1;
  FUN_100000668(&DAT_10000086c);
  return DAT_100008010;
}

void FUN_100000668(char *param_1)

{
  char cVar1;
  byte bVar2;
  ulong uVar3;
  ulong uVar4;
  
  cVar1 = *param_1;
  bVar2 = param_1[1];
  uVar4 = (&DAT_100008000)[(byte)param_1[2]];
  uVar3 = (&DAT_100008000)[(byte)param_1[3]];
  if (cVar1 == '\x01') {
    (&DAT_100008000)[bVar2] = uVar4 + uVar3;
    return;
  }
  if (cVar1 == '\x02') {
    (&DAT_100008000)[bVar2] = uVar4 - uVar3;
    return;
  }
  if (cVar1 == '\x03') {
    (&DAT_100008000)[bVar2] = uVar4 * uVar3;
    return;
  }
  if (cVar1 == '\x04') {
    (&DAT_100008000)[bVar2] = uVar4 & uVar3;
    return;
  }
  if (cVar1 == '\x05') {
    (&DAT_100008000)[bVar2] = uVar4 | uVar3;
    return;
  }
  if (cVar1 == '\x06') {
    (&DAT_100008000)[bVar2] = uVar4 ^ uVar3;
    return;
  }
  if (cVar1 == '\a') {
    (&DAT_100008000)[bVar2] = uVar4 << (uVar3 & 0x3f);
    return;
  }
  if (cVar1 != '\b') {
    return;
  }
  (&DAT_100008000)[bVar2] = uVar4 >> (uVar3 & 0x3f);
  return;
}

Build the modified IR and run the executable:

Note: do not pass -O3 or other optimization-related options at this point as they might interfere with the applied obfuscation methods.

$ clang obf.ll -o obf && ./obf
Result: 185

Keyboard shortcuts

Phantom pass: LLVM obfuscator

Virtual machine (instruction-level)