crackmes.one CTF 2026 - crackme9
This year I participated in the crackmes.one CTF, a reverse engineering similar to Flare-On. Crackme9 was one of the harder challenges, with 43 solves by the end of the CTF.

As usual, we’re given a prompt to enter a serial number. Tracing through the function that checks the serial, we can see a call to a function at the address 0x40a000. This address is not part of the .text section of the PE file; instead, it occurs at the start of a nonstandard section called .pc. Looking at a hex dump of the .pc section, it was obviously encrypted.
99 ae 95 d7 80 31 f7 5c 3b c9 38 6e 9a 11 56 06 .....1.\;.8n..V.
16 81 d5 0c eb 59 b4 fa fe da 10 89 de 78 32 6b .....Y.......x2k
87 bb fb e6 09 19 77 a9 84 33 4a dc 91 7b 8b 5a ......w..3J..{.Z
6d 5c 4a 94 ed ee cd 5e d8 5e f5 c2 8b bc 35 15 m\J....^.^....5.
39 7c 1b 08 bd 76 29 5d 66 13 0c 14 c1 f3 93 c6 9|...v)]f.......
98 71 b1 26 83 bf c7 20 b9 a6 8b 91 cc 05 7f 20 .q.&... .......
b7 74 0c 6d ea 66 b3 34 aa 03 18 25 23 4f 45 26 .t.m.f.4...%#OE&
84 4a 2e 47 db 99 9d 97 6c 37 3a d3 84 ac 17 67 .J.G....l7:....g
Initial deobfuscation steps
API resolution
I started off by setting breakpoints on commonly used APIs like VirtualAlloc and VirtualProtect. Tracing through the calls, I found that the APIs were being resolved through some kind of indirect lookup function.
0040112c int32_t do_VirtualProtect()
0040112f sub_401367()
0040113c jump(sub_401ad3(&data_40d1a8))
As usual for this type of challenge, the APIs were being called based on their hash, in this case 0x10066f2f for VirtualProtect.
00401ad3 int32_t __fastcall sub_401ad3(int32_t* arg1)
00401ae6 if (arg1[5] != 0)
00401aeb return arg1[5]
00401aeb
00401b20 void* var_14
00401b20 arg1[5] = sub_401175(sub_401ca2(*arg1, &var_14, 0x10066f2f))
00401b26 return arg1[5]
Looking at the hash function at sub_401ba9, it turned out to just be unmodified CRC32:
00401ba9 uint32_t __stdcall sub_401ba9(char* arg1, int32_t arg2)
00401bad int32_t esi = 0
00401baf uint32_t edx = 0xffffffff
00401bb3 char* edi = arg1
00401bb3
00401bb9 if (arg2 u> 0)
00401bbb int32_t ebx
00401bbb int32_t var_10_1 = ebx
00401bbb
00401be6 do
00401bbc ebx.b = edi[esi]
00401bc1 int32_t i_1 = 8
00401bdd int32_t i
00401bdd
00401bdd do
00401bc7 uint32_t ecx_2 = edx u>> 1
00401bc9 char eax_2 = ebx.b ^ edx.b
00401bcd edx = ecx_2 ^ 0xedb88320
00401bcd
00401bd5 if ((eax_2 & 1) == 0)
00401bd5 edx = ecx_2
00401bd5
00401bd8 ebx.b u>>= 1
00401bda i = i_1
00401bda i_1 -= 1
00401bdd while (i != 1)
00401bdf edi = arg1
00401be2 esi += 1
00401be6 while (esi u< arg2)
00401be6
00401bf0 return not.d(edx)
Since CRC32 is such a widely used algorithm, we can use HashDB to look up the API names that go with each checksum. In fact, there’s a Binary Ninja plugin that pulls data from HashDB to define an enum mapping all the API names to their checksums.
enum hashdb_strings_crc32 : uint32_t
{
FlushInstructionCache = 0xe9258e7a,
RtlMoveMemory = 0x1c0cd35c,
GlobalUnWire = 0x2c82f5fb,
MapUserPhysicalPages = 0xafd472a5,
// [...]
RtlUniform = 0xd09d867b,
RtlNewSecurityObjectEx = 0x8733c5dd,
WinSqmSetEscalationInfo = 0x648930f1
};
Normally I’d write a script to rename all the API hashing functions to their respective API names, but in this case there weren’t that many of them so I just renamed them all by hand.
Breakpoints
When left to run under a debugger, the binary runs until it hits an int3 breakpoint at 0x4037c6. I’d seen things like this before where exceptions are used to obfuscate control flow, but the binary seemed to be “getting stuck”, never progressing past the breakpoint location.
004037c2 void __fastcall do_breakpoint(void* arg1)
004037c2 c6410101 mov byte [ecx+0x1], 0x1
004037c6 cc int3
When I was resolving the API hashes, I saw that KiUserExceptionDispatcher was one of the hashed APIs, so I looked at how that was being used. It looked like the binary was supposed to overwrite the entry point of KiUserExceptionDispatcher with an exception handler of its own. However, for whatever reason the overwrite function wasn’t being called, so the real KiUserExceptionDispatcher was being called instead.
00402ce0 void __fastcall overwrite_KiUserExceptionDispatcher(char* hook)
00402ce6 if (*hook == 0)
00402ce6 return
00402ce6
00402cef int32_t KiUserExceptionDispatcher
00402cef int80_t st0_1
00402cef st0_1, KiUserExceptionDispatcher =
00402cef get_KiUserExceptionDispatcher(&kernel32_addr, get_kernel32_baseaddr())
00402cfd _memcpy_s(KiUserExceptionDispatcher, 6, &hook[1], 6)
00402d05 *hook = 0
It turned out the KiUserExceptionDispatcher overwrite was only called if an anti-debug check was passed. The check used NtQueryInformationProcess, which I had thought could be bypassed with ScyllaHide, but for whatever reason that didn’t work in this case. Ultimately, I just patched out the check entirely.
00402b6d int32_t __fastcall check_ProcessDebugPort(int32_t* arg1)
00402b77 if (sub_402bc7(arg1) != 0)
00402b7c return 1
00402b7c
00402b88 int32_t* ProcessInformation = arg1
00402b89 int32_t esi
00402b89 int32_t var_c = esi
00402b8a ProcessInformation = nullptr
00402b91 get_dll_baseaddr()
00402b98 HANDLE ProcessHandle = do_GetCurrentProcess()
00402b9f get_dll_baseaddr()
00402bb1 NTSTATUS result = do_NtQueryInformationProcess(ProcessHandle,
00402bb1 ProcessInformationClass: ProcessDebugPort, &ProcessInformation,
00402bb1 ProcessInformationLength: 4, ReturnLength: nullptr)
00402bb1
00402bb9 if (result != STATUS_SUCCESS)
00402bc3 result.b = 0
00402bc6 return result
00402bc6
00402bbe result.b = ProcessInformation != result
00402bc2 return result
At that point, the custom exception handler was being called, but the binary still crashed soon after with mysterious access violation exceptions. It seemed like it was trying to decrypt and run the .pc section, but the decryption was incorrect.
00402e1c void decrypt_and_ntcontinue(PCONTEXT arg1) __noreturn
00402e20 context_record = arg1
00402e28 exception = __return_addr
00402e2d get_cipher_ctx()
00402e40 decrypt_from_exception(ctx: &cipher_ctx, exception: exception,
00402e40 context: context_record)
00402e45 get_dll_baseaddr()
00402e54 do_NtContinue(ContextRecord: context_record, TestAlert: 0)
00402e59 breakpoint
The .pc section
Decrypting the code
At some point in the reversing process I got pretty stuck, and I was just kind of clicking around the decompilation for a while looking for something comprehensible. I eventually found something I recognized at sub_401e67:
00401e67 uint32_t* __thiscall sub_401e67(uint32_t* arg1, char* arg2, char* arg3)
00401e72 int32_t var_14 = 0xb979379e
00401e7c int32_t var_10 = 0x157c4a7f
00401e83 int32_t var_c = 0x60c09cf3
00401e8a int32_t var_8 = 0x34c8ed5c
00401e96 *arg1 = bytes_to_int(&var_14)
00401ea1 arg1[1] = bytes_to_int(&var_10)
00401ead arg1[2] = bytes_to_int(&var_c)
00401ebd arg1[3] = bytes_to_int(&var_8)
00401ec5 arg1[4] = bytes_to_int(arg2)
00401ed1 arg1[5] = bytes_to_int(&arg2[4])
00401edd arg1[6] = bytes_to_int(&arg2[8])
00401ee9 arg1[7] = bytes_to_int(&arg2[0xc])
00401ef5 arg1[8] = bytes_to_int(&arg2[0x10])
00401f01 arg1[9] = bytes_to_int(&arg2[0x14])
00401f0d arg1[0xa] = bytes_to_int(&arg2[0x18])
00401f1d arg1[0xb] = bytes_to_int(&arg2[0x1c])
00401f20 arg1[0xc] = 0
00401f27 arg1[0xd] = 0
00401f33 arg1[0xe] = bytes_to_int(arg3)
00401f42 arg1[0xf] = bytes_to_int(&arg3[4])
00401f4a return arg1
This function takes a 32-byte array and an 8-byte array, and uses them alongside a 16-byte constant to initialize an array of 64 bytes. If you’ve reversed enough cryptography code before, you’ll likely recognize this as the initialization of a ChaCha20 matrix. Normally the constant used in ChaCha20 is the string expand 32-byte k, but I guess the challenge author thought that would be too easy to search for, so here it’s replaced by the custom constant 0xb979379e, 0x157c4a7f, 0x60c09cf3, 0x34c8ed5c.
Looking at where the ChaCha20 matrix is initialized, the nonce is hard-coded as 0a 0b 0c 0d 0e 0f 10 11, but the key is retrieved from a global variable.
00404059 char* __thiscall do_chacha_with_hprov_key(struct cipher_ctx* ctx, int32_t ciphertext)
00404069 char* ciphertext_buf = do_malloc(ctx->len)
00404077 memcpy(dest: ciphertext_buf, src: ciphertext, count: ctx->len)
00404087 int32_t counter_1 = ciphertext - *ctx->pc_ptr
0040408c int64_t nonce
0040408c nonce.d = 0xd0c0b0a
00404093 nonce:4.d = 0x11100f0e
0040409a get_crypt_context()
004040ad struct matrix matrix
004040ad init_chacha_matrix_with_counter(&matrix, key: get_key_from_hprov(&hprov), &nonce,
004040ad counter0: 0, counter1: 0)
004040ba chacha_rotated_counter(&matrix, counter_1, counter_2: 0)
004040c8 do_chacha_encrypt(&matrix, ciphertext: ciphertext_buf, len: 0x10)
004040d3 return ciphertext_buf
I found that the key was being stored alongside an HCRYPTPROV handle obtained from CryptAcquireContextA, and that other Crypto API functions were being called to generate the key itself.
00403388 BOOL __convention("regparm") hash_something_for_chacha_key(int32_t arg1, int32_t* arg2 @ ebp, char* arg3 @ esi, char* arg4 @ edi)
00403388 *0xff45c6 = arg1
00403395 arg2[-3] = 0
004033a5 get_dll_baseaddr()
004033aa arg2[-5] = 0x40d1a0
004033b2 arg2[-4] = *arg2[-2]
004033c5 arg2[-5]
004033c8 BOOL result = do_CryptCreateHash(hProv: arg2[-4], Algid: 0x800c, hKey: 0,
004033c8 dwFlags: 0, phHash: &arg2[-3])
004033c8
004033cf if (result != 0)
004033de get_dll_baseaddr()
004033e3 arg2[-9] = 0x40d1a0
004033e9 arg2[-8] = arg2[-3]
004033f7 arg2[-9]
004033f7
00403401 if (do_CryptHashData(hHash: arg2[-8], pbData: arg2[-7], dwDataLen: arg2[-6],
00403401 dwFlags: 0) != 0)
0040340c arg2[-0xa] = 0x20
00403420 get_dll_baseaddr()
00403425 arg2[-0xc] = 0x40d1a0
0040342b arg2[-0xb] = arg2[-3]
00403440 arg2[-0xc]
00403443 do_CryptGetHashParam(arg2[-0xb], 2, arg2[-2] + 4, &arg2[-0xa], 0)
00403443
00403456 get_dll_baseaddr()
0040345b arg2[-0xe] = 0x40d1a0
00403461 arg2[-0xd] = arg2[-3]
00403467 arg2[-0xe]
0040346a result = do_CryptDestroyHash(arg2[-0xd])
0040346a
00403473 *arg2
00403474 return result
Setting a breakpoint on CryptHashData, it turned out the ChaCha20 key was the SHA-256 hash of the .text section of the executable. This is an anti-debug measure, as setting software breakpoints changes the contents of .text.
When I tried dumping the .text section from the binary and hashing it, the resulting hash didn’t result in a valid decryption. Looking more closely in x64dbg at the data that was being hashed, I found that the last byte of the section had been changed from 0x00 to 0x90. Making the same change to my copy of the .text section, I got the correct key F630AA38D57297375D645559C334FD50D55CA1D177D2655A042351CF69244BF2. (For the actual decryption step, I used CryptoTester, which supports custom-constant ChaCha20 out of the box.)
Running the code
Single stepping
Now that I had the right key, I could see how the program tried to run the decrypted code. The program obtains the ChaCha20 key with a call to CryptGetHashParam, so I set a breakpoint on CryptGetHashParam’s return to patch in the correct hash. To make things easier, I eventually figured out that I could set the breakpoint to run a command to write to the hash address, so that I wouldn’t have to manually do the patch on every run. The syntax to do this in x64dbg is:
set 0x40d268,#F630AA38D57297375D645559C334FD50D55CA1D177D2655A042351CF69244BF2#
However, even with the correct key, the program didn’t decrypt and run the .pc section all at once. Instead, it only decrypted it 16 bytes at a time, and wrote only those few instructions to the buffer it had allocated for running shellcode.
004040ad init_chacha_matrix_with_counter(&matrix, key: get_key_from_hprov(&hprov), &nonce,
004040ad counter0: 0, counter1: 0)
004040ba chacha_rotated_counter(&matrix, counter_1, counter_2: 0)
004040c8 do_chacha_encrypt(&matrix, ciphertext: ciphertext_buf, len: 0x10)
004040d3 return ciphertext_buf
Moreover, the program seemed to only be running one instruction at a time before allocating a new shellcode buffer with VirtualAlloc. For example, out of the first 16 bytes that are decrypted, only the instruction mov eax,ecx is run at first.
3E4C0006 | 8BC1 | mov eax,ecx |
3E4C0008 | C741 04 90B16E0A | mov dword ptr ds:[ecx+4],A6EB190 |
3E4C000F | C700 00000000 | mov dword ptr ds:[eax],0 |
3E4C0015 | 0000 | add byte ptr ds:[eax],al |
3E4C0017 | 0000 | add byte ptr ds:[eax],al |
3E4C0019 | 0000 | add byte ptr ds:[eax],al |
Then, a new buffer is allocated to run the next instruction mov dword ptr ds:[ecx+4],A6EB190, and so on.
17100002 | C741 04 90B16E0A | mov dword ptr ds:[ecx+4],A6EB190 |
17100009 | C741 08 336C4720 | mov dword ptr ds:[ecx+8],20476C33 |
17100010 | 0000 | add byte ptr ds:[eax],al |
17100012 | 0000 | add byte ptr ds:[eax],al |
17100014 | 0000 | add byte ptr ds:[eax],al |
In order to run a single instruction at a time before stopping execution, the program uses the same mechanisms that debuggers use to perform single steps. The CPU has a flag called the trap flag that tells it to execute a single instruction before raising an exception of type EXCEPTION_SINGLE_STEP. (You can see an example of how a debugger might use this flag here.) Sure enough, the trap flag is set before each run of the decrypted shellcode.
004037fa uint32_t __stdcall set_context_flags(CONTEXT* arg1)
00403804 *(arg1->ContextFlags + 0x18) = 0
00403809 *(arg1->ContextFlags + 0x14) = 0
0040380e *(arg1->ContextFlags + 4) = 0
00403811 uint32_t ContextFlags = arg1->ContextFlags
00403813 *(ContextFlags + 0xc0) |= 0x100
0040381e return ContextFlags
Control flow obfuscation
Even after decryption, the decompilation of the .pc section didn’t make much sense. The code was full of int3 breakpoints, which were usually followed by nonsensical disassembly.
0040a0cb mov eax, dword [ebp-0x8]
0040a0ce and eax, 0x80000003
0040a0d3 int3
0040a0d4 fisttp dword [eax-0x7d], st0
0040a0d7 enter 0x40fc, 0x85
0040a0db ror ah, 0x38
Upon closer inspection in x64dbg, several bytes were being skipped each time the program executed an int3 breakpoint. For instance, after the breakpoint at 0x40a035 was executed, the next instruction to be run was the push dword ptr [ebp+8] instruction at 0x40a037, with the address 0x40a036 being skipped over entirely.
Looking at the exception handler code again, there were different functions for retrieving the next .pc instruction depending on whether the exception was an int3 breakpoint or a single step.
00404562 void __thiscall decrypt_from_exception(struct cipher_ctx* ctx, int32_t* exception, CONTEXT* context)
00404569 if (ctx->field_8 != 0)
00404569 return
00404569
00404574 if (*exception == STATUS_BREAKPOINT)
00404579 chacha_breakpoint(ctx, context)
00404574 else if (*exception == STATUS_SINGLE_STEP)
0040458b chacha_single_step(ctx, context)
00404586 else if (*exception == STATUS_GUARD_PAGE_VIOLATION)
0040459a CONTEXT* context_1 = context
0040459e chacha_guard_page_violation(ctx, exception)
Unlike the single-step handler, which just set context->Eip to the next instruction before returning, the breakpoint handler called another function to determine the next instruction pointer before returning.
004042ec int32_t __thiscall chacha_breakpoint(struct cipher_ctx* ctx, CONTEXT* context)
004042fe if (sub_4037b2(ctx->field_24) == 0)
00404318 CONTEXT* exception_context = context
00404318
00404328 if (check_context_eip_in_range(ctx, eip: exception_context->Eip) != 0)
0040432d breakpoint_set_eip(ctx, exception_context)
00404332 CONTEXT* context_1 = context
00404335 uint32_t Eip = context_1->Eip
0040433b uint32_t Eip_1 = Eip
0040433f ctx->Eip = Eip
00404342 chacha_and_flush_cache(ctx, context: context_1)
004042fe else
00404304 flush_instruction_cache(ctx, &context)
00404309 ctx->field_24
00404310 set_context_flags(&context)
00404310
0040434d return 0xffffffff
Looking at the breakpoint_set_eip function at 0x4045f3, it was retrieving a value from a map, then setting the next instruction to one of two locations pulled from that map.
004045f3 struct map_val* __thiscall breakpoint_set_eip(struct cipher_ctx* ctx, CONTEXT* exception_context)
004045f7 CONTEXT* exception_context_1 = exception_context
00404608 struct map_val* result =
00404608 get_map_data(ctx, key: (exception_context_1->Eip).w - (ctx->baseaddr).w)
00404608
00404611 if (result != 0)
00404622 int32_t offset
00404622
00404622 if (check_eflags(result, exception_context_1) == 0)
0040462c offset = result->offset_2
00404622 else
00404627 offset = result->offset_1 + result->offset_2
00404627
0040463c flush_process_cache(ctx, &exception_context, offset + exception_context_1->Eip)
00404641 result.b = 1
00404611 else
00404613 result.b = 0
00404613
00404647 return result
Dumping the map data from the debugger, it contained a long list of structures of four integer values. The first integer was the map key, corresponding to the address of the .pc instruction currently being executed (minus the base address of 0x400000). The third and fourth integers were the two choices of addresses to jump to.
00 A0 00 00 01 00 00 00 2F 00 00 00 02 00 00 00
01 A0 00 00 12 00 00 00 49 00 00 00 02 00 00 00
02 A0 00 00 0A 00 00 00 72 00 00 00 02 00 00 00
03 A0 00 00 0A 00 00 00 43 00 00 00 02 00 00 00
04 A0 00 00 05 00 00 00 5A 00 00 00 02 00 00 00
The second integer, which was always a value between 1 and 18, was used for deciding which of the two offsets to jump to. Depending on its value, different combinations of the flags in the eflags register were checked, and the decision for where to jump was made based on the value of those flags.
00404166 uint32_t __stdcall check_eflags(struct map_val* arg1, CONTEXT* arg2)
00404172 uint32_t EFlags = arg2->EFlags
00404178 uint32_t choice = arg1->field_4 - 1
00404178
0040417c if (choice u> 0x11)
00404238 choice.b = 0
0040417c else
00404182 switch (choice)
004041ea case 0
004041ea EFlags u>>= 6 // ~ZF
004041ed goto get_negated_flag
00404234 case 1
00404234 choice.b = 1
00404189 case 2
0040418c EFlags.b = (EFlags u>> 7).b & 1 // SF
0040418f choice.b = EFlags.b
// [...]
004041a2 case 0x10
004041a5 if ((1 & (EFlags u>> 6).b) != 0) // ZF
004041b7 choice.b = 0
004041a5 else
004041af // ~ (OF ^ SF)
004041af choice.b = (EFlags u>> 0xb).b ^ (EFlags u>> 7).b
004041af
004041b3 if ((1 & choice.b) == 0)
004041b7 choice.b = 1
004041b3 else
004041b7 choice.b = 0
0040418c case 0x11
0040418c EFlags.b &= 1
0040418f choice.b = EFlags.b
0040418f
0040423b return choice
The combinations of flags that were being checked were, in fact, the exact combinations of flags that are checked in ordinary x86 branch instructions. For example, in the switch case above, case 0 corresponds to the condition ZF = 0, which is used in the conditional branch instruction JNZ. Similarly, case 0x10 corresponds to the condition ZF = 0 and SF = OF, which is the conditional JG.
Essentially, every single branch instruction in the .pc section had been replaced with a breakpoint, and the branch logic was being handled in the exception handler. Since this is a one-to-one replacement of one assembly instruction with another, the most logical way to deal with it would’ve been to patch the original branch instructions back in, except that the branch instructions were longer than the breakpoint instructions replacing them.
Reversing the check algorithm
Emulating with Unicorn
Since I couldn’t think of a good way to patch the binary in a way that would get me a reasonable-looking decompilation, I decided to try to emulate the .pc section in Unicorn, adding a hook to intercept the breakpoint instructions and resolve the branches. I had to hook a couple of external functions like strlen, but luckily there weren’t too many of those, so I didn’t have to do much extra work other than the usual boilerplate that comes with setting up Unicorn.
from unicorn import *
from unicorn.x86_const import *
from capstone import *
import struct
def to_int(b):
return hex(int.from_bytes(b, 'little'))
jumps = { 0: 'jnz', 1: 'jmp', 2: 'js', 3: 'jae', 4: 'jle', 5: 'ja', 6: 'jge', 7: 'jp', 8: 'jo', 9: 'jbe', 10: 'jecxz', 11: 'jnp', 12: 'jz', 13: 'jl', 14: 'jns', 15: 'jno', 16: 'jg', 17: 'jb' }
def get_bit(val, bit):
return (val >> bit) & 1
def get_branch_cond(cond, flags, ecx):
if cond == 0: # jnz
return get_bit(flags, 6) == 0
elif cond == 1: # jmp
return True
elif cond == 2: # js
return get_bit(flags, 7) != 0
elif cond == 3: # jae
return get_bit(flags, 0) == 0
elif cond == 4: # jle
return get_bit(flags, 6) == 0 or (get_bit(flags, 0xb) != get_bit(flags, 0x7))
elif cond == 5: # ja
return get_bit(flags, 6) == 0 and get_bit(flags, 0) == 0
elif cond == 6: # jge
return get_bit(flags, 7) == get_bit(flags, 0xb)
elif cond == 7: # jp
return get_bit(flags, 2) != 0
elif cond == 8: # jo
return get_bit(flags, 0xb) != 0
elif(cond == 9): # jbe
return get_bit(flags, 6) != 0 or get_bit(flags, 0) != 0
elif(cond == 10): # jecxz
return ecx == 0
elif(cond == 11): # jnp
return get_bit(flags, 2) == 0
elif(cond == 12): # jz
return get_bit(flags, 6) != 0
elif(cond == 13): # jl
return get_bit(flags, 0xb) != get_bit(flags, 0x7)
elif(cond == 14): # jns
return get_bit(flags, 7) == 0
elif(cond == 15): # jno
return get_bit(flags, 0xb) == 0
elif(cond == 16): # jg
return get_bit(flags, 6) == 0 and (get_bit(flags, 0xb) == get_bit(flags, 0x7))
elif(cond == 17): # jb
return get_bit(flags, 0) != 0
else:
raise ValueError("invalid condition", cond)
offsets = {}
f = open('data.bin', 'rb').read()
for i in range(0, len(f), 0x10):
data = f[i:i+0x10]
key, cond, jump, base = struct.unpack('<4i', data)
offsets[key + 0x400000] = (cond, jump, base)
def emu_push(mu: Uc, val):
esp = mu.reg_read(UC_X86_REG_ESP)
mu.mem_write(esp - 4, val.to_bytes(4, 'little'))
mu.reg_write(UC_X86_REG_ESP, esp - 4)
def emu_pop(mu: Uc):
esp = mu.reg_read(UC_X86_REG_ESP)
val = mu.mem_read(esp, 4)
mu.reg_write(UC_X86_REG_ESP, esp + 4)
return int.from_bytes(val, 'little')
def emu_return(mu: Uc):
ret_addr = emu_pop(mu)
mu.reg_write(UC_X86_REG_EIP, ret_addr)
def hook_strlen(uc):
print('Skipping strlen function')
uc.reg_write(UC_X86_REG_EAX, 0x13)
def hook_crypt(uc):
#print('Skipping crypto setup')
uc.reg_write(UC_X86_REG_EAX, HPROV)
def hook_hash(uc):
#print('Skipping hash function')
uc.reg_write(UC_X86_REG_EAX, MEM)
def hook_pass(uc):
pass
def hook_mem_write(mu: Uc, access, address, size, value, user_data):
data_bytes = value.to_bytes(size, 'little')
print(f'[{hex(mu.reg_read(UC_X86_REG_EIP))}] Address {hex(address)}: Wrote {data_bytes.hex()}')
def hook_mem_read(mu: Uc, access, address, size, value, user_data):
if address >= MEM and address < MEM + 0x10:
data_bytes = mu.mem_read(address, size)
print(f'[{hex(mu.reg_read(UC_X86_REG_EIP))}] Address {hex(address)}: Read {data_bytes.hex()}')
def hook_code(mu: Uc, address, size, user_data):
for i in md.disasm(mu.mem_read(address, size), address):
print("0x%x:\t%s\t%s" % (i.address, i.mnemonic, i.op_str))
print('\teax:', hex(mu.reg_read(UC_X86_REG_EAX)))
print('\tebx:', hex(mu.reg_read(UC_X86_REG_EBX)))
print('\tecx:', hex(mu.reg_read(UC_X86_REG_ECX)))
print('\tedx:', hex(mu.reg_read(UC_X86_REG_EDX)))
ebp = mu.reg_read(UC_X86_REG_EBP)
esp = mu.reg_read(UC_X86_REG_ESP)
print(f'\tebp: {to_int(mu.mem_read(ebp, 4))} ebp - 4: {to_int(mu.mem_read(ebp - 4, 4))} ebp - 8: {to_int(mu.mem_read(ebp - 8, 4))} ebp - c: {to_int(mu.mem_read(ebp - 0xc, 4))} ebp - 10: {to_int(mu.mem_read(ebp - 0x10, 4))} ebp - 0x14: {to_int(mu.mem_read(ebp - 0x14, 4))}')
print(f'\tstack: +0: {to_int(mu.mem_read(esp, 4))} +4: {to_int(mu.mem_read(esp + 4, 4))} +8: {to_int(mu.mem_read(esp + 8, 4))} +c: {to_int(mu.mem_read(ebp + 0xc, 4))} ')
if i.mnemonic == 'call': # skip external function calls
target = i.operands[0].imm
if target in hooks:
hooks[target](mu)
mu.reg_write(UC_X86_REG_EIP, i.address + i.size)
def hook_interrupt(mu: Uc, intno, user_data):
if intno == 3:
eip = mu.reg_read(UC_X86_REG_EIP) - 1
cond, jump, base = offsets[eip]
not_taken_addr = eip + base
taken_addr = eip + base + jump
#print(hex(eip), hex(not_taken_addr), hex(taken_addr))
ecx = mu.reg_read(UC_X86_REG_ECX)
flags = mu.reg_read(UC_X86_REG_EFLAGS)
should_branch = get_branch_cond(cond - 1, flags, ecx)
if should_branch:
print(f'Branch {jumps[cond - 1]} taken: {hex(taken_addr)}')
mu.reg_write(UC_X86_REG_EIP, taken_addr)
else:
print(f'Branch {jumps[cond - 1]} not taken: {hex(not_taken_addr)}')
mu.reg_write(UC_X86_REG_EIP, not_taken_addr)
sc = open('shellcode.bin', 'rb').read()
md = Cs(CS_ARCH_X86, CS_MODE_32)
md.detail = True
hooks = {
0x409731: hook_strlen,
0x403477: hook_crypt,
0x40a4b5: hook_hash
}
BASE = 0x40a000
BASE_SIZE = 1024*1024
STACK_ADDR = 0x40000000
STACK_SIZE = 1024*1024
MEM = 0x60000000
MEM_SIZE = 1024*1024
HPROV = MEM + 0x100
CONSTS = MEM + 0x2000
def emu(start):
mu = Uc(UC_ARCH_X86, UC_MODE_32)
mu.mem_map(STACK_ADDR, STACK_SIZE)
mu.reg_write(UC_X86_REG_ESP, STACK_ADDR + STACK_SIZE // 2)
mu.reg_write(UC_X86_REG_EBP, STACK_ADDR + STACK_SIZE // 2 - 0x1000)
mu.mem_map(MEM, MEM_SIZE)
mu.mem_write(MEM, b'ABCD-4567-89ab-cdef')
mu.mem_write(STACK_ADDR + STACK_SIZE // 2 + 4, MEM.to_bytes(4, 'little'))
KEY = bytes.fromhex('F630AA38D57297375D645559C334FD50D55CA1D177D2655A042351CF69244BF2')
mu.mem_write(HPROV + 4, KEY)
mu.mem_write(HPROV + 0x24, (MEM+0x1000).to_bytes(4, 'little'))
mu.mem_write(HPROV + 0x28, b'\x00\x8a\x00\x00')
mu.mem_write(CONSTS, bytes.fromhex('47bb5d8690b16e0a336c472093768a1cfbbdfe59'))
mu.reg_write(UC_X86_REG_ECX, CONSTS)
mu.mem_map(BASE, BASE_SIZE)
mu.mem_write(BASE, sc)
mu.hook_add(UC_HOOK_CODE, hook_code)
mu.hook_add(UC_HOOK_INTR, hook_interrupt)
mu.hook_add(UC_HOOK_MEM_READ, hook_mem_read)
mu.emu_start(start, BASE+BASE_SIZE)
emu(0x40a025)
Unfortunately, this approach turned out to be largely unhelpful. I could tell that the serial was supposed to be 19 characters long (consistent with something of the format XXXX-XXXX-XXXX-XXXX), and that the characters were being hashed in groups of four, but the hash algorithm didn’t match up with anything I recognized.
I did notice that the .pc section started with a function that loaded five random-looking values, though, so I figured those were the target values for the hash.
0040a000 int32_t* __convention("fastcall") sub_40a000(int32_t* arg1)
0040a000 c70147bb5d86 mov dword [ecx], 0x865dbb47 {0x865dbb47}
0040a006 8bc1 mov eax, ecx
0040a008 c7410490b16e0a mov dword [ecx+0x4], 0xa6eb190
0040a00f c74108336c4720 mov dword [ecx+0x8], 0x20476c33
0040a016 c7410c93768a1c mov dword [ecx+0xc], 0x1c8a7693
0040a01d c74110fbbdfe59 mov dword [ecx+0x10], 0x59febdfb
0040a024 c3 retn {__return_addr}
Fixing the decompilation
After a while I gave up on emulation and came back to the idea of trying to decompile the code again. I couldn’t figure out a way to patch in the branch instructions without overwriting too many other instructions, so instead I decided to see if I could modify the Binary Ninja IL somehow.
In Binary Ninja, the code to be decompiled is lifted through several different forms of IL as it’s being processed. In theory, there are APIs to edit IL instructions at any stage of the analysis process. Since the modification I wanted to make was supposed to be equivalent to a simple binary patch, I thought it would make the most sense to modify the lowest level of IL, called Lifted IL. However, when I looked through the documentation on how to do so, I found that it explicitly advised against it:
You probably do not want to modify Lifted IL with a Workflow. Instead, consider modifying the Architecture directly (most of which are Open Source on our GitHub) or making your Activity modify Low Level IL.
“Modifying the Architecture directly” sounded promising, so I looked at how the x86 architecture plugin generated IL instructions.
The code for how breakpoints were handled turned out to be very straightforward: for each INT3 breakpoint instruction, a single IL breakpoint instruction is emitted. To fix it, I would need to modify that line to emit an IL branch instruction instead.
case XED_ICLASS_INT3:
il.AddInstruction(il.Breakpoint());
break;
Conveniently, the code for handling branch instructions is right next to the breakpoint code, and it includes a convenient switch case listing all the ways to construct an IL conditional branch from its corresponding x86 branch.
case XED_ICLASS_JO:
ConditionalJump(arch, il, il.FlagCondition(LLFC_O), addrSize, branchDestination, addr + instLen);
return false;
case XED_ICLASS_JNO:
ConditionalJump(arch, il, il.FlagCondition(LLFC_NO), addrSize, branchDestination, addr + instLen);
return false;
// [...]
case XED_ICLASS_JECXZ:
ConditionalJump(arch, il, il.CompareEqual(4, il.Register(4, XED_REG_ECX), il.Const(4, 0)), addrSize, branchDestination, addr + instLen);
return false;
case XED_ICLASS_JRCXZ:
ConditionalJump(arch, il, il.CompareEqual(8, il.Register(8, XED_REG_RCX), il.Const(8, 0)), addrSize, branchDestination, addr + instLen);
return false;
To generate the branch instructions, I started by adding a hard-coded map of all the breakpoint addresses in the .pc section and their corresponding branch types, as well as an enum containing all the conditions.
#include <map>
#include <cstdint>
struct CrackmeCond {
int cond;
int not_taken;
int taken;
};
const uint64_t CRACKME_SC_START = 0x40a000;
const uint64_t CRACKME_SC_END = 0x40a5ff;
enum class JumpCond: int {
JNZ = 0,
JMP,
JS,
JAE,
JLE,
JA,
JGE,
JP,
JO,
JBE,
JECXZ,
JNP,
JZ,
JL,
JNS,
JNO,
JG,
JB
};
std::map<int, CrackmeCond> conds = {
{ 0x40a000, { 1, 2, 49 } },
{ 0x40a001, { 18, 2, 75 } },
{ 0x40a002, { 10, 2, 116 } },
{ 0x40a003, { 10, 2, 69 } },
{ 0x40a004, { 5, 2, 92 } },
{ 0x40a005, { 3, 2, 34 } },
{ 0x40a006, { 18, 2, 161 } },
// [...]
{ 0x40a5fd, { 7, 2, 130 } },
{ 0x40a5fe, { 13, 2, 52 } },
{ 0x40a5ff, { 13, 2, 46 } },
};
I then modified the breakpoint handling code to add a switch case modeled after Binary Ninja’s own code for handling x86 conditional branch instructions, calling ConditionalJump with the arguments I had parsed out of the map whenever a breakpoint fell within the range of addresses in the .pc section.
case XED_ICLASS_INT3:
if(addr >= CRACKME_SC_START && addr < CRACKME_SC_END)
{
const auto cond_struct = conds[addr];
const auto cond = static_cast<JumpCond>(cond_struct.cond - 1);
const auto taken_addr = addr + cond_struct.taken;
const auto not_taken_addr = addr + cond_struct.not_taken;
switch(cond)
{
case JumpCond::JMP:
il.AddInstruction(il.Jump(il.ConstPointer(addrSize, taken_addr)));
break;
case JumpCond::JO:
ConditionalJump(arch, il, il.FlagCondition(LLFC_O), addrSize, taken_addr, not_taken_addr);
break;
// [...]
case JumpCond::JECXZ:
ConditionalJump(arch, il, il.CompareEqual(4, il.Register(4, XED_REG_ECX), il.Const(4, 0)), addrSize, taken_addr, not_taken_addr);
break;
default:
il.AddInstruction(il.Breakpoint());
}
}
else
{
il.AddInstruction(il.Breakpoint());
}
break;
After making this modification, the LLIL breakpoint instructions were replaced with LLIL branches, which repaired the control flow of the binary.
6 @ 0040a02e [ebp - 0x14 {var_18}].d = ecx
7 @ 0040a031 [ebp + 8 {serial}].d
8 @ 0040a035 breakpoint
6 @ 0040a02e [ebp - 0x14 {var_18}].d = ecx
7 @ 0040a035 if ([ebp + 8 {serial}].d == 0) then 8 @ 0x40a045 else 10 @ 0x40a037
This worked far better than I expected it to, resulting in coherent decompilation! It wasn’t perfect by any means, but it was good enough for me to reimplement the algorithm. The trace I’d printed out from Unicorn turned out to be helpful after all, as it gave me values to test against in the steps where the decompilation was ambiguous.
0040a025 int32_t __stdcall do_check(int32_t arg1 @ ecx, int32_t serial)
0040a025 int32_t ebp
0040a025 int32_t var_4 = ebp
0040a026 struct struct_1* ebp_1 = &var_4
0040a02b void* entry_ebx
0040a02b void* var_28 = entry_ebx
0040a02c int32_t entry_esi
0040a02c int32_t var_2c = entry_esi
0040a02d int32_t edi
0040a02d int32_t var_30 = edi
0040a02d int32_t* esp_1 = &var_30
0040a02e int32_t var_18 = arg1
0040a035 int32_t result
0040a035
0040a035 if (serial == 0)
0040a045 result.b = 0
0040a035 else
0040a03f esp_1 = &var_30
0040a03f
0040a043 if (0x409731(serial) == 0x13)
0040a04c int32_t var_8_1 = 0
0040a053 int32_t var_c_1 = 0
0040a05a int32_t var_14_1 = 0
0040a061 int32_t var_10_1 = 0xcafebabe
0040a061
0040a06f while (true)
0040a06f sub_40a4b5(0x403477())
0040a06f
0040a080 if (ebp_1->serial_char == 0)
0040a08a ebp_1->serial_char = 1
0040a080 else if (ebp_1->serial_char == 1)
0040a0ae *(esp_1 - 4) = zx.d(*(ebp_1->acc + ebp_1->count))
0040a0af *(esp_1 - 8) = ebp_1->i
0040a0ba ebp_1->i = do_char_hash(ebp_1->field_c)
0040a0c1 ebp_1->count += 1
0040a0ce int32_t done_yet = ebp_1->count & 0x80000003
0040a0ce
0040a0d3 if (done_yet s< 0)
0040a0d9 done_yet = ((done_yet - 1) | 0xfffffffc) + 1
0040a0d9
0040a0dc if (done_yet == 0)
0040a0e6 // go to compare-and-multiply
0040a0e6 ebp_1->serial_char = 2
0040a0dc else if (ebp_1->count != 0x13)
0040a10f // if not all chars hashed, resume loop
0040a10f ebp_1->serial_char = 1
0040a0f3 else
0040a0fe ebp_1->serial_char = 3
0040a09a else if (ebp_1->serial_char != 2) // state 3: check
0040a178 if (ebp_1->serial_char != 3)
0040a178 break
0040a178
0040a17a *(esp_1 - 4) = 4
0040a186 ebp_1->field_0 = ebp_1->field_c[*(esp_1 - 4)]
0040a199 int32_t check_result = ebp_1->field_10 | (ebp_1->i ^ ebp_1->field_0)
0040a19b ebp_1->field_10 = check_result
0040a19b
0040a19e if (check_result != 0)
0040a1a9 ebp_1->serial_char = 5 // wrong
0040a19e else
0040a1a0 ebp_1->serial_char = 4 // right
0040a11f else // state 2: get compare val and multiply
0040a124 void* eax_11
0040a124 int32_t edx_1
0040a124 edx_1:eax_11 = sx.q(ebp_1->count)
0040a12a unimplemented {sar eax, 0x2}
0040a12e ebp_1->field_8 = ((eax_11 + (edx_1 & 3)) s>> 2) - 1
0040a13a // save compare_val into ebp_1 - 0x1c?
0040a13a ebp_1->field_4 = *(ebp_1->field_c + (ebp_1->field_8 << 2))
0040a151 ebp_1->field_10 |= ebp_1->i ^ ebp_1->field_4
0040a168 ebp_1->i = ebp_1->count * 0x112233 - 0x35014542
0040a16b ebp_1->serial_char = 1
0040a16b
0040a1b6 if (ebp_1->serial_char != 4)
0040a1be result.b = 0
0040a1b6 else
0040a1b8 result.b = 1
0040a043 else
0040a045 result.b = 0
0040a045
0040a1c7 *esp_1
0040a1c7 esp_1[1]
0040a1c8 esp_1[2]
0040a1ca ebp_1->field_20
0040a1cb return result
Reimplementation
The hash was a weird ad-hoc algorithm that made different decisions about how to process each character depending on certain ranges of values that it fell into. This means that the initial approach I took with Unicorn would never have worked very well on its own, as the flow of execution is completely different depending on whether each of the characters in the serial are letters, numbers, special characters and so on. There’s even a special case that specifically handles the character 0x30.
The presence of all the weird conditionals also means that the hash can’t be cracked using constraint solvers. (My initial reimplementation of the algorithm was in Python with the intent of solving it with z3, which resulted in confusing error statements like Symbolic expressions cannot be cast to concrete Boolean values.) Luckily, only 4 characters are hashed at a time, so I rewrote my implementation in C and bruteforced each group of characters.
#include <stdio.h>
#include <stdint.h>
uint32_t hash_round(uint32_t acc, uint32_t c) {
if((acc & 1) != 0) {
if((acc ^ c) <= 0x80000000) {
if((c >= 0x61) && (c <= (0x61 + 26))) {
acc -= c;
}
else if((c < 0x41) || (c > (0x41 + 26))) {
acc *= 9;
}
else {
acc += c;
if((acc & 0x100) != 0) {
acc ^= 0x13371337;
}
}
}
else if(c > 0x60) {
acc = (c * 0x21) ^ (acc + 0xdeadbeef);
}
else {
acc += 0xdeadbeef;
}
}
else {
if(c >= 0x40) {
if(c % 2 != 0) {
acc = ((acc >> 5) | (acc << 0x1b)) ^ 0x87654321;
}
else {
acc = ((acc << 3) | (acc >> 0x1d)) + 0x12345678;
}
}
else if((c & 2) == 0) {
acc -= (c << 4);
if(c == 0x30) {
acc |= 0xf0f0f0f0;
}
}
else {
acc ^= 0x55aa55aa;
acc += c;
}
}
uint32_t mod_5 = (c % 5) + 2;
for(uint32_t i = 0; i < mod_5; i++) {
if((acc & 0x80000000) == 0) {
acc <<= 1;
}
else {
acc = (acc << 1) ^ 0x4c11db7;
}
uint32_t mask = i & 0x80000001;
if(mask != 0) {
acc += i * 0xa;
}
else {
acc ^= c;
}
}
uint32_t count = acc;
count ^= (count >> 0x10);
count ^= (count >> 0x8);
if((count & 0xf) > 7) {
acc = ~acc;
}
return acc;
}
uint32_t hash(char* data, uint32_t pos, uint32_t len) {
uint32_t acc = 0xcafebabe + 0x112233 * 4 * pos;
for(uint32_t i = 0; i < len; i++) {
acc = hash_round(acc, data[i]);
}
return acc;
}
int check_val(uint32_t expected, uint32_t pos, uint32_t len) {
for(char i = 0x20; i < 0x7f; i++) {
//printf("trying %x\n", i);
for(char j = 0x20; j < 0x7f; j++) {
for(char k = 0x20; k < 0x7f; k++) {
for(char l = 0x20; l < 0x7f; l++) {
char to_try[4] = {i, j, k, l};
uint32_t acc = hash(to_try, pos, len);
if(acc == expected) {
printf("%c%c%c%c", i, j, k, l);
return 0;
}
}
}
}
}
return -1;
}
int main() {
uint32_t compares[5] = {0x865dbb47, 0xa6eb190, 0x20476c33, 0x1c8a7693, 0x59febdfb};
for(int i = 0; i < 4; i++) {
if(check_val(compares[i], i, 4) == -1) {
return -1;
}
}
if(check_val(compares[4], 4, 3) == -1) {
return -1;
}
return 0;
}
This gets us the serial: D3FE-A7ED-BAAD-C0D3. Entering it into the crackme, we finally have our flag!

CMO{byp4ss3d_f4tm1k3_2o26}
Note: After the CTF, I found out that the control flow obfuscation came from an obfuscator called Nanomites, the source code of which was on the challenge author’s GitHub long before the CTF started. I’ll have to remember to check for that next time.