Writeup for the Broken challenge from VolgaCTF 2016 Quals

This is a pretty nice challenge from the VolgaCTF 2016 Quals; sadly, I couldn’t join the r/OpenToAllCTFteam and play because I was too busy, but I noticed it was missing a writeup and decided to write one.

The first thing I always do when I want to analyze an executable is to run it inside a disposable virtual machine; the first thing you will notice is that it doesn’t seem to be doing anything at all, and that it will close itself after half a minute with the following message: “The processing has taken too long, terminating the process…”. It’s obvious we will not gain any further knowledge from this executable by just launching it. Fire up your favorite disassembler and let’s take a look at the entry point.

This is roughly what happens inside the main entry point:

  • A couple of structures are initialized.
  • Four threads are started using a function that wraps pthread_create.
  • The program pauses until the threads have returned (using a wrapper for pthread_join).
  • A printf() call outputs a string.

This is the function that is used to create each thread. We are running under Linux x64, meaning that the vast majority of the functions we will encounter here will use the System V AMD64 calling convention (which IDA labels __fastcall): arguments are primarily passed using the registers (RDI, RSI, RDX, RCX, R8, R9 for integers and pointers, XMM0–XMM7 for floating-point values). The start routine is the second argument of the function, and will therefore end up inside the RSI register.

.text:0000000000401340 ; void __fastcall CreateThread(pthread_t *thread, void *(__cdecl *start_routine)(void *), void *thread_argument)
.text:0000000000401340 CreateThread proc near                  ; CODE XREF: main+E9^p
.text:0000000000401340                                         ; main+102^p ...
.text:0000000000401340     push    rbx
.text:0000000000401341     mov     ebx, ecx
.text:0000000000401343     mov     rcx, rdx                    ; arg
.text:0000000000401346     mov     rdx, rsi                    ; start_routine
.text:0000000000401349     xor     esi, esi                    ; attr
.text:000000000040134B     call    _pthread_create
.text:0000000000401350     test    eax, eax
.text:0000000000401352     jnz     short loc_401356
.text:0000000000401354     pop     rbx
.text:0000000000401355     retn
.text:0000000000401356 ; ---------------------------------------------------------------------------
.text:0000000000401356 loc_401356:                             ; CODE XREF: CreateThread+12 ^j
.text:0000000000401356     mov     edi, offset s               ; "Failed to spawn thread\n"
.text:000000000040135B     call    _perror
.text:0000000000401360     mov     edi, ebx                    ; status
.text:0000000000401362     call    _exit
.text:0000000000401362 CreateThread endp

You can now track down the start routines used to create the threads:

  • 0x00400E20: ComputeSHA256Hash
  • 0x00400E60: ComputeSHA512Hash
  • 0x00400F40: Thread3
  • 0x00400EA0: TimeoutThread

The first two threads will compute the hash of a buffer and terminate; remember the two structures that are initialized right at the start of the main entry point? They hold both the input buffer and the pointer where the resulting digest will be stored. They are not particularly interesting, and I will not talk much about them.

00000000 HashedBuffer struc ; (sizeof=0x18, mappedto_3) ; XREF: 00007FFC02AF11C0/r
00000000                                         ; main/r ...
00000000 pbuffer dq ?                            ; XREF: main+8C/w main+AA/w ; offset
00000008 size dd ?                               ; XREF: main+9F/w main+B5/w
0000000C     db ? ; undefined
0000000D     db ? ; undefined
0000000E     db ? ; undefined
0000000F     db ? ; undefined
00000010 pdigest dq ?                            ; XREF: main+60/w main+C1/w ; offset
00000018 HashedBuffer ends

.text:0000000000400E20 ; uint8_t __fastcall ComputeSHA256Hash(struct HashedBuffer *hashed_buffer_obj)
.text:0000000000400E20 ComputeSHA256Hash proc near
.text:0000000000400E20     push    rbx
.text:0000000000400E21     mov     rbx, rdi
.text:0000000000400E24     mov     edi, offset thread1_sha256_context ; context
.text:0000000000400E29     call    _SHA256_Init
.text:0000000000400E2E     movsxd  rdx, dword ptr [rbx+8] ; len
.text:0000000000400E32     mov     rsi, [rbx]  ; data
.text:0000000000400E35     mov     edi, offset thread1_sha256_context ; context
.text:0000000000400E3A     call    _SHA256_Update
.text:0000000000400E3F     mov     rdi, [rbx+10h] ; digest
.text:0000000000400E43     mov     esi, offset thread1_sha256_context ; context
.text:0000000000400E48     call    _SHA256_Final
.text:0000000000400E4D     mov     rax, [rbx+10h]
.text:0000000000400E51     pop     rbx
.text:0000000000400E52     retn
.text:0000000000400E52 ComputeSHA256Hash endp

.text:0000000000400E60 ; uint8_t __fastcall ComputeSHA512Hash(struct HashedBuffer *hashed_buffer_obj)
.text:0000000000400E60 ComputeSHA512Hash proc near
.text:0000000000400E60     push    rbx
.text:0000000000400E61     mov     rbx, rdi
.text:0000000000400E64     mov     edi, offset thread2_sha512_context ; context
.text:0000000000400E69     call    _SHA512_Init
.text:0000000000400E6E     movsxd  rdx, dword ptr [rbx+8] ; len
.text:0000000000400E72     mov     rsi, [rbx]  ; data
.text:0000000000400E75     mov     edi, offset thread2_sha512_context ; context
.text:0000000000400E7A     call    _SHA512_Update
.text:0000000000400E7F     mov     rdi, [rbx+10h] ; digest
.text:0000000000400E83     mov     esi, offset thread2_sha512_context ; context
.text:0000000000400E88     call    _SHA512_Final
.text:0000000000400E8D     mov     rax, [rbx+10h]
.text:0000000000400E91     pop     rbx
.text:0000000000400E92     retn
.text:0000000000400E92 ComputeSHA512Hash endp

The fourth thread is the one that prints the timeout message and terminates the process by calling exit().

.text:0000000000400EA0 ; void *__cdecl TimeoutThread(void *unused)
.text:0000000000400EA0 TimeoutThread proc near                 ; DATA XREF: main+12C^o
.text:0000000000400EA0     sub     rsp, 8
.text:0000000000400EA4     mov     edi, 30                     ; seconds
.text:0000000000400EA9     call    _sleep
.text:0000000000400EAE     mov     edi, 1
.text:0000000000400EB3     mov     edx, offset aTheProcessingH ; "The processing has taken too long, term"...
.text:0000000000400EB8     mov     esi, offset unk_401970
.text:0000000000400EBD     xor     eax, eax
.text:0000000000400EBF     call    ___printf_chk
.text:0000000000400EC4     mov     edi, 5                      ; status
.text:0000000000400EC9     call    _exit
.text:0000000000400EC9 TimeoutThread endp

Let’s take a look at the third thread; you will eventually notice there’s something spying on you once you start setting breakpoints around. The program will stop working and the execution will not even reach the main entry point. This is caused by a function that computes the hash of a selected number of functions defined in the program and terminates the process if any of the calculated signatures don’t match (a software breakpoint patches an int3 byte into the code, which changes its hash).

.text:0000000000401580 ; void ModuleInitialization(void)
.text:0000000000401580 ModuleInitialization proc near          ; DATA XREF: start+16^o
.text:0000000000401580     push    r15
.text:0000000000401582     mov     r15d, edi
.text:0000000000401585     push    r14
.text:0000000000401587     mov     r14, rsi
.text:000000000040158A     push    r13
.text:000000000040158C     mov     r13, rdx
.text:000000000040158F     push    r12
.text:0000000000401591     lea     r12, InitializationCallbacks
.text:0000000000401598     push    rbp
.text:0000000000401599     lea     rbp, InitializationCallbacks2
.text:00000000004015A0     push    rbx
.text:00000000004015A1     sub     rbp, r12
.text:00000000004015A4     xor     ebx, ebx
.text:00000000004015A6     sar     rbp, 3
.text:00000000004015AA     sub     rsp, 8
.text:00000000004015AE     call    _init_proc
.text:00000000004015B3     test    rbp, rbp
.text:00000000004015B6     jz      short loc_4015D6
.text:00000000004015B8     nop     dword ptr [rax+rax+00000000h]
.text:00000000004015C0 loc_4015C0:                             ; CODE XREF: ModuleInitialization+54 vj
.text:00000000004015C0     mov     rdx, r13                    ;
.text:00000000004015C0                                         ; Functions called: PatchProtection, Initialize
.text:00000000004015C3     mov     rsi, r14
.text:00000000004015C6     mov     edi, r15d
.text:00000000004015C9     call    qword ptr [r12+rbx*8]
.text:00000000004015CD     add     rbx, 1
.text:00000000004015D1     cmp     rbx, rbp
.text:00000000004015D4     jnz     short loc_4015C0
.text:00000000004015D6 loc_4015D6:                             ; CODE XREF: ModuleInitialization+36 ^j
.text:00000000004015D6     add     rsp, 8
.text:00000000004015DA     pop     rbx
.text:00000000004015DB     pop     rbp
.text:00000000004015DC     pop     r12
.text:00000000004015DE     pop     r13
.text:00000000004015E0     pop     r14
.text:00000000004015E2     pop     r15
.text:00000000004015E4     retn
.text:00000000004015E4 ModuleInitialization endp

If you are curious about this, you can declare such a function using the __attribute__((constructor)) attribute in your C or C++ code.

As you can see, it’s calling a couple of initialization functions taken from an array; this array is accessed at virtual address 0x00401591 and contains both the patching protection (0x00400A50) that is interfering with us and a function that initializes the internal state of the program (0x00400DF0). The huge nop instruction at virtual address 0x004015B8 is a clear indication that an opcode has been removed. Did you notice that the second array is referenced but its value is actually never used? Replace the instruction with a call to the function pointer stored inside the second array (0x00400DD0).

Now we have to disable the protection; I have forced the jump at virtual address 0x00401541, but you can probably just skip the whole function.

Let’s go back to the thread we were analyzing (0x00400F40). If you step through the code, you will notice that it deadlocks inside a sem_wait call. Do you remember how sem_init works? When you create a new semaphore, you can set the initial value; this value is increased using sem_post and decreased using sem_wait. If you wait on a semaphore that is currently set to 0, you will have to wait until someone increments it using sem_post.

The whole situation becomes a lot easier to understand once you give the semaphores a name. I have named them (surprise) semaphore1 (0x00400F77), semaphore2 (0x00400F8D) and semaphore3 (0x00400FA3). They have all been initialized to 0: keep this in mind because it’s important.

Once the semaphores are initialized, a couple more threads are created: Thread5 (0x00400ED0) and Thread6 (0x004012C0). Both functions perform (more or less) the same operations:

// pseudo-code for Thread5 and Thread6
void ThreadEntryPoint()
{
	// keep in mind that you have two almost identical threads that
	// are performing the same operations!
	// increment semaphore3 twice; this will allow Thread3 to call
	// sem_wait(semaphore3) twice after Thread5 and Thread6 are created.
	sem_post(semaphore3); // semaphore3 += 2 (one post per thread)

	do {
		// wait for Thread3 to give us the ok to proceed
		// we obviously need two sem_post(semaphore1) calls in order
		// to unlock both threads
		sem_wait(semaphore1); // semaphore1 -= 2 (one wait per thread)
		// update the program state
		// ...

		// this will tell Thread3 that it can proceed
	} while (sem_post(semaphore3) == 0); // semaphore3 += 2 (one post per thread)
}

Now that we know how the two threads are working, let’s go back to Thread3. We have another nop at virtual address 0x00401073 and we need to replace it with another function call; remember the patch protection function at virtual address 0x00400A50? One of the routines that it was protecting was not referenced by anything else in the program, meaning that it’s the one we need to call.

.text:0000000000400AE0 loc_400AE0:                             ; CODE XREF: PatchProtection+78 ^j
.text:0000000000400AE0     mov     rdi, cs:sub401450_address   ;
.text:0000000000400AE0                                         ; this is the virtual address of the function: 0x00401450
.text:0000000000400AE7     test    rdi, rdi
.text:0000000000400AEA     jz      short loc_400B02
.text:0000000000400AEC     mov     rdx, cs:sub401450_code_hash_ptr ; expected_hash
.text:0000000000400AF3     mov     esi, cs:sub401450_code_length ; length
.text:0000000000400AF9     call    VerifyMemoryHash
.text:0000000000400AFE     test    eax, eax
.text:0000000000400B00     jnz     short loc_400B07

You will notice that this is not enough to fix the program, because it will now deadlock somewhere else inside the code we added; if you take a closer look at where it gets stuck you will realize that the only possible explanation is that one of the semaphores inside the threads #5 and #6 has been changed.

Open the Thread6 function and fix the semaphore inside the loop:

.text:0000000000401311 BF 00 35 60 00        mov     edi, offset semaphore1 ; change this to semaphore 2
.text:0000000000401316 E8 D5 F5 FF FF        call    _sem_wait
.text:000000000040131B 85 C0                 test    eax, eax
.text:000000000040131D 74 B9                 jz      short loc_4012D8

Here’s a summary of the patching I have done:

# open the executable in read/write
radare2 -w broken

# disable the patch protection
[0x00400d35]> s 0x401541
[0x00401541]> wa jmp 0x401530

# add the missing initialization function
[0x00401541]> s 0x4015b8
[0x004015b8]> wa call 0x400DD0
[0x004015b8]> s 0x004015bd
[0x004015bd]> wa nop
[0x004015bd]> s 0x004015be
[0x004015be]> wa nop
[0x004015be]> s 0x004015bf
[0x004015bf]> wa nop

# restore the function call inside Thread3
[0x004015b8]> s 0x0401073
[0x00401073]> wa call 0x401450

# fix the semaphore inside Thread6
[0x00401073]> s 0x00401311
[0x00401311]> wa mov edi, 0x6034E0

# save everything and close
[0x00401311]> wc
[0x00401311]> q

Run it again and you will get the flag: VolgaCTF{avoid_de@dl0cks_they_br3akyour@pp}