Learn Rust From Assembly Code

machine

Rust is a complex programming language with a lot of concepts. When you find a concept hard to understand, checking the assembly code may help. It’s my favourite way to learn Rust.

Move

In Rust, values are moved most of the time. A move happens in variable assignment, when passing function arguments, and when a closure captures a value. A move means ownership transfer, i.e. once you move an object, you can no longer access it through the original variable binding. But what happens in the assembly code?

Let’s take a simple example:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
use std::env;

/// Demo struct that is handed to `echo` by value so the generated call can be inspected.
///
/// It carries nine scalar fields: one `i8`, one `u8` and seven `i64`.
// `Debug` is derived because `echo` prints the value with `{:?}`.
#[derive(Debug)]
struct Foobar {
    a: i8,
    b: u8,
    c: i64,
    d: i64,
    e: i64,
    f: i64,
    g: i64,
    h: i64,
    i: i64,
}

/// Prints `bar` using its `Debug` representation.
///
/// Takes `Foobar` by value, so the caller's value is moved into this call.
// `inline(never)` keeps this a real function call in the emitted code.
#[inline(never)]
fn echo(bar: Foobar) {
    println!("{:?}", bar);
}

/// Builds a `Foobar` from command-line arguments 1 through 9 and passes it to `echo`.
///
/// # Panics
///
/// Panics if any of the nine arguments is missing or does not parse as the
/// target integer type, because every lookup and every parse is unwrapped.
fn main() {
    // Each line creates a fresh `env::args()` iterator and takes its n-th item;
    // index 0 is never read.
    let arg1 = env::args().nth(1).unwrap().parse::<i8>().unwrap();
    let arg2 = env::args().nth(2).unwrap().parse::<u8>().unwrap();
    let arg3 = env::args().nth(3).unwrap().parse::<i64>().unwrap();
    let arg4 = env::args().nth(4).unwrap().parse::<i64>().unwrap();
    let arg5 = env::args().nth(5).unwrap().parse::<i64>().unwrap();
    let arg6 = env::args().nth(6).unwrap().parse::<i64>().unwrap();
    let arg7 = env::args().nth(7).unwrap().parse::<i64>().unwrap();
    let arg8 = env::args().nth(8).unwrap().parse::<i64>().unwrap();
    let arg9 = env::args().nth(9).unwrap().parse::<i64>().unwrap();

    // Fields are filled in declaration order, one parsed argument per field.
    let bar = Foobar {
        a: arg1,
        b: arg2,
        c: arg3,
        d: arg4,
        e: arg5,
        f: arg6,
        g: arg7,
        h: arg8,
        i: arg9,
    };
    // `bar` is moved into `echo` here and is not used afterwards.
    echo(bar);
}

Note that I define Foobar with more than two fields, so that LLVM does not split the struct up and pass it to echo field by field. I also disable inlining of echo, so that we can observe the move across a real function call.

I use Rust playground to generate the assembly code.

Assembly code in debug mode:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
...
	callq	core::str::<impl str>::parse
	movw	%ax, 206(%rsp)
	jmp	.LBB80_9

.LBB80_9:
	movw	206(%rsp), %ax
	movw	%ax, 1212(%rsp)
	movw	1212(%rsp), %ax
	movw	%ax, 224(%rsp)
	movzwl	224(%rsp), %edi
	leaq	.L__unnamed_22(%rip), %rsi
	callq	core::result::Result<T,E>::unwrap
	movb	%al, 205(%rsp)
	jmp	.LBB80_10

.LBB80_109:
	leaq	1032(%rsp), %rdi
	callq	core::ptr::drop_in_place<std::env::Args>
	movq	8(%rsp), %rax
...
	movb	205(%rsp), %r11b
	movb	%r11b, 1120(%rsp)
	movb	%r10b, 1121(%rsp)
...
	movq	%rax, 1112(%rsp)
	leaq	1128(%rsp), %rdi
	leaq	1064(%rsp), %rsi
	movl	$64, %edx
	callq	memcpy@PLT
	leaq	1128(%rsp), %rdi
	callq	playground::echo

You can see that it uses memcpy to copy the struct and then passes the copy to echo.

206(%rsp) holds one of the parsed arguments; after unwrap() it is stored at 205(%rsp), and finally at 1120(%rsp), which lies inside the 64-byte memcpy source region that starts at 1064(%rsp).

Let’s check the release version:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
...
	callq	*<std::env::Args as core::iter::traits::iterator::Iterator>::next@GOTPCREL(%rip)
...
	callq	*core::num::<impl core::str::traits::FromStr for i64>::from_str@GOTPCREL(%rip)
	cmpb	$0, 8(%rsp)
	jne	.LBB11_72
	movq	16(%rsp), %rax
	movq	%rax, 144(%rsp)

.LBB11_352:
	movq	144(%rsp), %rax
	movq	%rax, 40(%rsp)
	movq	136(%rsp), %rax
...
	leaq	40(%rsp), %rdi
	callq	playground::echo

No copy in release mode! It just passes a pointer to the struct to echo.

144(%rsp) holds one of the parsed arguments; it is assigned to the first slot of the Foobar struct, 40(%rsp). Then leaq 40(%rsp), %rdi loads the struct's address. Interestingly, you can also see that the fields have been reordered by the Rust compiler.

So no need to worry about the performance of move.

BTW, let’s change the code a bit and pass a reference instead (echo must then take &Foobar as its parameter).

1
echo(&bar);

Then, unsurprisingly, it passes a pointer in both debug and release mode.

1
2
	leaq	1064(%rsp), %rdi
	callq	playground::echo

Copy

How about a struct that implements the Copy trait?

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
/// Variant of the demo struct that also derives `Copy` and `Clone`.
///
/// With `Copy`, passing a value to a by-value parameter leaves the original
/// binding usable afterwards. The fields are one `i8`, one `u8` and seven `i64`.
#[derive(Debug, Copy, Clone)]
struct Foobar {
    a: i8,
    b: u8,
    c: i64,
    d: i64,
    e: i64,
    f: i64,
    g: i64,
    h: i64,
    i: i64,
}

Check the debug version of assembly code:

1
2
3
4
5
6
7
8
	movq	%rcx, 1104(%rsp)
	movq	%rax, 1112(%rsp)
	leaq	1128(%rsp), %rdi
	leaq	1064(%rsp), %rsi
	movl	$64, %edx
	callq	memcpy@PLT
	leaq	1128(%rsp), %rdi
	callq	playground::echo

Well, memcpy happens.

Check the release version of assembly code:

1
2
	leaq	40(%rsp), %rdi
	callq	playground::echo

No copy happens! Just like with a move, LLVM does not make a needless copy even though the Copy trait is implemented.

Even if you call echo twice, the result is the same.

We could change the code and check again:

1
2
3
4
    // The initializer is elided in this listing.
    let mut bar = ...
    echo(bar);
    // `bar` is written and passed again after the first call; this snippet
    // follows the `Copy` version of `Foobar`.
    bar.a = 99;
    echo(bar);

Check the debug version of assembly code:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
	movq	%rax, 1112(%rsp)
	leaq	1128(%rsp), %rdi
	leaq	1064(%rsp), %rsi
	movl	$64, %edx
	callq	memcpy@PLT
	leaq	1128(%rsp), %rdi
	callq	playground::echo
	movb	$99, 1120(%rsp)
	leaq	1192(%rsp), %rdi
	leaq	1064(%rsp), %rsi
	movl	$64, %edx
	callq	memcpy@PLT
	leaq	1192(%rsp), %rdi
	callq	playground::echo

It copies the struct for the first echo call, modifies a field in place, and makes another copy for the second echo call.

Check the release version of assembly code:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
	leaq	40(%rsp), %rdi
	callq	playground::echo
	movq	%rbp, 40(%rsp)
	movq	%r14, 48(%rsp)
	movq	%r12, 56(%rsp)
	movq	%r13, 64(%rsp)
	movq	%r15, 72(%rsp)
	movq	120(%rsp), %rax
	movq	%rax, 80(%rsp)
	movq	112(%rsp), %rax
	movq	%rax, 88(%rsp)
	movb	$99, 96(%rsp)
	movb	%bl, 97(%rsp)
	leaq	40(%rsp), %rdi
	callq	playground::echo

The release assembly code reuses the same memory block on the stack to hold the struct content and passes its address to echo. Perfect!

Move in thread::spawn

Because function calls happen within the same chain of stack frames, Rust can optimize them without any problem. But for closures, especially with threading, the copy is unavoidable, because the closure may be invoked in a different context or even on a different OS thread!

Let’s confirm it.

1
2
3
    // Run `echo(bar)` on a newly spawned thread; the handle returned by
    // `spawn` is discarded.
    std::thread::spawn(|| {
        echo(bar);
    });

We only check the release assembly code:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
...
	callq	*core::num::<impl core::str::traits::FromStr for i64>::from_str@GOTPCREL(%rip)
	cmpb	$0, 8(%rsp)
	jne	.LBB41_345
	movq	16(%rsp), %r12
...
	movq	%r12, 72(%rsp)
...
	movl	$88, %edi
	movl	$8, %esi
	callq	*__rust_alloc@GOTPCREL(%rip)
	testq	%rax, %rax
	je	.LBB41_397
	movq	88(%rsp), %rcx
	movq	%rcx, 80(%rax)
	movups	72(%rsp), %xmm0
	movups	%xmm0, 64(%rax)
	movdqu	8(%rsp), %xmm0
	movups	24(%rsp), %xmm1
	movups	40(%rsp), %xmm2
	movups	56(%rsp), %xmm3
	movups	%xmm3, 48(%rax)
	movups	%xmm2, 32(%rax)
	movups	%xmm1, 16(%rax)
	movdqu	%xmm0, (%rax)
	leaq	.L__unnamed_23(%rip), %rcx
	leaq	112(%rsp), %rdi
	movq	%r15, %rsi
	movq	%rax, %rdx
	callq	*std::sys::unix::thread::Thread::new@GOTPCREL(%rip)

You can see that it allocates on the heap and copies from the stack.

Take one field as example: %r12 -> 72(%rsp) -> %xmm0 -> 64(%rax).

Box

Box is used to manage memory on the heap. In fact, Rust prefers the stack. As is well known, struct/enum allocation and initialization happen only on the stack. Even if you wrap a value with Box::new(), you still need to construct the struct instance on the stack first.

That raises a question: does Rust optimize this so that the construction is done directly on the heap?

Let’s check by reusing the source code above with a small change:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
/// Prints the boxed `Foobar` using its `Debug` representation.
///
/// Takes ownership of the `Box`.
// `inline(never)` keeps this a real function call in the emitted code.
#[inline(never)]
fn echo(bar: Box<Foobar>) {
    println!("{:?}", bar);
}
...
    // The struct literal is written as the argument of `Box::new`; the
    // resulting `Box<Foobar>` is then moved into `echo`.
    let bar = Box::new(Foobar {
        a: arg1,
        b: arg2,
        c: arg3,
        d: arg4,
        e: arg5,
        f: arg6,
        g: arg7,
        h: arg8,
        i: arg9,
    });
    echo(bar);

Check the debug assembly code:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
	callq	alloc::alloc::exchange_malloc
...
	movq	(%rsp), %rdi
	leaq	1072(%rsp), %rsi
	movl	$64, %edx
	callq	memcpy@PLT
	movq	(%rsp), %rax
	movq	%rax, 1216(%rsp)
	movq	(%rsp), %rdi
	callq	playground::echo

Yes, as expected, it constructs the struct on the stack and copies it to the heap.

What about release version?

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
	callq	*__rust_alloc@GOTPCREL(%rip)
	testq	%rax, %rax
	movl	76(%rsp), %ecx
	movzbl	75(%rsp), %edx
	je	.LBB13_357
	movq	112(%rsp), %rsi
	movq	%rsi, (%rax)
	movq	104(%rsp), %rsi
	movq	%rsi, 8(%rax)
	movq	96(%rsp), %rsi
	movq	%rsi, 16(%rax)
	movq	88(%rsp), %rsi
	movq	%rsi, 24(%rax)
	movq	80(%rsp), %rsi
	movq	%rsi, 32(%rax)
	movq	%r12, 40(%rax)
	movq	%r15, 48(%rax)
	movb	%cl, 56(%rax)
	movb	%dl, 57(%rax)
	movq	%rax, %rdi
	callq	playground::echo

Wow! It allocates and initializes the struct on the heap directly!

match

match is like C’s switch. In fact, a simple switch over constants generates linear comparison branches instead of a jump table!

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
/// Prints `bar` with a prefix chosen by the value of `bar.i`.
///
/// The value `3` prints the struct with no prefix; each other listed constant
/// prints its own number followed by the struct.
///
/// # Panics
///
/// Panics through `todo!()` when `bar.i` is not one of the listed constants.
// `inline(never)` keeps this a real function call in the emitted code.
#[inline(never)]
fn echo(bar: Foobar) {
    // The arms are integer constants, listed in this order in the source.
    match bar.i {
        3 => println!("{:?}", bar),
        99 => println!("99 {:?}", bar),
        88 => println!("88 {:?}", bar),
        188 => println!("188 {:?}", bar),
        288 => println!("288 {:?}", bar),
        388 => println!("388 {:?}", bar),
        488 => println!("488 {:?}", bar),
        588 => println!("588 {:?}", bar),
        688 => println!("688 {:?}", bar),
        788 => println!("788 {:?}", bar),
        888 => println!("888 {:?}", bar),
        _ => todo!(),
    }
}

Check the debug version of assembly code:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
playground::echo:
	subq	$904, %rsp
	movq	%rdi, 184(%rsp)
	movq	48(%rdi), %rax
	movq	%rax, 192(%rsp)
	subq	$3, %rax
	je	.LBB79_2
	jmp	.LBB79_36

.LBB79_36:
	movq	192(%rsp), %rax
	subq	$88, %rax
	je	.LBB79_4
	jmp	.LBB79_37

.LBB79_37:
	movq	192(%rsp), %rax
	subq	$99, %rax
	je	.LBB79_3
	jmp	.LBB79_38

.LBB79_38:
	movq	192(%rsp), %rax
	subq	$188, %rax
	je	.LBB79_5
	jmp	.LBB79_39

Check the release version of assembly code:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
playground::echo:
	subq	$72, %rsp
	movq	48(%rdi), %rax
	cmpq	$387, %rax
	jle	.LBB10_1
	cmpq	$687, %rax
	jg	.LBB10_14
	cmpq	$388, %rax
	je	.LBB10_21
	cmpq	$488, %rax
	je	.LBB10_22
	cmpq	$588, %rax
	jne	.LBB10_17
	movq	%rdi, 8(%rsp)
	leaq	<playground::Foobar as core::fmt::Debug>::fmt(%rip), %rax
	movq	%rax, 16(%rsp)
	leaq	.L__unnamed_2(%rip), %rax
	jmp	.LBB10_26

.LBB10_1:
	cmpq	$98, %rax
	jle	.LBB10_2
	cmpq	$99, %rax
	je	.LBB10_19
	cmpq	$188, %rax
	je	.LBB10_20
	cmpq	$288, %rax
	jne	.LBB10_17

Interestingly, the release version of assembly code splits the comparison branches into ranges to speed up the branch selection.