Rust is a complex programming language with a lot of concepts. When you find a concept hard to
understand, checking the generated assembly code can help. It’s my favourite way to learn Rust.
Move
In Rust, values are moved
most of the time. A move happens on variable assignment, when passing function arguments, and when a closure captures a value.
move
means ownership transfer, i.e. once you move an object, you can no longer access it through the original
variable binding. But what happens in the assembly code?
Let’s take a simple example:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
use std ::env ;
/// Demo payload whose hand-over to a callee is inspected in the assembly.
///
/// Two one-byte fields are followed by seven 64-bit fields; the struct is
/// deliberately given more than two fields so it is passed as a whole.
#[derive(Debug)]
struct Foobar {
    a: i8,
    b: u8,
    c: i64,
    d: i64,
    e: i64,
    f: i64,
    g: i64,
    h: i64,
    i: i64,
}
/// Prints `bar` using its `Debug` formatting, followed by a newline.
///
/// `bar` is taken by value, and inlining is disabled so that each call site
/// stays a real function call.
#[inline(never)]
fn echo(bar: Foobar) {
    println!("{:?}", bar);
}
/// Builds a `Foobar` from command-line arguments 1 through 9 and passes it to
/// `echo` by value.
///
/// Panics if any of those arguments is missing or does not parse as the
/// integer type of its field.
fn main() {
    // Each field value is read from the command line at run time.
    let arg1: i8 = env::args().nth(1).unwrap().parse().unwrap();
    let arg2: u8 = env::args().nth(2).unwrap().parse().unwrap();
    let arg3: i64 = env::args().nth(3).unwrap().parse().unwrap();
    let arg4: i64 = env::args().nth(4).unwrap().parse().unwrap();
    let arg5: i64 = env::args().nth(5).unwrap().parse().unwrap();
    let arg6: i64 = env::args().nth(6).unwrap().parse().unwrap();
    let arg7: i64 = env::args().nth(7).unwrap().parse().unwrap();
    let arg8: i64 = env::args().nth(8).unwrap().parse().unwrap();
    let arg9: i64 = env::args().nth(9).unwrap().parse().unwrap();

    let bar = Foobar {
        a: arg1,
        b: arg2,
        c: arg3,
        d: arg4,
        e: arg5,
        f: arg6,
        g: arg7,
        h: arg8,
        i: arg9,
    };

    // `bar` is moved into `echo` here.
    echo(bar);
}
Note that I define Foobar
with more than two fields, so that the compiler will not split the struct apart and
pass it to echo
field by field. I also disable inlining of echo
, so that we can observe the move
through a real function call.
I use Rust playground to generate the assembly code.
Assembly code in debug mode:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
...
callq core :: str :: < impl str > :: parse
movw %ax , 206 ( %rsp )
jmp .LBB80_9
.LBB80_9:
movw 206 ( %rsp ), %ax
movw %ax , 1212 ( %rsp )
movw 1212 ( %rsp ), %ax
movw %ax , 224 ( %rsp )
movzwl 224 ( %rsp ), %edi
leaq .L__unnamed_22 ( %rip ), %rsi
callq core :: result :: Result < T , E > :: unwrap
movb %al , 205 ( %rsp )
jmp .LBB80_10
.LBB80_109:
leaq 1032 ( %rsp ), %rdi
callq core :: ptr :: drop_in_place < std :: env :: Args >
movq 8 ( %rsp ), %rax
...
movb 205 ( %rsp ), %r11b
movb %r11b , 1120 ( %rsp )
movb %r10b , 1121 ( %rsp )
...
movq %rax , 1112 ( %rsp )
leaq 1128 ( %rsp ), %rdi
leaq 1064 ( %rsp ), %rsi
movl $64 , %edx
callq memcpy@PLT
leaq 1128 ( %rsp ), %rdi
callq playground :: echo
You can see that it uses memcpy
to copy the struct before passing it to echo
.
206(%rsp)
is one of the parsed arguments; after unwrap()
it becomes 205(%rsp)
, and it finally lands at
1120(%rsp)
, inside the 64-byte source block (starting at 1064(%rsp)) of memcpy
.
Let’s check the release version:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
...
callq * < std :: env :: Args as core :: iter :: traits :: iterator :: Iterator > :: next@GOTPCREL ( %rip )
...
callq * core :: num :: < impl core :: str :: traits :: FromStr for i64 > :: from_str@GOTPCREL ( %rip )
cmpb $0 , 8 ( %rsp )
jne .LBB11_72
movq 16 ( %rsp ), %rax
movq %rax , 144 ( %rsp )
.LBB11_352:
movq 144 ( %rsp ), %rax
movq %rax , 40 ( %rsp )
movq 136 ( %rsp ), %rax
...
leaq 40 ( %rsp ), %rdi
callq playground :: echo
No copy in release mode! It just passes a pointer to the struct to echo
.
144(%rsp)
is one of the parsed arguments; it gets assigned to the first slot of the Foobar
struct, 40(%rsp)
.
And then, leaq 40(%rsp), %rdi
takes the struct address. Interestingly, you can also see that the fields
are reordered: rustc is free to lay out the fields of a default-representation struct in any order, and here it places the i64 fields before the two one-byte fields.
So there is no need to worry about the performance of move
.
BTW, let’s change the code a bit and pass a reference instead.
Unsurprisingly, it passes a pointer in both debug and release mode.
1
2
leaq 1064 ( %rsp ), %rdi
callq playground :: echo
Copy
What about a struct that implements the Copy trait?
1
2
3
4
5
6
7
8
9
10
11
12
/// Same demo payload as before, now also `Copy` and `Clone`.
///
/// Because it is `Copy`, passing it by value leaves the original binding
/// usable afterwards.
#[derive(Debug, Copy, Clone)]
struct Foobar {
    a: i8,
    b: u8,
    c: i64,
    d: i64,
    e: i64,
    f: i64,
    g: i64,
    h: i64,
    i: i64,
}
Check the debug version of assembly code:
1
2
3
4
5
6
7
8
movq %rcx , 1104 ( %rsp )
movq %rax , 1112 ( %rsp )
leaq 1128 ( %rsp ), %rdi
leaq 1064 ( %rsp ), %rsi
movl $64 , %edx
callq memcpy@PLT
leaq 1128 ( %rsp ), %rdi
callq playground :: echo
Well, memcpy
happens.
Check the release version of assembly code:
1
2
leaq 40 ( %rsp ), %rdi
callq playground :: echo
No copy happens! Just like with move
, the compiler does not make a needless copy even if the Copy trait is implemented.
Even if you call echo
twice, the result is the same.
We could change the code and check again:
1
2
3
4
// The initializer is elided here; `bar` must be `mut` for the field write below.
let mut bar = .. .
// `Foobar` derives `Copy` in this variant, so `bar` stays usable after the call.
echo ( bar );
// Change one field between the two calls.
bar . a = 99 ;
echo ( bar );
Check the debug version of assembly code:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
movq %rax , 1112 ( %rsp )
leaq 1128 ( %rsp ), %rdi
leaq 1064 ( %rsp ), %rsi
movl $64 , %edx
callq memcpy@PLT
leaq 1128 ( %rsp ), %rdi
callq playground :: echo
movb $99 , 1120 ( %rsp )
leaq 1192 ( %rsp ), %rdi
leaq 1064 ( %rsp ), %rsi
movl $64 , %edx
callq memcpy@PLT
leaq 1192 ( %rsp ), %rdi
callq playground :: echo
It copies the struct for the first echo
, modifies the field of the original in place, and then makes another copy for the second echo
.
Check the release version of assembly code:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
leaq 40 ( %rsp ), %rdi
callq playground :: echo
movq %rbp , 40 ( %rsp )
movq %r14 , 48 ( %rsp )
movq %r12 , 56 ( %rsp )
movq %r13 , 64 ( %rsp )
movq %r15 , 72 ( %rsp )
movq 120 ( %rsp ), %rax
movq %rax , 80 ( %rsp )
movq 112 ( %rsp ), %rax
movq %rax , 88 ( %rsp )
movb $99 , 96 ( %rsp )
movb %bl , 97 ( %rsp )
leaq 40 ( %rsp ), %rdi
callq playground :: echo
The release assembly code reuses the same memory block in the stack to hold the struct content
and passes the address to echo
. Perfect!
Move in thread::spawn
Because plain function calls stay within the same chain of stack frames, Rust can optimize them without trouble.
But for closures, especially with threading, the copy is unavoidable, because the closure may be invoked
in a different context or even on a different OS thread!
Let’s confirm it.
1
2
3
// `move` transfers ownership of `bar` into the closure. `thread::spawn`
// needs a `'static` closure, and without `move` a `Copy` `Foobar` would be
// captured by reference and rejected by the compiler.
// The returned `JoinHandle` is discarded, so the thread is not joined here.
std::thread::spawn(move || {
    echo(bar);
});
We only check the release assembly code:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
...
callq * core :: num :: < impl core :: str :: traits :: FromStr for i64 > :: from_str@GOTPCREL ( %rip )
cmpb $0 , 8 ( %rsp )
jne .LBB41_345
movq 16 ( %rsp ), %r12
...
movq %r12 , 72 ( %rsp )
...
movl $88 , %edi
movl $8 , %esi
callq * __rust_alloc@GOTPCREL ( %rip )
testq %rax , %rax
je .LBB41_397
movq 88 ( %rsp ), %rcx
movq %rcx , 80 ( %rax )
movups 72 ( %rsp ), %xmm0
movups %xmm0 , 64 ( %rax )
movdqu 8 ( %rsp ), %xmm0
movups 24 ( %rsp ), %xmm1
movups 40 ( %rsp ), %xmm2
movups 56 ( %rsp ), %xmm3
movups %xmm3 , 48 ( %rax )
movups %xmm2 , 32 ( %rax )
movups %xmm1 , 16 ( %rax )
movdqu %xmm0 , ( %rax )
leaq .L__unnamed_23 ( %rip ), %rcx
leaq 112 ( %rsp ), %rdi
movq %r15 , %rsi
movq %rax , %rdx
callq * std :: sys :: unix :: thread :: Thread :: new@GOTPCREL ( %rip )
You can see that it allocates on the heap and copies the struct from the stack.
Take one field as an example: %r12
-> 72(%rsp)
-> %xmm0
-> 64(%rax)
.
Box
Box
is used to manage memory on the heap. In fact, Rust prefers the stack. As is well known, struct/enum allocation
and initialization only happen on the stack. Even if you wrap it with Box::new()
, you still need to
construct a struct instance on the stack first.
That raises a question: does Rust optimize this so that the construction is done on the heap directly?
Let’s check by reusing the source code above with a small change:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/// Prints the boxed `Foobar` using its `Debug` formatting, followed by a
/// newline.
///
/// Takes ownership of the box; inlining is disabled so that the call stays a
/// real function call.
#[inline(never)]
fn echo(bar: Box<Foobar>) {
    println!("{:?}", bar);
}
.. .
// The `Foobar` value is written as a struct expression passed directly to `Box::new`.
let bar = Box ::new ( Foobar {
a : arg1 ,
b : arg2 ,
c : arg3 ,
d : arg4 ,
e : arg5 ,
f : arg6 ,
g : arg7 ,
h : arg8 ,
i : arg9 ,
});
// Ownership of the box moves into `echo`.
echo ( bar );
Check the debug assembly code:
1
2
3
4
5
6
7
8
9
10
callq alloc :: alloc :: exchange_malloc
...
movq ( %rsp ), %rdi
leaq 1072 ( %rsp ), %rsi
movl $64 , %edx
callq memcpy@PLT
movq ( %rsp ), %rax
movq %rax , 1216 ( %rsp )
movq ( %rsp ), %rdi
callq playground :: echo
Yes, as expected, it constructs the struct on the stack and copies it to the heap.
What about release version?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
callq * __rust_alloc@GOTPCREL ( %rip )
testq %rax , %rax
movl 76 ( %rsp ), %ecx
movzbl 75 ( %rsp ), %edx
je .LBB13_357
movq 112 ( %rsp ), %rsi
movq %rsi , ( %rax )
movq 104 ( %rsp ), %rsi
movq %rsi , 8 ( %rax )
movq 96 ( %rsp ), %rsi
movq %rsi , 16 ( %rax )
movq 88 ( %rsp ), %rsi
movq %rsi , 24 ( %rax )
movq 80 ( %rsp ), %rsi
movq %rsi , 32 ( %rax )
movq %r12 , 40 ( %rax )
movq %r15 , 48 ( %rax )
movb %cl , 56 ( %rax )
movb %dl , 57 ( %rax )
movq %rax , %rdi
callq playground :: echo
Wow! It allocates and initializes the struct on the heap directly!
match
match
is like C's switch
. In fact, a simple switch over constants like the one below generates linear comparison branches
instead of a jump table!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/// Prints `bar` with a prefix selected by the value of `bar.i`.
///
/// `3` prints the plain `Debug` output; each other listed constant prints
/// itself followed by the `Debug` output. Any value not listed reaches
/// `todo!()` and panics.
#[inline(never)]
fn echo(bar: Foobar) {
    match bar.i {
        3 => println!("{:?}", bar),
        99 => println!("99 {:?}", bar),
        88 => println!("88 {:?}", bar),
        188 => println!("188 {:?}", bar),
        288 => println!("288 {:?}", bar),
        388 => println!("388 {:?}", bar),
        488 => println!("488 {:?}", bar),
        588 => println!("588 {:?}", bar),
        688 => println!("688 {:?}", bar),
        788 => println!("788 {:?}", bar),
        888 => println!("888 {:?}", bar),
        // Every other value is left unhandled on purpose.
        _ => todo!(),
    }
}
Check the debug version of assembly code:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
playground: : echo:
subq $904 , %rsp
movq %rdi , 184 ( %rsp )
movq 48 ( %rdi ), %rax
movq %rax , 192 ( %rsp )
subq $3 , %rax
je .LBB79_2
jmp .LBB79_36
.LBB79_36:
movq 192 ( %rsp ), %rax
subq $88 , %rax
je .LBB79_4
jmp .LBB79_37
.LBB79_37:
movq 192 ( %rsp ), %rax
subq $99 , %rax
je .LBB79_3
jmp .LBB79_38
.LBB79_38:
movq 192 ( %rsp ), %rax
subq $188 , %rax
je .LBB79_5
jmp .LBB79_39
Check the release version of assembly code:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
playground: : echo:
subq $72 , %rsp
movq 48 ( %rdi ), %rax
cmpq $387 , %rax
jle .LBB10_1
cmpq $687 , %rax
jg .LBB10_14
cmpq $388 , %rax
je .LBB10_21
cmpq $488 , %rax
je .LBB10_22
cmpq $588 , %rax
jne .LBB10_17
movq %rdi , 8 ( %rsp )
leaq < playground :: Foobar as core :: fmt :: Debug > :: fmt ( %rip ), %rax
movq %rax , 16 ( %rsp )
leaq .L__unnamed_2 ( %rip ), %rax
jmp .LBB10_26
.LBB10_1:
cmpq $98 , %rax
jle .LBB10_2
cmpq $99 , %rax
je .LBB10_19
cmpq $188 , %rax
je .LBB10_20
cmpq $288 , %rax
jne .LBB10_17
Interestingly, the release version of assembly code splits the comparison branches into ranges to speed up the branch selection.