Rust is a complex programming language with a lot of concepts. When you find a concept hard to
understand, checking the generated assembly code can be helpful. It’s my favourite way to learn Rust.
In Rust, we use move most of the time. It happens in variable assignments, function arguments and closures.
move means ownership transfer, i.e. once you move an object, you can no longer access it through the original
variable binding. But what happens in the assembly code?
Let’s take a simple example:
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
  
use   std ::env ; 
 
 // Demo struct that is passed to `echo` by value in the examples.
 // It has nine fields: one i8, one u8 and seven i64. The accompanying text
 // notes that it deliberately has more than two fields so that it is not
 // passed to `echo` field by field.
 #[derive(Debug)] 
 struct  Foobar   { 
      a : i8 , 
      b : u8 , 
      c : i64 , 
      d : i64 , 
      e : i64 , 
      f : i64 , 
      g : i64 , 
      h : i64 , 
      i : i64 , 
 } 
 
 // Prints `bar` using its Debug formatting.
 // `bar` is taken by value, so the caller's `Foobar` is moved into this call.
 // `#[inline(never)]` asks the compiler not to inline this function, so the
 // call remains visible as a real function call in the generated assembly.
 #[inline(never)] 
 fn  echo ( bar : Foobar )   { 
      println! ( "{:?}" ,   bar ); 
 } 
 
 // Entry point: builds a `Foobar` from command-line arguments 1 through 9
 // and passes it by value to `echo`.
 fn  main ()   { 
      // Each argument is fetched with a fresh `env::args()` iterator and parsed
      // into the type of the field it will fill. Either `unwrap()` panics if the
      // argument is missing or cannot be parsed.
      let   arg1   =   env ::args (). nth ( 1 ). unwrap (). parse ::< i8 > (). unwrap (); 
      let   arg2   =   env ::args (). nth ( 2 ). unwrap (). parse ::< u8 > (). unwrap (); 
      let   arg3   =   env ::args (). nth ( 3 ). unwrap (). parse ::< i64 > (). unwrap (); 
      let   arg4   =   env ::args (). nth ( 4 ). unwrap (). parse ::< i64 > (). unwrap (); 
      let   arg5   =   env ::args (). nth ( 5 ). unwrap (). parse ::< i64 > (). unwrap (); 
      let   arg6   =   env ::args (). nth ( 6 ). unwrap (). parse ::< i64 > (). unwrap (); 
      let   arg7   =   env ::args (). nth ( 7 ). unwrap (). parse ::< i64 > (). unwrap (); 
      let   arg8   =   env ::args (). nth ( 8 ). unwrap (). parse ::< i64 > (). unwrap (); 
      let   arg9   =   env ::args (). nth ( 9 ). unwrap (). parse ::< i64 > (). unwrap (); 
 
      // Field values come from runtime input rather than literals.
      let   bar   =   Foobar   { 
          a : arg1 , 
          b : arg2 , 
          c : arg3 , 
          d : arg4 , 
          e : arg5 , 
          f : arg6 , 
          g : arg7 , 
          h : arg8 , 
          i : arg9 , 
      }; 
      // `bar` is moved into `echo` here; this is the call examined in the assembly.
      echo ( bar ); 
 } 
  
 
Note that I define Foobar with more than two fields, so that LLVM does not split the struct and
pass the fields to echo one by one. I also disable inlining of echo, so that we can observe the move
across a real function call.
I use the Rust playground to generate the assembly code.
Assembly code in debug mode:
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
  
... 
	callq 	core :: str :: < impl  str > :: parse 
 	movw 	%ax ,  206 ( %rsp ) 
 	jmp 	.LBB80_9 
 
 .LBB80_9: 
	movw 	206 ( %rsp ),  %ax 
 	movw 	%ax ,  1212 ( %rsp ) 
 	movw 	1212 ( %rsp ),  %ax 
 	movw 	%ax ,  224 ( %rsp ) 
 	movzwl 	224 ( %rsp ),  %edi 
 	leaq 	.L__unnamed_22 ( %rip ),  %rsi 
 	callq 	core :: result :: Result < T , E > :: unwrap 
 	movb 	%al ,  205 ( %rsp ) 
 	jmp 	.LBB80_10 
 
 .LBB80_109: 
	leaq 	1032 ( %rsp ),  %rdi 
 	callq 	core :: ptr :: drop_in_place < std :: env :: Args > 
 	movq 	8 ( %rsp ),  %rax 
 ... 
	movb 	205 ( %rsp ),  %r11b 
 	movb 	%r11b ,  1120 ( %rsp ) 
 	movb 	%r10b ,  1121 ( %rsp ) 
 ... 
	movq 	%rax ,  1112 ( %rsp ) 
 	leaq 	1128 ( %rsp ),  %rdi 
 	leaq 	1064 ( %rsp ),  %rsi 
 	movl 	$64 ,  %edx 
 	callq 	memcpy@PLT 
 	leaq 	1128 ( %rsp ),  %rdi 
 	callq 	playground :: echo 
  
 
You can see that it uses memcpy to copy the struct and passes the copy to echo.
206(%rsp) holds one of the parsed arguments; after unwrap() it is stored at 205(%rsp), and finally it is written to
1120(%rsp), which lies inside the 64-byte memcpy source that starts at 1064(%rsp).
Let’s check the release version:
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
  
... 
	callq 	* < std :: env :: Args  as  core :: iter :: traits :: iterator :: Iterator > :: next@GOTPCREL ( %rip ) 
 ... 
	callq 	* core :: num :: < impl  core :: str :: traits :: FromStr  for  i64 > :: from_str@GOTPCREL ( %rip ) 
 	cmpb 	$0 ,  8 ( %rsp ) 
 	jne 	.LBB11_72 
 	movq 	16 ( %rsp ),  %rax 
 	movq 	%rax ,  144 ( %rsp ) 
 
 .LBB11_352: 
	movq 	144 ( %rsp ),  %rax 
 	movq 	%rax ,  40 ( %rsp ) 
 	movq 	136 ( %rsp ),  %rax 
 ... 
	leaq 	40 ( %rsp ),  %rdi 
 	callq 	playground :: echo 
  
 
No copy in release mode! It just passes a pointer to the struct to echo.
144(%rsp) holds one of the parsed arguments; it gets assigned to the first slot of the Foobar struct, 40(%rsp).
Then leaq	40(%rsp), %rdi loads the struct address. Interestingly, you can also see that the fields
have been reordered by the compiler (Rust's default struct layout does not guarantee declaration order).
So there is no need to worry about the performance of move.
BTW, let’s change the code a bit and pass a reference instead.
Then, no surprise, it passes a pointer in both debug and release mode.
1
 2
  
	leaq 	1064 ( %rsp ),  %rdi 
 	callq 	playground :: echo 
  
 
How about a struct with the Copy trait?
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
  
// Variant of the demo struct that additionally derives `Copy` and `Clone`,
// so passing it by value copies it instead of moving it.
// The fields are the same as before: one i8, one u8 and seven i64.
#[derive(Debug, Copy, Clone)] 
 struct  Foobar   { 
      a : i8 , 
      b : u8 , 
      c : i64 , 
      d : i64 , 
      e : i64 , 
      f : i64 , 
      g : i64 , 
      h : i64 , 
      i : i64 , 
 } 
  
 
Check the debug version of assembly code:
1
 2
 3
 4
 5
 6
 7
 8
  
	movq 	%rcx ,  1104 ( %rsp ) 
 	movq 	%rax ,  1112 ( %rsp ) 
 	leaq 	1128 ( %rsp ),  %rdi 
 	leaq 	1064 ( %rsp ),  %rsi 
 	movl 	$64 ,  %edx 
 	callq 	memcpy@PLT 
 	leaq 	1128 ( %rsp ),  %rdi 
 	callq 	playground :: echo 
  
 
Well, memcpy happens.
Check the release version of assembly code:
1
 2
  
	leaq 	40 ( %rsp ),  %rdi 
 	callq 	playground :: echo 
  
 
No copy happens! Just like with move, the compiler does not make a needless copy even though the Copy trait is implemented.
Even if you call echo twice, the same thing happens.
We can change the code and check again:
1
 2
 3
 4
  
     let   mut   bar   =   .. . 
      echo ( bar ); 
      bar . a   =   99 ; 
      echo ( bar ); 
  
 
Check the debug version of assembly code:
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
  
	movq 	%rax ,  1112 ( %rsp ) 
 	leaq 	1128 ( %rsp ),  %rdi 
 	leaq 	1064 ( %rsp ),  %rsi 
 	movl 	$64 ,  %edx 
 	callq 	memcpy@PLT 
 	leaq 	1128 ( %rsp ),  %rdi 
 	callq 	playground :: echo 
 	movb 	$99 ,  1120 ( %rsp ) 
 	leaq 	1192 ( %rsp ),  %rdi 
 	leaq 	1064 ( %rsp ),  %rsi 
 	movl 	$64 ,  %edx 
 	callq 	memcpy@PLT 
 	leaq 	1192 ( %rsp ),  %rdi 
 	callq 	playground :: echo 
  
 
It copies the struct for the first echo call, modifies a field in place, and makes another copy for the second echo call.
Check the release version of the assembly code:
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
  
	leaq 	40 ( %rsp ),  %rdi 
 	callq 	playground :: echo 
 	movq 	%rbp ,  40 ( %rsp ) 
 	movq 	%r14 ,  48 ( %rsp ) 
 	movq 	%r12 ,  56 ( %rsp ) 
 	movq 	%r13 ,  64 ( %rsp ) 
 	movq 	%r15 ,  72 ( %rsp ) 
 	movq 	120 ( %rsp ),  %rax 
 	movq 	%rax ,  80 ( %rsp ) 
 	movq 	112 ( %rsp ),  %rax 
 	movq 	%rax ,  88 ( %rsp ) 
 	movb 	$99 ,  96 ( %rsp ) 
 	movb 	%bl ,  97 ( %rsp ) 
 	leaq 	40 ( %rsp ),  %rdi 
 	callq 	playground :: echo 
  
 
The release assembly code reuses the same memory block on the stack to hold the struct content
and passes its address to echo. Perfect!
Because the function calls happen within the same chain of stack frames, Rust can optimize them safely.
But for the closure case, especially for threading, the copy is unavoidable, because the closure invocation
happens in a different context or even in a different OS thread!
Let’s confirm it.
1
 2
 3
  
     std ::thread ::spawn ( ||   { 
          echo ( bar ); 
      }); 
  
 
We only check the release assembly code:
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
  
... 
	callq 	* core :: num :: < impl  core :: str :: traits :: FromStr  for  i64 > :: from_str@GOTPCREL ( %rip ) 
 	cmpb 	$0 ,  8 ( %rsp ) 
 	jne 	.LBB41_345 
 	movq 	16 ( %rsp ),  %r12 
 ... 
	movq 	%r12 ,  72 ( %rsp ) 
 ... 
	movl 	$88 ,  %edi 
 	movl 	$8 ,  %esi 
 	callq 	* __rust_alloc@GOTPCREL ( %rip ) 
 	testq 	%rax ,  %rax 
 	je 	.LBB41_397 
 	movq 	88 ( %rsp ),  %rcx 
 	movq 	%rcx ,  80 ( %rax ) 
 	movups 	72 ( %rsp ),  %xmm0 
 	movups 	%xmm0 ,  64 ( %rax ) 
 	movdqu 	8 ( %rsp ),  %xmm0 
 	movups 	24 ( %rsp ),  %xmm1 
 	movups 	40 ( %rsp ),  %xmm2 
 	movups 	56 ( %rsp ),  %xmm3 
 	movups 	%xmm3 ,  48 ( %rax ) 
 	movups 	%xmm2 ,  32 ( %rax ) 
 	movups 	%xmm1 ,  16 ( %rax ) 
 	movdqu 	%xmm0 ,  ( %rax ) 
 	leaq 	.L__unnamed_23 ( %rip ),  %rcx 
 	leaq 	112 ( %rsp ),  %rdi 
 	movq 	%r15 ,  %rsi 
 	movq 	%rax ,  %rdx 
 	callq 	* std :: sys :: unix :: thread :: Thread :: new@GOTPCREL ( %rip ) 
  
 
You can see that it allocates on the heap and copies from the stack.
Take one field as an example: %r12 -> 72(%rsp) -> %xmm0 -> 64(%rax).
Box is used to manage memory on the heap. In fact, Rust prefers the stack. As is well known, struct/enum allocation
and initialization only happen on the stack. Even if you wrap it with Box::new(), you still need to
construct a struct instance on the stack first.
Then we have a question: does Rust optimize this so that the construction is done on the heap directly?
Let’s check. We reuse the source code above and change it:
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
  
// Boxed variant of `echo`: takes ownership of a `Box<Foobar>` and prints it
// using Debug formatting.
// `#[inline(never)]` asks the compiler not to inline this function, so the
// call remains visible as a real function call in the generated assembly.
#[inline(never)] 
 fn  echo ( bar : Box < Foobar > )   { 
      println! ( "{:?}" ,   bar ); 
 } 
 .. . 
      let   bar   =   Box ::new ( Foobar   { 
          a : arg1 , 
          b : arg2 , 
          c : arg3 , 
          d : arg4 , 
          e : arg5 , 
          f : arg6 , 
          g : arg7 , 
          h : arg8 , 
          i : arg9 , 
      }); 
      echo ( bar ); 
  
 
Check the debug assembly code:
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
  
	callq 	alloc :: alloc :: exchange_malloc 
 ... 
	movq 	( %rsp ),  %rdi 
 	leaq 	1072 ( %rsp ),  %rsi 
 	movl 	$64 ,  %edx 
 	callq 	memcpy@PLT 
 	movq 	( %rsp ),  %rax 
 	movq 	%rax ,  1216 ( %rsp ) 
 	movq 	( %rsp ),  %rdi 
 	callq 	playground :: echo 
  
 
Yes, as expected, it constructs the struct on the stack and copies it to the heap.
What about the release version?
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
  
	callq 	* __rust_alloc@GOTPCREL ( %rip ) 
 	testq 	%rax ,  %rax 
 	movl 	76 ( %rsp ),  %ecx 
 	movzbl 	75 ( %rsp ),  %edx 
 	je 	.LBB13_357 
 	movq 	112 ( %rsp ),  %rsi 
 	movq 	%rsi ,  ( %rax ) 
 	movq 	104 ( %rsp ),  %rsi 
 	movq 	%rsi ,  8 ( %rax ) 
 	movq 	96 ( %rsp ),  %rsi 
 	movq 	%rsi ,  16 ( %rax ) 
 	movq 	88 ( %rsp ),  %rsi 
 	movq 	%rsi ,  24 ( %rax ) 
 	movq 	80 ( %rsp ),  %rsi 
 	movq 	%rsi ,  32 ( %rax ) 
 	movq 	%r12 ,  40 ( %rax ) 
 	movq 	%r15 ,  48 ( %rax ) 
 	movb 	%cl ,  56 ( %rax ) 
 	movb 	%dl ,  57 ( %rax ) 
 	movq 	%rax ,  %rdi 
 	callq 	playground :: echo 
  
 
Wow! It allocates and initializes the struct on the heap directly!
match is like the C switch. In fact, a simple switch over constants generates linear comparison branches
instead of a jump table!
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
  
// `match` variant of `echo`: dispatches on the value of `bar.i`.
// A value of 3 prints `bar` with no prefix; each of the other listed constants
// prints `bar` prefixed with that constant. Any other value reaches `todo!()`,
// which panics.
// `#[inline(never)]` asks the compiler not to inline this function, so the
// comparison chain can be inspected in the function's own assembly.
#[inline(never)] 
 fn  echo ( bar : Foobar )   { 
      match   bar . i   { 
          3   =>   println! ( "{:?}" ,   bar ), 
          99   =>   println! ( "99 {:?}" ,   bar ), 
          88   =>   println! ( "88 {:?}" ,   bar ), 
          188   =>   println! ( "188 {:?}" ,   bar ), 
          288   =>   println! ( "288 {:?}" ,   bar ), 
          388   =>   println! ( "388 {:?}" ,   bar ), 
          488   =>   println! ( "488 {:?}" ,   bar ), 
          588   =>   println! ( "588 {:?}" ,   bar ), 
          688   =>   println! ( "688 {:?}" ,   bar ), 
          788   =>   println! ( "788 {:?}" ,   bar ), 
          888   =>   println! ( "888 {:?}" ,   bar ), 
          // Fallback arm for every value not listed above.
          _   =>   todo! (), 
      } 
 } 
  
 
Check the debug version of assembly code:
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
  
playground: : echo: 
	subq 	$904 ,  %rsp 
 	movq 	%rdi ,  184 ( %rsp ) 
 	movq 	48 ( %rdi ),  %rax 
 	movq 	%rax ,  192 ( %rsp ) 
 	subq 	$3 ,  %rax 
 	je 	.LBB79_2 
 	jmp 	.LBB79_36 
 
 .LBB79_36: 
	movq 	192 ( %rsp ),  %rax 
 	subq 	$88 ,  %rax 
 	je 	.LBB79_4 
 	jmp 	.LBB79_37 
 
 .LBB79_37: 
	movq 	192 ( %rsp ),  %rax 
 	subq 	$99 ,  %rax 
 	je 	.LBB79_3 
 	jmp 	.LBB79_38 
 
 .LBB79_38: 
	movq 	192 ( %rsp ),  %rax 
 	subq 	$188 ,  %rax 
 	je 	.LBB79_5 
 	jmp 	.LBB79_39 
  
 
Check the release version of assembly code:
 1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
  
playground: : echo: 
	subq 	$72 ,  %rsp 
 	movq 	48 ( %rdi ),  %rax 
 	cmpq 	$387 ,  %rax 
 	jle 	.LBB10_1 
 	cmpq 	$687 ,  %rax 
 	jg 	.LBB10_14 
 	cmpq 	$388 ,  %rax 
 	je 	.LBB10_21 
 	cmpq 	$488 ,  %rax 
 	je 	.LBB10_22 
 	cmpq 	$588 ,  %rax 
 	jne 	.LBB10_17 
 	movq 	%rdi ,  8 ( %rsp ) 
 	leaq 	< playground :: Foobar  as  core :: fmt :: Debug > :: fmt ( %rip ),  %rax 
 	movq 	%rax ,  16 ( %rsp ) 
 	leaq 	.L__unnamed_2 ( %rip ),  %rax 
 	jmp 	.LBB10_26 
 
 .LBB10_1: 
	cmpq 	$98 ,  %rax 
 	jle 	.LBB10_2 
 	cmpq 	$99 ,  %rax 
 	je 	.LBB10_19 
 	cmpq 	$188 ,  %rax 
 	je 	.LBB10_20 
 	cmpq 	$288 ,  %rax 
 	jne 	.LBB10_17 
  
 
Interestingly, the release version of the assembly code splits the comparison branches into ranges to speed up branch selection.