前言
这道题还是参考了别人的writeup,学到了内存对齐的知识。
分析
先分析源代码memcpy.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121// compiled with : gcc -o memcpy memcpy.c -m32 -lm
unsigned long long rdtsc(){
asm("rdtsc");
}
char* slow_memcpy(char* dest, const char* src, size_t len){
int i;
for (i=0; i<len; i++) {
dest[i] = src[i];
}
return dest;
}
char* fast_memcpy(char* dest, const char* src, size_t len){
size_t i;
// 64-byte block fast copy
if(len >= 64){
i = len / 64;
len &= (64-1);
while(i-- > 0){
__asm__ __volatile__ (
"movdqa (%0), %%xmm0\n"
"movdqa 16(%0), %%xmm1\n"
"movdqa 32(%0), %%xmm2\n"
"movdqa 48(%0), %%xmm3\n"
"movntps %%xmm0, (%1)\n"
"movntps %%xmm1, 16(%1)\n"
"movntps %%xmm2, 32(%1)\n"
"movntps %%xmm3, 48(%1)\n"
::"r"(src),"r"(dest):"memory");
dest += 64;
src += 64;
}
}
// byte-to-byte slow copy
if(len) slow_memcpy(dest, src, len);
return dest;
}
int main(void){
setvbuf(stdout, 0, _IONBF, 0);
setvbuf(stdin, 0, _IOLBF, 0);
printf("Hey, I have a boring assignment for CS class.. :(\n");
printf("The assignment is simple.\n");
printf("-----------------------------------------------------\n");
printf("- What is the best implementation of memcpy? -\n");
printf("- 1. implement your own slow/fast version of memcpy -\n");
printf("- 2. compare them with various size of data -\n");
printf("- 3. conclude your experiment and submit report -\n");
printf("-----------------------------------------------------\n");
printf("This time, just help me out with my experiment and get flag\n");
printf("No fancy hacking, I promise :D\n");
unsigned long long t1, t2;
int e;
char* src;
char* dest;
unsigned int low, high;
unsigned int size;
// allocate memory
char* cache1 = mmap(0, 0x4000, 7, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
char* cache2 = mmap(0, 0x4000, 7, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
src = mmap(0, 0x2000, 7, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
size_t sizes[10];
int i=0;
// setup experiment parameters
for(e=4; e<14; e++){ // 2^13 = 8K
low = pow(2,e-1);
high = pow(2,e);
printf("specify the memcpy amount between %d ~ %d : ", low, high);
scanf("%d", &size);
if( size < low || size > high ){
printf("don't mess with the experiment.\n");
exit(0);
}
sizes[i++] = size;
}
sleep(1);
printf("ok, lets run the experiment with your configuration\n");
sleep(1);
// run experiment
for(i=0; i<10; i++){
size = sizes[i];
printf("experiment %d : memcpy with buffer size %d\n", i+1, size);
dest = malloc( size );
memcpy(cache1, cache2, 0x4000); // to eliminate cache effect
t1 = rdtsc();
slow_memcpy(dest, src, size); // byte-to-byte memcpy
t2 = rdtsc();
printf("ellapsed CPU cycles for slow_memcpy : %llu\n", t2-t1);
memcpy(cache1, cache2, 0x4000); // to eliminate cache effect
t1 = rdtsc();
fast_memcpy(dest, src, size); // block-to-block memcpy
t2 = rdtsc();
printf("ellapsed CPU cycles for fast_memcpy : %llu\n", t2-t1);
printf("\n");
}
printf("thanks for helping my experiment!\n");
printf("flag : ----- erased in this source code -----\n");
return 0;
}
程序的大概逻辑是比较两个内存复制算法slow_memcpy
和fast_memcpy
的效率。其中,程序要求输入10次要分配的内存大小,每次大小都限定在两个相邻的2的幂次之间,并且使用malloc
函数来分配内存,如果所有步骤执行完毕程序会在最后输出flag。
先是按照要求跑一遍程序,发现程序最后会在某一次比较中突然停下,也就是说程序异常退出了。当时真不知道咋回事,也没有自己编译调试程序,于是便参考了人家的writeup,并有了思路。
fast_memcpy
函数中用于内存复制的两个指令movdqa
和movntps
他们的操作数如果是内存地址的话,那么这个地址必须是16字节对齐的,否则会产生一般保护性异常导致程序退出。malloc
在分配内存时它实际上还会多分配4字节用于存储堆块信息,所以如果分配a字节实际上分配的是a+4
字节。另外32位系统上该函数分配的内存是以8字节对齐的。
有了这两点就知道程序的异常退出是因为分配的内存没有16字节对齐,那么要getflag只需要每次分配的内存地址能够被16整除就可以了(实际上由于malloc
函数分配的内存8字节对齐,只要内存大小除以16的余数大于9就可以了)。下面贴出exp1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26from socket import *
import time
def work():
host = 'pwnable.kr'
port = 9022
sock = socket(AF_INET,SOCK_STREAM)
sock.connect((host,port))
print sock.recv(100)
for i in xrange(4,14):
tmp = 2**i-4
print sock.recv(1024),tmp
time.sleep(1)
sock.send(str(tmp)+'\n')
while True:
r = sock.recv(1024)
if r:
print r
time.sleep(0.5)
else:
break
work()
1 | C:\Users\admin\Desktop>python 2.py |
总结
通过这道题又学习到了新知识。学到了两个命令,以及32位系统上malloc
函数分配时另加4字节的堆块信息内存8字节对齐。