Attachments (1)
Change History (14)
by , 5 years ago
Attachment: | CIMG4605 (2).jpg added |
---|
comment:1 by , 5 years ago
I would guess there is a problem with https://git.haiku-os.org/haiku/commit/?id=7b4d924f98c08c57e6566b33aeda00cac2f6fec7 since that's the only boot related thing that changed in that range.
Does reverting this fix the problem?
comment:3 by , 5 years ago
This change is mainly moving code around, but there is some change in the system_time function.
New implementation (the rdtsc() function returns the result of rdtsc as a 64bit number):
extern "C" bigtime_t system_time() { return rdtsc() * gTimeConversionFactor; }
Old implementation for 32bit:
FUNCTION(system_time): /* load 64-bit factor into %eax (low), %edx (high) */ /* hand-assemble rdtsc -- read time stamp counter */ rdtsc /* time in %edx,%eax */ pushl %ebx movl gTimeConversionFactor, %ebx movl %edx, %ecx /* save high half */ mull %ebx /* truncate %eax, but keep %edx */ movl %ecx, %eax movl %edx, %ecx /* save high half of low */ mull %ebx /*, %eax*/ /* now compute [%edx, %eax] + [%ecx], propagating carry */ subl %ebx, %ebx /* need zero to propagate carry */ addl %ecx, %eax adc %ebx, %edx popl %ebx ret
Old implementation for 64bit:
extern "C" bigtime_t system_time() { uint64 lo, hi; asm("rdtsc": "=a"(lo), "=d"(hi)); return ((lo * gTimeConversionFactor) >> 32) + hi * gTimeConversionFactor; }
comment:4 by , 5 years ago
And indeed that's not correct, the previous code does an additionnal shift right by 32bits. So now our pause() is 4 million times too slow. Oops.
comment:6 by , 5 years ago
Does generated code of system_time OK? It looks less efficient than older version. system_time is time critical function and should run as fast as possible.
00000b54 <system_time>: b54: 55 push ebp b55: 89 e5 mov ebp,esp b57: 83 ec 5c sub esp,0x5c b5a: 57 push edi b5b: 56 push esi b5c: 53 push ebx b5d: e8 fc ff ff ff call b5e <system_time+0xa> b62: 89 45 c8 mov DWORD PTR [ebp-56],eax b65: 89 55 cc mov DWORD PTR [ebp-52],edx b68: 89 45 f0 mov DWORD PTR [ebp-16],eax b6b: 89 55 f4 mov DWORD PTR [ebp-12],edx b6e: c7 45 f8 00 00 00 00 mov DWORD PTR [ebp-8],0x0 b75: c7 45 fc 00 00 00 00 mov DWORD PTR [ebp-4],0x0 b7c: 89 55 b0 mov DWORD PTR [ebp-80],edx b7f: c7 45 b4 00 00 00 00 mov DWORD PTR [ebp-76],0x0 b86: 8b 0d 00 00 00 00 mov ecx,DWORD PTR ds:0x0 b8c: 8b 45 b0 mov eax,DWORD PTR [ebp-80] b8f: f7 e1 mul ecx b91: 8b 75 b0 mov esi,DWORD PTR [ebp-80] b94: 31 db xor ebx,ebx b96: 0f af f3 imul esi,ebx b99: 89 45 c0 mov DWORD PTR [ebp-64],eax b9c: 8b 45 b4 mov eax,DWORD PTR [ebp-76] b9f: 0f af c1 imul eax,ecx ba2: 01 f2 add edx,esi ba4: 89 55 c4 mov DWORD PTR [ebp-60],edx ba7: 8b 55 cc mov edx,DWORD PTR [ebp-52] baa: 23 15 bc 00 00 00 and edx,DWORD PTR ds:0xbc bb0: 89 55 b4 mov DWORD PTR [ebp-76],edx bb3: 01 45 c4 add DWORD PTR [ebp-60],eax bb6: 8b 45 c8 mov eax,DWORD PTR [ebp-56] bb9: 23 05 b8 00 00 00 and eax,DWORD PTR ds:0xb8 bbf: 89 45 b0 mov DWORD PTR [ebp-80],eax bc2: f7 e1 mul ecx bc4: 8b 75 b0 mov esi,DWORD PTR [ebp-80] bc7: 0f af f3 imul esi,ebx bca: 89 45 b8 mov DWORD PTR [ebp-72],eax bcd: 8b 45 b4 mov eax,DWORD PTR [ebp-76] bd0: 0f af c1 imul eax,ecx bd3: 01 f2 add edx,esi bd5: 89 55 bc mov DWORD PTR [ebp-68],edx bd8: 8b 55 c4 mov edx,DWORD PTR [ebp-60] bdb: 8b 75 c4 mov esi,DWORD PTR [ebp-60] bde: 31 ff xor edi,edi be0: 01 45 bc add DWORD PTR [ebp-68],eax be3: 8b 45 c0 mov eax,DWORD PTR [ebp-64] be6: 89 c2 mov edx,eax be8: 31 c0 xor eax,eax bea: 8b 4d b8 mov ecx,DWORD PTR [ebp-72] bed: 8b 5d bc mov ebx,DWORD PTR [ebp-68] bf0: 01 c1 add ecx,eax bf2: 11 d3 adc ebx,edx bf4: 39 da cmp edx,ebx bf6: 77 06 ja bfe <system_time+0xaa> bf8: 75 0a jne c04 <system_time+0xb0> bfa: 39 c8 cmp eax,ecx bfc: 76 06 jbe c04 <system_time+0xb0> bfe: 83 c6 01 add esi,0x1 c01: 83 d7 00 adc edi,0x0 c04: 89 4d d0 mov DWORD PTR [ebp-48],ecx c07: 89 5d d4 mov DWORD PTR [ebp-44],ebx c0a: 89 75 d8 mov DWORD PTR [ebp-40],esi c0d: 89 7d dc mov DWORD PTR [ebp-36],edi c10: 89 d9 mov ecx,ebx c12: 31 db xor ebx,ebx c14: 89 f0 mov eax,esi c16: 89 fa mov edx,edi c18: 89 c2 mov edx,eax c1a: 31 c0 xor eax,eax c1c: 09 c1 or ecx,eax c1e: 09 d3 or ebx,edx c20: 89 4d e0 mov DWORD PTR [ebp-32],ecx c23: 89 5d e4 mov DWORD PTR [ebp-28],ebx c26: 89 7d e8 mov DWORD PTR [ebp-24],edi c29: c7 45 ec 00 00 00 00 mov DWORD PTR [ebp-20],0x0 c30: 89 c8 mov eax,ecx c32: 89 da mov edx,ebx c34: 5b pop ebx c35: 5e pop esi c36: 5f pop edi c37: 89 ec mov esp,ebp c39: 5d pop ebp c3a: c3 ret c3b: 90 nop
comment:7 by , 5 years ago
I don't like having to write code in assembler just because it's more efficient. Is there a way to make the generated code better but staying in C++?
comment:8 by , 5 years ago
This source code:
extern "C" bigtime_t system_time() { uint64 tsc = rdtsc(); uint64 lo = (uint32)tsc; uint64 hi = tsc >> 32; return ((lo * gTimeConversionFactor) >> 32) + hi * gTimeConversionFactor; }
produce a bit better machine code:
00000b54 <system_time>: b54: 55 push ebp b55: 89 e5 mov ebp,esp b57: 83 ec 2c sub esp,0x2c b5a: 57 push edi b5b: 56 push esi b5c: 53 push ebx b5d: e8 fc ff ff ff call b5e <system_time+0xa> b62: 89 c6 mov esi,eax b64: 89 d7 mov edi,edx b66: 89 c1 mov ecx,eax b68: 89 7d f8 mov DWORD PTR [ebp-8],edi b6b: c7 45 fc 00 00 00 00 mov DWORD PTR [ebp-4],0x0 b72: a1 00 00 00 00 mov eax,ds:0x0 b77: 31 d2 xor edx,edx b79: 89 55 e4 mov DWORD PTR [ebp-28],edx b7c: 8b 75 e4 mov esi,DWORD PTR [ebp-28] b7f: 89 45 e0 mov DWORD PTR [ebp-32],eax b82: f7 e1 mul ecx b84: 0f af f1 imul esi,ecx b87: 8b 4d e0 mov ecx,DWORD PTR [ebp-32] b8a: 31 db xor ebx,ebx b8c: 0f af cb imul ecx,ebx b8f: 89 45 f0 mov DWORD PTR [ebp-16],eax b92: 8b 45 f8 mov eax,DWORD PTR [ebp-8] b95: 01 f2 add edx,esi b97: 89 55 f4 mov DWORD PTR [ebp-12],edx b9a: 01 4d f4 add DWORD PTR [ebp-12],ecx b9d: f7 65 e0 mul DWORD PTR [ebp-32] ba0: 8b 4d f8 mov ecx,DWORD PTR [ebp-8] ba3: 0f af 4d e4 imul ecx,DWORD PTR [ebp-28] ba7: 89 45 e8 mov DWORD PTR [ebp-24],eax baa: 8b 45 e0 mov eax,DWORD PTR [ebp-32] bad: 0f af 45 fc imul eax,DWORD PTR [ebp-4] bb1: 8b 5d f4 mov ebx,DWORD PTR [ebp-12] bb4: 31 f6 xor esi,esi bb6: 01 ca add edx,ecx bb8: 89 55 ec mov DWORD PTR [ebp-20],edx bbb: 01 45 ec add DWORD PTR [ebp-20],eax bbe: 89 d8 mov eax,ebx bc0: 89 f2 mov edx,esi bc2: 03 45 e8 add eax,DWORD PTR [ebp-24] bc5: 13 55 ec adc edx,DWORD PTR [ebp-20] bc8: 5b pop ebx bc9: 5e pop esi bca: 5f pop edi bcb: 89 ec mov esp,ebp bcd: 5d pop ebp bce: c3 ret bcf: 90 nop
comment:9 by , 5 years ago
Just for interest tested other compilers.
setarch x86 gcc -c -O3 time.cpp
:
00000000 <system_time>: 0: 55 push ebp 1: 89 e5 mov ebp,esp 3: 57 push edi 4: 56 push esi 5: 53 push ebx 6: e8 fc ff ff ff call 7 <system_time+0x7> b: 81 c3 02 00 00 00 add ebx,0x2 11: 83 ec 1c sub esp,0x1c 14: e8 fc ff ff ff call 15 <system_time+0x15> 19: 89 55 e4 mov DWORD PTR [ebp-28],edx 1c: 8b 93 00 00 00 00 mov edx,DWORD PTR [ebx] 22: 8b 32 mov esi,DWORD PTR [edx] 24: f7 e6 mul esi 26: 89 d0 mov eax,edx 28: 31 d2 xor edx,edx 2a: 89 c1 mov ecx,eax 2c: 8b 45 e4 mov eax,DWORD PTR [ebp-28] 2f: 89 d3 mov ebx,edx 31: f7 e6 mul esi 33: 01 c8 add eax,ecx 35: 11 da adc edx,ebx 37: 83 c4 1c add esp,0x1c 3a: 5b pop ebx 3b: 5e pop esi 3c: 5f pop edi 3d: 5d pop ebp 3e: c3 ret
clang -c -O3 time.cpp
:
00000000 <system_time>: 0: 55 push ebp 1: 89 e5 mov ebp,esp 3: 57 push edi 4: 56 push esi 5: e8 fc ff ff ff call 6 <system_time+0x6> a: 89 d1 mov ecx,edx c: 8b 3d 00 00 00 00 mov edi,DWORD PTR ds:0x0 12: f7 e7 mul edi 14: 89 d6 mov esi,edx 16: 89 c8 mov eax,ecx 18: f7 e7 mul edi 1a: 01 f0 add eax,esi 1c: 83 d2 00 adc edx,0x0 1f: 5e pop esi 20: 5f pop edi 21: 5d pop ebp 22: c3 ret
Clang is the best, almost same as handwritten assembly.
comment:10 by , 5 years ago
If you want better optimized code, shouldn't you yank rdtsc
inside the cpu.cpp file as a static inline assembly function? A bit of function overhead for essentially a single instruction as-is?
I see the only other place it's called from is bios_ia32/debug.cpp, but do we need that level of granularity in debug? Can replace with system_time()
.
E.g.
static inline __attribute__((always_inline)) uint64_t tdtsc() { uint64_t msr; asm volatile("rdtsc\n\t" "shl $32, %%rdx\n\t" "or %%rdx, %0" : "=a" (msr) : : "rdx"); return msr; }
comment:11 by , 5 years ago
Indeed, I assumed that rdtsc() would be inlined, but I see it isn't. We should fix that.
comment:12 by , 5 years ago
gcc2 with inlined rdtsc:
00000b8c <system_time>: b8c: 55 push %ebp b8d: 89 e5 mov %esp,%ebp b8f: 83 ec 2c sub $0x2c,%esp b92: 0f 31 rdtsc b94: 57 push %edi b95: 56 push %esi b96: 53 push %ebx b97: 89 c6 mov %eax,%esi b99: 89 d7 mov %edx,%edi b9b: 89 c1 mov %eax,%ecx b9d: 89 7d f8 mov %edi,0xfffffff8(%ebp) ba0: c7 45 fc 00 00 00 00 movl $0x0,0xfffffffc(%ebp) ba7: a1 00 00 00 00 mov 0x0,%eax bac: 31 d2 xor %edx,%edx bae: 89 55 e4 mov %edx,0xffffffe4(%ebp) bb1: 8b 75 e4 mov 0xffffffe4(%ebp),%esi bb4: 89 45 e0 mov %eax,0xffffffe0(%ebp) bb7: f7 e1 mul %ecx bb9: 0f af f1 imul %ecx,%esi bbc: 8b 4d e0 mov 0xffffffe0(%ebp),%ecx bbf: 31 db xor %ebx,%ebx bc1: 0f af cb imul %ebx,%ecx bc4: 89 45 f0 mov %eax,0xfffffff0(%ebp) bc7: 8b 45 f8 mov 0xfffffff8(%ebp),%eax bca: 01 f2 add %esi,%edx bcc: 89 55 f4 mov %edx,0xfffffff4(%ebp) bcf: 01 4d f4 add %ecx,0xfffffff4(%ebp) bd2: f7 65 e0 mull 0xffffffe0(%ebp) bd5: 8b 4d f8 mov 0xfffffff8(%ebp),%ecx bd8: 0f af 4d e4 imul 0xffffffe4(%ebp),%ecx bdc: 89 45 e8 mov %eax,0xffffffe8(%ebp) bdf: 8b 45 e0 mov 0xffffffe0(%ebp),%eax be2: 0f af 45 fc imul 0xfffffffc(%ebp),%eax be6: 8b 5d f4 mov 0xfffffff4(%ebp),%ebx be9: 31 f6 xor %esi,%esi beb: 01 ca add %ecx,%edx bed: 89 55 ec mov %edx,0xffffffec(%ebp) bf0: 01 45 ec add %eax,0xffffffec(%ebp) bf3: 89 d8 mov %ebx,%eax bf5: 89 f2 mov %esi,%edx bf7: 03 45 e8 add 0xffffffe8(%ebp),%eax bfa: 13 55 ec adc 0xffffffec(%ebp),%edx bfd: 5b pop %ebx bfe: 5e pop %esi bff: 5f pop %edi c00: 89 ec mov %ebp,%esp c02: 5d pop %ebp c03: c3 ret
comment:13 by , 5 years ago
Milestone: | Unscheduled → R1/beta2 |
---|
Assign tickets with status=closed and resolution=fixed within the R1/beta2 development window to the R1/beta2 Milestone
Screen syslog