es07-rpi-arm, vfp, neon inst.ppt - koreatech
TRANSCRIPT
Embedded Systems KUT1-1
Embedded Systems
ARM Instruction, VFP Instruction, and NEON Instruction
Prof. Myung-Eui Lee (A-405)[email protected]
Embedded Systems KUT1-2
ARM Instructions
@ exam-add (ld): add two constants and hand the sum back as the exit status.
@ GNU as syntax, 32-bit ARM Linux.  Linked directly with ld, so the entry
@ symbol is _start.  After running, inspect the result with: echo $?
        .text
        .global _start
_start:
        .func   main
main:
        mov     r1, #3                  /* r1 <- 3 */
        mov     r2, #4                  /* r2 <- 4 */
        add     r0, r1, r2              /* r0 <- r1 + r2 */

        mov     r7, #1                  @ exit system call (status taken from r0)
        svc     0
test.s
ARM Assembly Language : exam-add (ld) Check Return Code : #echo $?
# Build test from test.s with the GNU cross assembler and linker.
# CC holds the toolchain prefix; the tool suffix (-as, -ld) is appended per rule.
CC = arm-linux-gnueabihf

.PHONY: all clean

all: test

# Link the object file directly with ld (no C runtime).
test: test.o
	${CC}-ld -o $@ $<

# Assemble with debug information.
test.o: test.s
	${CC}-as -g -o $@ $<

clean:
	rm -vf test *.o
makefile
Embedded Systems KUT1-3
ARM Instructions
@ exam-add (gcc): add two constants and hand the sum back as the exit status.
@ GNU as syntax, 32-bit ARM Linux.  Linked through gcc, so the entry symbol
@ is main.  After running, inspect the result with: echo $?
        .text
        .global main
        .func   main
main:
        mov     r1, #3                  /* r1 <- 3 */
        mov     r2, #4                  /* r2 <- 4 */
        add     r0, r1, r2              /* r0 <- r1 + r2 */

        mov     r7, #1                  @ exit system call (status taken from r0)
        svc     0
test.s
ARM Assembly Language : exam-add (gcc) Check Return Code : #echo $?
# Build test from test.s: assemble with as, then link through the gcc driver.
# CC holds the toolchain prefix; the tool suffix (-as, -gcc) is appended per rule.
CC = arm-linux-gnueabihf

.PHONY: all clean

all: test

# Link through gcc so that main is used as the program entry.
test: test.o
	${CC}-gcc -o $@ $<

# Assemble with debug information.
test.o: test.s
	${CC}-as -g -o $@ $<

clean:
	rm -vf test *.o
makefile
Embedded Systems KUT1-4
ARM Instructions
@ exam-add (gcc-printf): add two constants, store the sum in memory and print
@ it with printf.  GNU as syntax, 32-bit ARM Linux, linked through gcc.
@ The exit status is whatever printf leaves in r0, not the sum.
        .data
message:
        .asciz  "Result = %d\n"
        .balign 4                       @ the string is 13 bytes; realign for the word below
result:
        .word   0

        .text
        .global main
        .func   main
main:
        mov     r1, #3                  /* r1 <- 3 */
        mov     r2, #4                  /* r2 <- 4 */
        add     r0, r1, r2              /* r0 <- r1 + r2 */

        ldr     r3, =result             @ r3 <- &result
        str     r0, [r3]                @ result <- r0

        ldr     r0, =message            @ first printf argument: format string
        ldr     r1, [r3]                @ second printf argument: value of result
        bl      printf

        @ NOTE(review): the raw exit below bypasses the C library's normal exit
        @ path; output may be lost if stdout is not a terminal - confirm if that
        @ matters for this example.
        mov     r7, #1                  @ exit
        svc     0

        .extern printf
test.s
ARM Assembly Language : exam-add (gcc-printf) Check Return Code : no!!!, display on the screen
# Build test from test.s: assemble with as, then link through the gcc driver
# (the program calls printf, so the C library must be linked in).
CC = arm-linux-gnueabihf

.PHONY: all clean

all: test

test: test.o
	${CC}-gcc -o $@ $<

# Assemble with debug information.
test.o: test.s
	${CC}-as -g -o $@ $<

clean:
	rm -vf test *.o
makefile
Embedded Systems KUT1-5
ARM Instructions
@ exam-hello (ld): print a greeting with the write system call, then exit.
@ GNU as syntax, 32-bit ARM Linux.  Linked directly with ld, so the entry
@ symbol is _start.  System call number goes in r7, arguments in r0-r2.
        .data
message:
        .ascii  "Hello World!\n"        @ write needs no terminating NUL
        .equ    MESSAGE_LEN, . - message @ 13 bytes, computed so it cannot drift

        .text
        .global _start
_start:
        .func   main
main:
        mov     r0, #1                  @ stdout (monitor)
        ldr     r1, =message            @ address of string
        mov     r2, #MESSAGE_LEN        @ length of string
        mov     r7, #4                  @ write
        svc     0

        mov     r0, #0                  @ report success instead of write's return value
        mov     r7, #1                  @ exit
        svc     0
test.s
ARM Assembly Language : exam-hello (ld)
# Build test from test.s with the GNU cross assembler and linker.
# CC holds the toolchain prefix; the tool suffix (-as, -ld) is appended per rule.
CC = arm-linux-gnueabihf

.PHONY: all clean

all: test

# Link the object file directly with ld (no C runtime).
test: test.o
	${CC}-ld -o $@ $<

# Assemble with debug information.
test.o: test.s
	${CC}-as -g -o $@ $<

clean:
	rm -vf test *.o
makefile
Embedded Systems KUT1-6
ARM Instructions
@ exam-hello (gcc): print a greeting through printf, then exit.
@ GNU as syntax, 32-bit ARM Linux, linked through gcc so main is the entry.
        .data
message:
        .asciz  "Hello World!\n"

        .text
        .global main
        .func   main
main:
        ldr     r0, =message            @ only printf argument: the string itself
        bl      printf

        @ NOTE(review): the raw exit below bypasses the C library's normal exit
        @ path; output may be lost if stdout is not a terminal - confirm if that
        @ matters for this example.
        mov     r7, #1                  @ exit
        svc     0

        .extern printf
test.s
ARM Assembly Language : exam-hello (gcc)
# Build test from test.s: assemble with as, then link through the gcc driver
# (the program calls printf, so the C library must be linked in).
CC = arm-linux-gnueabihf

.PHONY: all clean

all: test

test: test.o
	${CC}-gcc -o $@ $<

# Assemble with debug information.
test.o: test.s
	${CC}-as -g -o $@ $<

clean:
	rm -vf test *.o
makefile
Embedded Systems KUT1-7
ARM Instructions
@ load-store (gcc), first part: define two word variables and store 3 and 4
@ into them.  GNU as syntax, 32-bit ARM Linux.  The listing continues after
@ the slide caption with the loads and the call to func_add.
        .data
myvar1:
        .word   0
myvar2:
        .word   0

        .text
        .global main
        .func   main
main:
        ldr     r1, =myvar1             @ r1 <- &myvar1
        mov     r3, #3                  @ r3 <- 3
        str     r3, [r1]                @ *r1 <- r3

        ldr     r2, =myvar2             @ r2 <- &myvar2
        mov     r3, #4                  @ r3 <- 4
        str     r3, [r2]                @ *r2 <- r3
test.s
ARM Assembly Language : load-store (gcc) - Same makefile as the previous example; check the return code with #echo $?
@ load-store (gcc), second part: reload both variables, add them in func_add
@ and exit with the sum as the status (r0 is left untouched between the call
@ and the exit system call).
        ldr     r1, =myvar1             @ r1 <- &myvar1
        ldr     r1, [r1]                @ r1 <- *r1

        ldr     r2, =myvar2             @ r2 <- &myvar2
        ldr     r2, [r2]                @ r2 <- *r2

        bl      func_add                @ r0 <- r1 + r2

        mov     r7, #1                  @ exit
        svc     0

@ func_add: r0 <- r1 + r2
@ In:  r1, r2 = operands
@ Out: r0 = sum
func_add:
        add     r0, r1, r2              @ r0 <- r1 + r2
        bx      lr                      @ return to the instruction after bl
Embedded Systems KUT1-8
ARM Instructions
Advanced SIMD : Neon
» https://developer.arm.com/architectures/instruction-sets/simd-isas/neon
» https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/neon-programmers-guide-for-armv8-a
Neon Intrinsics» Function calls that the compiler replaces with an
appropriate Neon instruction or sequence of Neon instructions.
» Intrinsics provide almost as much control as writing assembly language, but leave the allocation of registers to the compiler, so that developers can focus on the algorithms.
Difference between Neon and VFP» Neon = Parallel/128 bit, VFP = Sequential/64 bit
Embedded Systems KUT1-9
ARM Instructions
Parallel Vector Process
Embedded Systems KUT1-10
ARM Instructions
ARMv8 Vector Registers
3 types of Instruction» VFP» Both VFP and Neon» Neon
* Note: “NEON Programmers Guide”, p.168
Embedded Systems KUT1-11
ARM Instructions
#include <stdio.h> #include <arm_neon.h> //need to use intrinsics
int main(){ //vector addition 8x8 example.
uint8x8_t vec_a, vec_b, vec_dest; // a vector of 8 x 8bit ints vec_a = vdup_n_u8(9); vec_b = vdup_n_u8(10);
vec_dest = vec_a * vec_b; // 90
int i = 0; int result;
result = vget_lane_u8( vec_dest, 0 ); printf( "Lane %d: %d\n", i, result ); i++;
result = vget_lane_u8( vec_dest, 1 ); printf( "Lane %d: %d\n", i, result ); i++;
result = vget_lane_u8( vec_dest, 2 ); printf( "Lane %d: %d\n", i, result ); i++;
result = vget_lane_u8( vec_dest, 3 ); printf( "Lane %d: %d\n", i, result ); i++;
result = vget_lane_u8( vec_dest, 4 ); printf( "Lane %d: %d\n", i, result ); i++;
result = vget_lane_u8( vec_dest, 5 ); printf( "Lane %d: %d\n", i, result ); i++;
result = vget_lane_u8( vec_dest, 6 ); printf( "Lane %d: %d\n", i, result ); i++;
result = vget_lane_u8( vec_dest, 7 ); printf( "Lane %d: %d\n", i, result ); }
test.c
ARM SIMD Language : NEON
Embedded Systems KUT1-12
ARM Instructions
# Build the Neon intrinsics example test.c with the cross gcc driver.
# CC holds the toolchain prefix; CFLAGS selects the target CPU and enables Neon.
CC = arm-linux-gnueabihf
CFLAGS = -g -march=armv8-a -mtune=cortex-a53 -mfpu=neon

.PHONY: all clean

all:
	${CC}-gcc $(CFLAGS) -o test test.c

clean:
	rm -vf test *.o
makefile
ARM SIMD Language : NEON
Four options for NEON and VFP
Raspberry Pi 3 Target Configuration
/usr/src/linux/.config
Embedded Systems KUT1-13
NEON Data Types
ARM Instructions
• Polynomial arithmetic is useful when implementing certain cryptography or data integrity algorithms.
• “NEON Programmers Guide”, p.70
vdup Intrinsic: Set all lanes to the same value
Embedded Systems KUT1-14
ARM Instructions
vget Intrinsic: Extract lanes from a vector
* Note: “RealView Compilation Tools Compiler Reference Guide”, p 482
Embedded Systems KUT1-15
NEON Debug : gdb
# gdb test
(gdb) start
(gdb) info vector
ARM Instructions
Embedded Systems KUT1-16
NEON Dis-assemble code(gdb)disassem n n n . . . . info vector run
ARM Instructions