Estudio de utilización efectiva de procesadores vectoriales · Universidad de Las Palmas de Gran Canaria
TRANSCRIPT
Universidad de Las Palmas de Gran Canaria
Estudio de utilización efectiva de procesadores vectoriales
Módulo adherido al simulador
Laura Autón García
Tutores: Francisca Quintana Domínguez, Roger Espasa Sans
Las Palmas de Gran Canaria, 17 de abril de 2014
Apéndice A
Módulo CORE
1 #ifndef CORE_H
2 #define CORE_H
34 #include "common.h"
5 #include <map >
6 #include <bitset >
78 extern PIN_MUTEX printLock;
#ifdef DEBUG_DEP
// Debug trace helper: streams X to cout while holding the global printLock.
// The three statements are wrapped in a single braced block so that the macro
// remains ONE statement when it is the body of an unbraced `if`/`for`; without
// the braces only the PIN_MutexLock call would be governed by the condition
// and the unlock would run unconditionally.
// A trailing ';' at the call site stays optional (it becomes an empty
// statement), so do not follow an unbraced use directly with `else`.
#define LOCK_PRINT(X)                 \
    {                                 \
        PIN_MutexLock(&printLock);    \
        cout << X;                    \
        PIN_MutexUnlock(&printLock);  \
    }
#else
// Tracing disabled: the macro expands to nothing and X is not evaluated.
#define LOCK_PRINT(X)
#endif
// Number of TLB levels; sizes every breakdownTLB[] array in this file.
// A definition made before this point takes precedence over the default.
#ifndef KNC_TLB_LVLS
#define KNC_TLB_LVLS 3
#endif

// Number of cache levels; sizes every breakdownCACHE[] array in this file.
// A definition made before this point takes precedence over the default.
#ifndef KNC_CACHE_LVLS
#define KNC_CACHE_LVLS 3
#endif
// ========================================================================
// Global footprint with common info for all threads/application
// ========================================================================

// Instruction categories. Each enumerator is used as a bit position in
// INS_FOOT_PRINT_t::insType, so one instruction can carry several of them
// (for example INS_TYPE_MEM together with INS_TYPE_V_VECTOR).
typedef enum
{
    INS_TYPE_NONVPU,
    INS_TYPE_V_VECTOR,
    INS_TYPE_V_SCALAR,
    INS_TYPE_MEM,
    INS_TYPE_NUM    // number of categories; keep last
} INS_TYPE_t;
3940 typedef struct
41 {
42 bitset <INS_TYPE_NUM > insType;
43 UINT32 latency;
44 UINT32 insSize;
45 UINT32 srcReg [6];
46 UINT32 dstReg [6];
47 string disassemble;
48 UINT64 routine;
49 }INS_FOOT_PRINT_t;
50
1
APENDICE A. MODULO CORE 2
51 typedef map <UINT64 , INS_FOOT_PRINT_t > FOOT_PRINT;
52 extern FOOT_PRINT Footprint;
5354 // ========================================================================
55 // Basic Block state: Tracks basic block detailed instruction breakdown
56 // ========================================================================
5758 typedef struct BBL_STATE_t
59 {
60 // Last level accesed to get the required data
61 INT32 tlbLevelHit;
62 INT32 cacheLevelHit;
6364 // Breakdown of cycles accumulated
65 UINT32 breakdownTLB[KNC_TLB_LVLS ];
66 UINT32 breakdownCACHE[KNC_CACHE_LVLS ];
6768 // Sumatory of breakdowns
69 UINT32 cycles;
7071 BBL_STATE_t(INT32 tlbHit , INT32 cacheHit):
72 tlbLevelHit(tlbHit),
73 cacheLevelHit(cacheHit),
74 breakdownTLB (),
75 breakdownCACHE (),
76 cycles (0){}
7778 }BBL_STATE_t;
7980 typedef std::pair <UINT64 , UINT32 > BBL_ENTRY_KEY;
81 typedef map <BBL_ENTRY_KEY , BBL_STATE_t > BBL_STATE;
8283 // ========================================================================
84 // Register File: Tracks register accesses
85 // ========================================================================
8687 typedef struct
88 {
89 // Cycle in which data in register will be available
90 COUNTER cycle;
9192 // Las instruction that wrote the register
93 BBL_ENTRY_KEY PC;
9495 // Breakdown of cycles if load
96 UINT32 breakdownTLB[KNC_TLB_LVLS ];
97 UINT32 breakdownCACHE[KNC_CACHE_LVLS ];
9899 }REG_FILE_STATE_t;
100101 typedef map <UINT32 , REG_FILE_STATE_t > REG_FILE;
102103 // ========================================================================
104 // State: lastest state of simulation
105 // ========================================================================
106107 typedef struct STATE
108 {
109 REG_FILE *regFile;
110 BBL_STATE *bbl;
111112 // Memory access breakdown of last instruction of each thread
113 struct MEMORY_STATE{
114 UINT32 breakdownTLB[KNC_TLB_LVLS ];
115 UINT32 breakdownCACHE[KNC_CACHE_LVLS ];
116 MEMORY_STATE (): breakdownTLB (), breakdownCACHE (){};
117 }* memory;
118119 // SHARED
120 COUNTER issue;
APENDICE A. MODULO CORE 3
121 COUNTER wBackMemory;
122123 // Information of last instruction that used pipeline
124 struct LAST_INS{
125 UINT32 tid;
126 BBL_ENTRY_KEY key;
127 LAST_INS(UINT32 tid=0, BBL_ENTRY_KEY key=make_pair (0,0)): tid(tid), key(key)←↩{};
128 }lastInstruction;
129130 STATE(): issue (0), wBackMemory (0);
131132 }STATE;
// ========================================================================
// Stats: accumulated stats of simulation. Both by ins and global
// ========================================================================

// Index into the breakdownSTALLS[] arrays: which kind of event the cycles are
// attributed to.
typedef enum
{
    INS_STALL_ISSUE,
    INS_STALL_NONVPU,
    INS_STALL_V_SCALAR,
    INS_STALL_V_VECTOR,
    INS_STALL_NUM       // number of stall kinds; keep last
} INS_STALL_t;
// Selects which breakdown array CORE::InsertBreakdownStats updates.
typedef enum
{
    STALLS_ENTRY,
    TLB_ENTRY,
    CACHE_ENTRY
} BREAKDOWN_t;
153154 typedef struct STATS_INS_s
155 {
156 // Bytes loaded from load instructions
157 UINT32 bytesLoaded;
158159 // Sumatory of breakdown
160 UINT32 cycles;
161162 // Breakdown of stalls accumulated
163 COUNTER breakdownSTALLS[INS_STALL_NUM ];
164165 // Breakdown of cycles accumulated
166 COUNTER breakdownTLB[KNC_TLB_LVLS ];
167 COUNTER breakdownCACHE[KNC_CACHE_LVLS ];
168169 STATS_INS_s (): bytesLoaded (0), cycles (0),
170 breakdownSTALLS (),
171 breakdownTLB (),
172 breakdownCACHE (){}
173174 }STATS_INS_s;
175176 typedef map <UINT64 , STATS_INS_s > STATS_INS_t;
177178 typedef struct STATS_GLB_t
179 {
180 // Sumatory breakdown
181 COUNTER cycles;
182183 // Breakdown of stalls accumulated
184 COUNTER breakdownSTALLS[INS_STALL_NUM ];
185186 // Breakdown of cycles accumulated
187 COUNTER breakdownTLB[KNC_TLB_LVLS ];
188 COUNTER breakdownCACHE[KNC_CACHE_LVLS ];
189
APENDICE A. MODULO CORE 4
190 STATS_GLB_t (): cycles (0),
191 breakdownSTALLS (),
192 breakdownTLB (),
193 breakdownCACHE (){}
194195 }STATS_GLB_t;
196197 typedef struct
198 {
199 STATS_INS_t *stats_ins;
200 STATS_GLB_t *stats_glb;
201 } STATS;
202203 // ========================================================================
204 // CORE: Class with the context structures for every core
205 // ========================================================================
206207 class CORE{
208 PIN_MUTEX pipelineLock;
209210 STATE state;
211 STATS stats;
212213 // How many threads
214 UINT32 nThreads;
215 UINT32 coreID;
216217 // Pointers to latency information
218 UINT32 *latencyTLB;
219 UINT32 *latencyCACHE;
220221 void InsertInPipeline(
222 UINT32 tid ,
223 BBL_STATE :: iterator ins );
224225 UINT32 GetCacheLatency( BBL_ENTRY_KEY key );
226227 void DistributeCycles(
228 UINT32 tid ,
229 UINT64 storeLIP ,
230 COUNTER cycles ,
231 BBL_ENTRY_KEY culprit ,
232 bool regStall ,
233 bool memStall ,
234 BBL_ENTRY_KEY currentIP = make_pair (0,0),
235 INT32 regDependency = -1);
236237 void InsertBreakdownStats(
238 UINT32 tid ,
239 STATS_INS_t ::iterator ,
240 UINT32 cycles ,
241 BREAKDOWN_t breakdown ,
242 UINT32 index);
243244 inline UINT32 AdaptThreadID( UINT32 tid )
245 {
246 return tid % nThreads;
247 }
248249 public:
250251 CORE(UINT32 coreID , UINT32 nThreads){
252 // State fields
253 state.regFile = new REG_FILE[ nThreads ];
254 state.bbl = new BBL_STATE[ nThreads ];
255 state.memory = new STATE :: MEMORY_STATE[ nThreads ];
256257 // Stats fields
258 state.stats_ins = new STATS_INS_t[ nThreads ];
259 stats.stats_glb = new STATS_GLB_t[ nThreads ];
APENDICE A. MODULO CORE 5
260261 this ->nThreads = nThreads;
262 }
263264 ~CORE(){
265 // Delete Stats stuff and State stuff
266 delete [] state.regFile;
267 delete [] state.memory;
268269 for (UINT32 t = 0; t < nThreads; t++)
270 DestroyBBLData(t);
271272 delete [] state.bbl;
273 delete [] stats.stats_ins;
274 delete [] stats.stats_glb;
275 }
276277 // Funtions that operate on whole CORE
278 void SetMemorySetup(
279 UINT32 numLevelsTLB ,
280 UINT32 numLevelsCACHE ,
281 UINT32 *latencyTLB ,
282 UINT32 *latencyCACHE);
283284 // Functions that operate on STATE
285 void CreateBBLEntry(
286 UINT32 tid ,
287 UINT64 lip ,
288 INT32 tlbLevelHit ,
289 INT32 cacheLevelHit);
290291 void DestroyBBLData( UINT32 tid );
292293 void DestroyStats( UINT32 tid );
294295 void Pipeline( UINT32 tid , BBInfo *bbl );
296297 COUNTER GetGlobalCycles( UINT32 tid );
298299 string PrintGlobalStats( UINT32 tid );
300301 // Functions that operate on STATS
302 void SetBytesLoaded(
303 UINT32 tid ,
304 UINT64 lip ,
305 UINT32 size);
306307 UINT32 GetBytesLoaded( UINT32 tid , UINT64 lip );
308309 };
310311 extern CORE *CoreArray[MAX_EXPERIMENTS ][ MAX_NUM_THREADS ];
312313 inline UINT32 GetCoreID( UINT32 tid , UINT32 ShiftAmount )
314 {
315 UINT32 coreID = (tid >> ShiftAmount);
316 return coreID;
317 }
318319 // ========================================================================
320321 void SaveBBL( UINT32 ThreadID , BBInfo* BB)
322 {
323 ThreadStats[ThreadID ].BB = ThreadStats[ThreadID ]. prevBB;
324 if (bbinfo != NULL) ThreadStats[ThreadID ]. prevBB = bbinfo;
325 }
326327 void Pipeline ( UINT32 ThreadID )
328 {
329 if (dependencyControl && ThreadStats[ThreadID ].BB != NULL
APENDICE A. MODULO CORE 6
330 {
331 for (UINT32 exp = 0; exp < MAX_EXPERIMENTS; exp++ )
332 {
333 UINT32 coreID = 0;
334335 if (MAX_NUM_THREADS > 1 )
336 {
337 coreID = GetCoreID( ThreadID , ShiftAmount[exp ][0] );
338 }
339340 CORE *corePtr = CoreArray[exp][ coreID ];
341 corePtr ->Pipeline( ThreadID , THreadStats[threadID ].BB );
342 corePtr ->DestroyBBLData( ThreadID );
343 }
344 }
345 }
346347 void Instruction(INS ins)
348 {
349 if (Footprint.find(INS_Address(ins)) == Footprint.end())
350 {
351 INS_FOOT_PRINT_t instruction = {}
352 instruction.routine = RTN_Address(INS_Rtn(ins)):
353 instruction.insSize = INS_Size (ins);
354355 if ( dependencyControl )
356 {
357 instruction.disassemble = INS_Disassemble(ins);
358359 // What type of Instruction?
360 if ( INS_IsLoadOp(ins) )
361 {
362 instruction.insType.flip(INS_TYPE_MEM);
363 if (INS_IsVector(ins))
364 {
365 if (INS_IsScalar(ins))
366 instruction.insType.flip(INS_TYPE_V_SCALAR);
367 else
368 instruction.insType.flip(INS_TYPE_V_VECTOR);
369 }
370 else
371 {
372 instruction.insType.flip(INS_TYPE_NONVPU);
373 instruction.latency = NONVPU :: latency;
374 }
375 }
376 else
377 {
378 if (IsMemInstruction(ins))
379 instruction.insType.flip(INS_TYPE_MEM);
380 else if (INS_IsVector(ins))
381 {
382 if (INS_IsScalar(ins))
383 instruction.insType.flip(INS_TYPE_V_SCALAR);
384 else
385 instruction.insType.flip(INS_TYPE_V_VECTOR);
386387 instruction.latendy = GetLatencyByIclass(ins);
388 }
389 else
390 {
391 instruction.insType.flip(INS_TYPE_NONVPU);
392 instruction.latency = NONVPU :: latency;
393 }
394 }
395 }
396 }
397 }
398399 void SplitBlocks ()
APENDICE A. MODULO CORE 7
400 {
401 map <pair <UINT64 , UINT64 >, COUNTER > Worklist;
402 pair <UINT64 , UINT64 > el1 , el2;
403404 for (list <const BBInfo *>:: iterator bi = BBInfoList.begin (); bi != BBInfoList.←↩
end(); bi++)
405 {
406 COUNTER totalCountBBLbyTID = 0;
407408 for (UINT32 tid = 0; tid <= maxThreadID; tid++)
409 {
410 totalCountBBLbyTID += (*bi)->_counter[tid];
411 }
412413 WorkList[pair <UINT64 , UINT64 >((*bi)->StartAddress ,(*bi)->EndAddress)] += ←↩
totalCountBBLbyTID;
414 }
415416 WorkList[pair <UINT64 , UINT64 >(-1, -1)] = 0;
417418 while (WorkList.size() > 1)
419 {
420 el1 = WorkList.begin()->first;
421 el2 = (++ WorkList.begin ())->first;
422423 if (el1.second < el2.first)
424 {
425 BBInfoMap[el1] = WorkList[el1];
426 WOrkList.erase(el1);
427 }
428 else
429 {
430 if (el1.first == el2.first && el1.second < el2.second)
431 {
432 pair <UINT64 ,UINT64 > newel1 = el1;
433 pair <UINT64 ,UINT64 > newel2 = make_pair ((++ Footprint.find(el1.second))->←↩first ,el2.second);
434435 WorkList[newel1] += WorkList[el2];
436 WorkList[newel2] += WorkList[el2];
437438 WorkList.erase(el2);
439 }
440 else if (el1.first < el2.first && ( el1.second > el2.second ||
441 el1.second == el2.second || el1.second < el2.second ))
442 {
443 pair <UINT64 ,UINT64 > newel1 = make_pair(el1.first ,(--Footprint.find(el2.←↩first))->first);
444 pair <UINT64 ,UINT64 > newel2 = make_pair(el2.first , el1.second);
445446 WorkList[newel1] = WorkList[el1];
447 WorkList[newel2] = WorkList[el1];
448449 WorkList.erase(el1);
450 }
451 else
452 {
453 assert (1);
454 }
455 }
456 }
457 }
458459 #endif /* CORE_H */
460461 #include "core.h"
462 #include <sstream >
463464 FOOT_PRINT Footprint;
465 CORE *CoreArray[MAX_EXPERIMENTS ][ MAX_NUM_THREADS ];
APENDICE A. MODULO CORE 8
466 PIN_MUTEX printLock;
467468 // ========================================================================
469470 void CORE:: CreateBBLEntry (
471 UINT32 tid ,
472 UINT64 lip ,
473 INT32 tlbLevelHit ,
474 INT32 cacheLevelHit)
475 {
476 UINT32 realIndex = AdaptThreadID(tid);
477478 // First , prepare the structure
479 BBL_ENTRY_KEY key = make_pair(lip ,0);
480 BBL_STATE_t bblInfo(tlbLevelHit , cacheLevelHit);
481482 // When a bbl with cache information , key and breadowns are updated ←↩
accordingly
483 if ( tlbLevelHit != -1 || cacheLevelHit != -1 )
484 {
485 // If split or gather instruction , modify key
486 BBL_STATE :: reverse_iterator cait = sate.bbl[realIndex ]. rbegin ();
487 if (cait != state.bbl[realIndex ].rend() && cait ->first.first == lip)
488 key = make_pair(lip ,cait ->first.second +1);
489490 // Getting cycles from TLB
491492 for (INT32 i = 0; i <= tlbLevelHit; i++)
493 {
494 bblInfo.breakdownTLB[i] += latencyTLB[i];
495 bblInfo.cycles += latencyTLB[i];
496 }
497498 // Getting cycles from the CACHE
499500 bblInfo.cycles += latencyCACHE[cacheLevelHit ];
501 for (INT32 i = 0; i <= cacheLevelHit; i++)
502 {
503 if (!i)
504 bblInfo.breakdownCACHE[i] += latencyCACHE[i];
505 else
506 bblInfo.breakdownCACHE[i] += latencyCACHE[i] - latencyCACHE[i-1];
507 }
508 }
509510 state.bbl[realIndex ]. insert( make_pair(key ,bblInfo) );
511 }
512513 // ========================================================================
514515 void CORE:: DestroyBBLData( UINT32 tid )
516 {
517 UINT32 realIndex = AdaptThreadID(tid);
518 state.bbl[realIndex ].clear ();
519 }
520521 // ========================================================================
522523 void CORE:: InsertBreakdownStats(
524 UINT32 tid ,
525 STATS_INS_t :: iterator entry ,
526 UINT32 cycles ,
527 BREAKDOWN_t breakdown ,
528 UINT32 index)
529 {
530 UINT32 realIndex = AdaptThreadID(tid);
531532 switch(breakdown)
533 {
534 case STALLS_ENTRY:
APENDICE A. MODULO CORE 9
535 entry ->second.breakdownSTALLS[index] += cycles;
536 stats.stats_glb[realIndex ]. breakdownSTALLS[index] += cycles;
537 break;
538539 case CACHE_ENTRY:
540 entry ->second.breakdownCACHE[index] += cycles;
541 stats.stats_glb[realIndex ]. breakdownCACHE[index] += cycles;
542543 case TLB_ENTRY:
544 entry ->second.breakdownTLB[index] += cycles;
545 stats.stats_glb[realIndex ]. breakdownTLB[index] += cycles;
546 }
547548 entry ->second.cycles += cycles;
549 stats.stats_glb[realIndex ]. cycles += cycles;
550551 }
552 }
553554 // ========================================================================
555556 void CORE:: DistributeCycles(
557 UINT32 tid ,
558 UINT64 storeLIP ,
559 COUNTER cycles ,
560 BBL_ENTRY_KEY culprit ,
561 bool regStall ,
562 bool memStall ,
563 BBL_ENTRY_KEY currentIP ,
564 INT32 regDependency )
565 {
566 UINT32 realIndex = AdaptThread(tid);
567 UINT32 storeCycles = 0;
568569 // Lets get sure the storage does exist. If not , create.
570 STATS_INS_t :: iterator storage = stats.stats_ins[realIndex]-find(storeLIP);
571 if (storage == stats.stats_ins[tid].end())
572 {
573 STATS_INS_s statsInfo;
574 storage = stats.stats_ins[tid]. insert(make_pair(storeLIP ,statsInfo)).first;
575 }
576577 if ( !regStall and !memStall )
578 InsertBreakdownStats(tid , storage , cycles , STALLS_ENTRY , INS_STALL_ISSUE);
579580 if ( regStall )
581 {
582 // Get the footprint of the culprit
583 FOOT_PRINT :: iterator culpritInfo = Footprint.find(culprit.first);
584585 // Only if the dependency has nothing to do with pipeline being stalled
586 storeCycles = cycles >= culpritInfo ->second.latency ? culpritInfo ->second.←↩latency : cycles;
587588 if (culpritInfo ->second.insType.test(INS_TYPE_NONVPU))
589 {
590 InsertBreakdownStats(tid , storage , storeCycles , STALLS_ENTRY , ←↩INS_STALL_NONVPU);
591 }
592 else if (culpritInfo ->second.insType.test(INS_TYPE_V_SCALAR))
593 {
594 InsertBreakdownStats(tid , storage , storeCycles , STALLS_ENTRY , ←↩INS_STALL_V_SCALAR);
595 }
596 else if (culpritInfo ->second.insType.test(INS_TYPE_V_VECTOR))
597 {
598 InsertBreakdownStats(tid , storage , storeCycles , STALLS_ENTRY , ←↩INS_STALL_V_VECTOR);
599 }
600
APENDICE A. MODULO CORE 10
601 // Remaining cycles
602 cycles -= storeCycles;
603 }
604605 if ( memStall )
606 {
607 // Get the state info of the culprit
608 // To be taken into account: if there is a culprit , the stall is going to be
609 // with an instruction of the same thread (registers are not shared)
610 BBL_STATE :: iterator culpritState = state.bbl[realIndex ].find(culprit);
611612 // FOr special cases
613 UINT32 source = 0;
614615 if ( culpritState == state.bbl[realIndex ].end() || // Different basic block
616 culprit.first > currentIP.first || // Same basic block , different iteration
617 ( culprit.first == currentIP.first // BBL of 1 instruction , maybe splitted
618 && culprit.second >= currentIP.second ))
619 {
620 if (regDependency != -1)
621 source = 1;
622 else
623 source = 2;
624 }
625626 // Start with CACHE
627 for (INT32 level = KNC_CACHE_LVLS; level >= 0 && cycles > 0; level --)
628 {
629 UINT32 sourceCycles;
630 switch(source)
631 {
632 case 1:
633 // from regFile
634 sourceCycles = state.regFile[realIndex ].find(regDependency)->second.←↩breakdownCACHE[level];
635 break;
636 case 2:
637 // from last memory stat
638 sourceCycles = state.memory[realIndex ]. breakdownCACHE[level];
639 break;
640 default:
641 sourceCycles = culpritState ->second.breakdownCACHE[level];
642 }
643644 storeCycles = cycles >= sourceCycles ? sourceCycles : cycles;
645 InsertBreakdownStats(tid , storage , storeCycles , CACHE_ENTRY , level);
646647 // Remaining
648 cycles -= storeCycles;
649 }
650651 // Follow with TLB
652 for (INT32 level = KNC_TLB_LVLS -1; level >= 0; level --)
653 {
654 UINT32 sourceCycles;
655 switch(source)
656 {
657 case 1:
658 // from regFile
659 sourceCycles = state.regFile[realIndex ].find(regDependency)->second.←↩breakdownTLB[level ];
660 break;
661 case 2:
662 // from last memory stat
663 sourceCycles = state.memory[realIndex ]. breakdownTLB[level ];
664 break;
665 default:
666 sourceCycles = culpritState ->second.breakdownTLB[level];
667 }
668
APENDICE A. MODULO CORE 11
669 storeCycles = cycles >= sourceCycles ? sourceCycles : cycles;
670 InsertBreakdownStats(tid , storage , storeCycles , TLB_ENTRY , level);
671672 // Remaining
673 cycles -= storeCycles;
674 }
675 }
676 }
677678 // ========================================================================
679680 void CORE:: InsertInPipeline (UINT32 tid , BBL_STATE :: iterator ins)
681 {
682 UINT32 realIndex = AdaptThreadID(tid);
683684 BBL_ENTRY_KEY key = ins ->first;
685686 // Get the instruction footprint
687 INS_FOOT_PRINT_t insInfo = Footprint.find(key.first)->second;
688689 // Key to handle culprit instruction if any
690 BBL_ENTRY_KEY culprit = make_pair (0,0);
691692 // Last register read dependency
693 INT32 regDependency = 0;
694695 // Ideally , when does the instruction enter the pipeline?
696 COUNTER issue = state.issue + 1;
697 COUNTER saveIssue = state.issue;
698699 // Last instruction in pipeline
700 STATE:: LAST_INS lastInstruction = state.lastInstruction;
701702 // ====================================
703 // When are the source registers read?
704 // ====================================
705 if (key.second == 0)
706 {
707 for (UINT32 j = 1; j <= insInfo.srcReg [0]; j++)
708 {
709 REG_FILE :: iterator reg = state.regFile[realIndex ].find(insInfo.srcReg[j]);
710711 // If the register is found in RegTable , dependency spoted.
712 if (reg != state.regFile[realIndex ].end())
713 {
714 if (issue < reg ->second.cycle)
715 {
716 culprit = reg ->second.PC;
717 regDependency = reg ->first;
718 issue = max( issue , reg ->second.cycle );
719 }
720 }
721 }
722 }
723724 // ================================================
725 // Was the pipeline frozen by last ins of same TID?
726 // ================================================
727 if ( state.lastInstruction.tid == tid && issue < state.wBackMemory )
728 {
729 culprit = make_pair (0,0);
730 issue = max( Issue , state.wBackMemory );
731 }
732733 // ================================================
734 // State update before unlocking Mutex
735 // ================================================
736 state.issue = issue;
737738 if ( insInfo.insType.test(INS_TYPE_MEM) && ins ->second.cacheLevelHit > 0 )
APENDICE A. MODULO CORE 12
739 state.wBackMemory = issue + ins ->second.cycles;
740 else
741 state.wBackMemory = 0;
742743 state.lastInstruction = STATE:: LAST_INS(tid , key);
744745 PIN_MutexUnlock( &pipelineLock );
746747 // ================================================
748 // Distribution of cycles
749 // ================================================
750 if (saveIssue)
751 {
752 COUNTER cycles = issue - saveIssue;
753 UINT64 storeLIP = key.first;
754755 if (culprit.first)
756 {
757 DistributeCycles(tid , storeLIP , cycles , culprit , true , true , key , ←↩regDependency);
758 }
759 else
760 {
761 if (issue != saveIssue +1)
762 {
763 DistributeCycles(tid , storeLIP , cycles , lastInstruction.key , false , true , ←↩key);
764 }
765 else
766 {
767 DistributeCycles(tid , storeLIP , 1, make_pair (0,0), false , false);
768 }
769 }
770 }
771772 // ================================================
773 // When are the destiny register written?
774 // ================================================
775776 for (UINT32 j = 1; j < insInfo.dstReg [0]; j++)
777 {
778 REG_FILE :: iterator reg = state.regFile[realIndex ].find(insInfo.dstReg[j]);
779 if (reg == state.regFile[realIndex ].end())
780 {
781 REG_FILE_STATE_t regInfo;
782 reg = state.regFile[realIndex ]. insert(make_pair(insInfo.dstReg[j],regInfo)).←↩first;
783 }
784 reg ->second.cycle = issue + ins ->second.cycles + insInfo.latency;
785 reg ->second.PC = key;
786787 // Copy breakdown cycles if neccesary
788 if ( insInfo.insType.test(INS_TYPE_MEM) )
789 {
790 // Tlb
791 for (INT32 level = 0; level < KNC_TLB_LVLS; level ++)
792 reg ->second.breakdownTLB[level] = ins ->second.breakdownTLB[level];
793 // Cache
794 for (INT32 level = 0; level < KNC_CACHE_LVLS; level ++)
795 reg ->second.breakdownCACHE[level] = ins ->second.breakdownCACHE[level];
796 }
797798 }
799800 // ================================================
801 // Save last memory access if any
802 // ================================================
803 if (insInfo.insType.test(INS_TYPE_MEM) )
804 {
805 // Tlb
APENDICE A. MODULO CORE 13
806 for (INT32 level = 0; level < KNC_TLB_LVLS; level ++)
807 state.memory[realIndex ]. breakdownTLB[level] = ins ->second.breakdownTLB[level←↩];
808 // Cache
809 for (INT32 level = 0; level < KNC_CACHE_LVLS; level ++)
810 state.memory[realIndex ]. breakdownCACHE[level] = ins ->second.breakdownCACHE[←↩level];
811 }
812 }
813814 // ========================================================================
815 // bbl expected to be != NULL
816817 void CORE:: Pipeline(UINT32 tid , BBInfo *bbl)
818 {
819 UINT32 realIndex = AdaptThreadID(tid);
820821 // ==================================================
822 // Dependencies control
823 // ==================================================
824 // BBL info is updated with no memory instructions (tlbLevelHit -1 / ←↩cacheLevelHit -1)
825 map <UINT64 , INS_FOOT_PRINT_t >:: iterator lastIns = ++ Footprint.find(bbl ->←↩EndAddress);
826827 for(FOOT_PRINT :: iterator it = Footprint.find(bbl ->StartAddress);
828 it != lastIns; it++)
829 CreateBBLEntry(realIndex , it->first , -1, -1);
830831 // Lets travel through all the instructions of the block
832 for (BBL_STATE :: iterator ins = state.bbl[realIndex ].begin ();
833 ins != state.bbl[realIndex ].end(); ins++)
834 {
835 InsertInPipeline(tid ,ins);
836 }
837 }
838839 // ========================================================================
840841 COUNTER CORE:: GetGlobalCycles( UINT32 tid )
842 {
843 UINT32 realIndex = AdaptThreadID(tid);
844 return stats.stats_glb[realIndex ]. cycles;
845 }
846847 // ========================================================================
848849 string CORE:: PrintGlobalStats( UINT32 tid )
850 {
851 UINT32 realIndex = AdaptThreadID(tid);
852 stringstream output;
853854 for (UINT32 stall = 0; stall < INS_STALL_NUM; stall ++)
855 {
856 output << stats.stats_glb[realIndex ]. breakdownSTALLS[stall] << ",";
857 }
858 for (UINT32 level = 0; level < KNC_TLB_LVLS; level ++)
859 {
860 output << stats.stats_glb[realIndex ]. breakdownTLB[level] << ",";
861 }
862 for (UINT32 level = 0; level < KNC_CACHE_LVLS; level ++)
863 {
864 if (level == KNC_CACHE_lvls - 1)
865 output << stats.stats_glb[realIndex ]. breakdownCACHE[level ];
866 else
867 output << stats.stats_glb[realIndex ]. breakdownCACHE[level] << ",";
868 }
869 return output.str();
870 }
871
APENDICE A. MODULO CORE 14
872 // ========================================================================
873874 void CORE:: SetBytesLoaded( UINT32 tid , UINT64 lip , UINT32 size )
875 {
876 UINT32 realIndex = AdaptThreadID(tid);
877878 // As stats is a vector of jmaps , the map for the lip specified
879 // may not exists the first time this lip is encountered for
880 // the thread tid
881882 STATS_INS_t :: iterator it = stats.stats_ins[realIndex ].find(lip);
883884 if (it == stats.stats_ins[realIndex ].end())
885 {
886 STATS_INS_s data;
887 it = stats.stats_ins[realIndex ]. insert(make_pair(lip , data)).first;
888 }
889890 it->second.bytesLoaded += size;
891 }
892893 // ========================================================================
894895 void CORE:: SetMemorySetup(
896 UINT32 numLevelsTLB ,
897 UINT32 numLevelsCACHE ,
898 UINT32 numLevelsCACHE ,
899 UINT32 *latencyTLB ,
900 UINT32 *latencyCACHE)
901 {
902 this ->latencyTLB = latencyTLB;
903 this ->latencyCACHE = latencyCACHE;
904 }
905906 // ========================================================================
907908 UINT32 CORE:: GetBytesLoaded( UINT32 tid , UINT64 lip )
909 {
910 UINT32 realIndex = AdaptThreadID(tid);
911912 STATS_INS_t :: iterator it = stats.stats_ins[realIndex ].find(lip);
913 if (it != stats.stats_ins[realIndex ].end())
914 return stats.stats_ins[realIndex ].find(lip)->second.bytesLoaded;
915 else
916 return 0xffffffff;
917 }
918