Partitioner.C
/* Algorithms to detect what instructions make up basic blocks and which blocks make up functions, and how to create the
 * necessary SgAsmBlock and SgAsmFunction IR nodes from this information. */
#include "sage3basic.h"

#include "Partitioner.h"
#include "Assembler.h"
#include "AssemblerX86.h"
#include "AsmUnparser_compat.h"
#include "BinaryLoader.h"
#include "PartialSymbolicSemantics.h" // FIXME: expensive to compile; remove when no longer needed [RPM 2012-05-06]
#include "stringify.h"

#include "PartialSymbolicSemantics2.h"
#include "DispatcherX86.h"

#include <errno.h>
#include <fcntl.h>
#include <math.h>
#include <stdarg.h>

using namespace rose;

/* See header file for full documentation. */

std::ostream& operator<<(std::ostream &o, const Partitioner::Exception &e)
{
    e.print(o);
    return o;
}

/* class method */
SgAsmInstruction *
Partitioner::isSgAsmInstruction(const Instruction *insn)
{
    return insn ? isSgAsmInstruction(insn->node) : NULL;
}

/* class method */
SgAsmInstruction *
Partitioner::isSgAsmInstruction(SgNode *node)
{
    return ::isSgAsmInstruction(node);
}

/* class method */
SgAsmx86Instruction *
Partitioner::isSgAsmx86Instruction(const Instruction *insn)
{
    return insn ? isSgAsmx86Instruction(insn->node) : NULL;
}

/* class method */
SgAsmx86Instruction *
Partitioner::isSgAsmx86Instruction(SgNode *node)
{
    return ::isSgAsmx86Instruction(node);
}

/* Progress report class variables. */
time_t Partitioner::progress_interval = 10;
time_t Partitioner::progress_time = 0;
FILE *Partitioner::progress_file = stderr;

/* Set progress reporting values. */
void
Partitioner::set_progress_reporting(FILE *output, unsigned min_interval)
{
    progress_file = output;
    progress_interval = min_interval;
}

/* Produce a progress report if enabled. */
void
Partitioner::progress(FILE *debug, const char *fmt, ...) const
{
    time_t now = time(NULL);

    if (0==progress_time)
        progress_time = now;

    if (progress_file!=NULL && now-progress_time >= progress_interval) {
        progress_time = now;
        va_list ap;
        va_start(ap, fmt);
        vfprintf(progress_file, fmt, ap);
        va_end(ap);
    }

    if (debug!=NULL) {
        va_list ap;
        va_start(ap, fmt);
        vfprintf(debug, fmt, ap);
        va_end(ap);
    }
}
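
/* A minimal usage sketch (hypothetical call site, not part of this file): reports go to the stream configured with
 * set_progress_reporting() at most once per progress_interval seconds, while a non-null "debug" stream always receives a
 * copy regardless of throttling. */
#if 0
Partitioner::set_progress_reporting(stderr, 10);            // at most one report every ten seconds
partitioner.progress(NULL, "analyzed %zu functions\n", n);  // throttled; printed only when the interval has elapsed
#endif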

/* Parse argument for the "-rose:partitioner_search" command-line switch. */
unsigned
Partitioner::parse_switches(const std::string &s, unsigned flags)
{
    size_t at=0;
    while (at<s.size()) {
        enum { SET_BIT, CLEAR_BIT, SET_VALUE, NOT_SPECIFIED } howset = NOT_SPECIFIED;

        if (s[at]=='-') {
            howset = CLEAR_BIT;
            at++;
        } else if (s[at]=='+') {
            howset = SET_BIT;
            at++;
        } else if (s[at]=='=') {
            howset = SET_VALUE;
            at++;
        }
        if (at>=s.size())
            throw Exception("heuristic name must follow qualifier");

        size_t comma = s.find(",", at);
        std::string word = std::string(s, at, comma-at);
        if (word.size()==0)
            throw Exception("heuristic name must follow comma");

        unsigned bits = 0;
        if (word=="entry" || word=="entry_point") {
            bits = SgAsmFunction::FUNC_ENTRY_POINT;
        } else if (word=="call_target") {
            bits = SgAsmFunction::FUNC_CALL_TARGET;
        } else if (word=="call_insn") {
            bits = SgAsmFunction::FUNC_CALL_INSN;
        } else if (word=="call") {
            bits = SgAsmFunction::FUNC_CALL_TARGET | SgAsmFunction::FUNC_CALL_INSN;
        } else if (word=="eh" || word=="eh_frame") {
            bits = SgAsmFunction::FUNC_EH_FRAME;
        } else if (word=="import") {
            bits = SgAsmFunction::FUNC_IMPORT;
        } else if (word=="export") {
            bits = SgAsmFunction::FUNC_EXPORT;
        } else if (word=="symbol") {
            bits = SgAsmFunction::FUNC_SYMBOL;
        } else if (word=="pattern") {
            bits = SgAsmFunction::FUNC_PATTERN;
        } else if (word=="userdef") {
            bits = SgAsmFunction::FUNC_USERDEF;
        } else if (word=="pad" || word=="padding" || word=="interpad") {
            bits = SgAsmFunction::FUNC_PADDING;
        } else if (word=="intrablock") {
            bits = SgAsmFunction::FUNC_INTRABLOCK;
        } else if (word=="thunk") {
            bits = SgAsmFunction::FUNC_THUNK;
        } else if (word=="misc" || word=="miscellaneous" || word=="interpadfunc") {
            bits = SgAsmFunction::FUNC_MISCMASK;
        } else if (word=="unassigned" || word=="unclassified" || word=="leftover" || word=="leftovers") {
            bits = SgAsmFunction::FUNC_LEFTOVERS;
        } else if (word=="default") {
            bits = SgAsmFunction::FUNC_DEFAULT;
            if (howset==NOT_SPECIFIED) howset = SET_VALUE;
        } else if (isdigit(word[0])) {
            bits = strtol(word.c_str(), NULL, 0);
        } else {
            throw Exception("unknown partitioner heuristic: \"" + word + "\"");
        }

        switch (howset) {
            case SET_VALUE:
                flags = 0;
                /* fall through */
            case NOT_SPECIFIED:
            case SET_BIT:
                flags |= bits;
                break;
            case CLEAR_BIT:
                flags &= ~bits;
                break;
        }

        at = comma==std::string::npos ? s.size() : comma+1;
    }
    return flags;
}
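
/* A small sketch of switch strings this parser accepts, assuming the heuristic names recognized above: '=' replaces the
 * current flag set, '+' (or no qualifier) sets bits, '-' clears bits, and a word starting with a digit is parsed by
 * strtol(). */
#if 0
unsigned flags = 0;
flags = Partitioner::parse_switches("=default,-pattern", flags);   // start from the defaults, drop pattern matching
flags = Partitioner::parse_switches("+call,+symbol", flags);       // additionally follow calls and symbols
flags = Partitioner::parse_switches("0x0003", flags);              // numeric masks pass through strtol()
#endif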

/* Set disassembly and memory initialization maps. */
void
Partitioner::set_map(MemoryMap *map, MemoryMap *ro_map)
{
    this->map = map;
    if (map) {
        if (ro_map) {
            this->ro_map = *ro_map;
        } else {
            this->ro_map = *map;
        }
    } else {
        this->ro_map.clear();
    }
}

/* Looks for a jump table. Documented in header file. */
Disassembler::AddressSet
Partitioner::discover_jump_table(BasicBlock *bb, bool do_create, ExtentMap *table_extent)
{
    using namespace BinaryAnalysis::InstructionSemantics2;

    /* Do some cheap up-front checks. */
    SgAsmx86Instruction *insn_x86 = isSgAsmx86Instruction(bb->last_insn());
    if (!insn_x86 || (insn_x86->get_kind()!=x86_jmp && insn_x86->get_kind()!=x86_farjmp) ||
        1!=insn_x86->get_operandList()->get_operands().size())
        return Disassembler::AddressSet();
    SgAsmExpression *target_expr = insn_x86->get_operandList()->get_operands()[0];
    SgAsmMemoryReferenceExpression *mre = isSgAsmMemoryReferenceExpression(target_expr);
    SgAsmx86RegisterReferenceExpression *rre = isSgAsmx86RegisterReferenceExpression(target_expr);
    if (!mre && !rre)
        return Disassembler::AddressSet(); // no indirection

    /* Evaluate the basic block semantically to get an expression for the final EIP. */
    const RegisterDictionary *regdict = RegisterDictionary::dictionary_amd64(); // compatible w/ older x86 models
    const RegisterDescriptor *REG_EIP = regdict->lookup("eip");
    PartialSymbolicSemantics::RiscOperatorsPtr ops = PartialSymbolicSemantics::RiscOperators::instance(regdict);
    BaseSemantics::DispatcherPtr dispatcher = DispatcherX86::instance(ops);
    ops->set_memory_map(&ro_map);
    try {
        for (size_t i=0; i<bb->insns.size(); ++i) {
            insn_x86 = isSgAsmx86Instruction(bb->insns[i]->node);
            assert(insn_x86); // we know we're in a basic block of x86 instructions already
            dispatcher->processInstruction(insn_x86);
        }
    } catch (...) {
        return Disassembler::AddressSet(); // something went wrong, so just give up (e.g., unhandled instruction)
    }

    /* Scan through memory to find from whence the EIP value came. There's no need to scan for an EIP which is a known value
     * since such control flow successors would be picked up the usual way elsewhere. It's also quite possible that the EIP
     * value is stored at some other memory addresses outside the jump table (e.g., a function pointer argument stored on
     * the stack), so we also skip over any memory whose address is not a known value. */
    Disassembler::AddressSet successors;
    BaseSemantics::SValuePtr eip = ops->readRegister(*REG_EIP);
    static const size_t entry_size = 4; // FIXME: bytes per jump table entry
    uint8_t *buf = new uint8_t[entry_size];
    if (!eip->is_number()) {
        BaseSemantics::MemoryCellListPtr mem = BaseSemantics::MemoryCellList::promote(ops->get_state()->get_memory_state());
        for (BaseSemantics::MemoryCellList::CellList::iterator mi=mem->get_cells().begin(); mi!=mem->get_cells().end(); ++mi) {
            BaseSemantics::MemoryCellPtr cell = *mi;
            if (cell->get_address()->is_number() && cell->get_value()->must_equal(eip)) {
                rose_addr_t base_va = cell->get_address()->get_number();
                size_t nentries = 0;
                while (1) {
                    size_t nread = ro_map.read(buf, base_va+nentries*entry_size, entry_size);
                    if (nread!=entry_size)
                        break;
                    rose_addr_t target_va = 0;
                    for (size_t i=0; i<entry_size; i++)
                        target_va |= buf[i] << (i*8);
                    if (!map->exists(target_va, MemoryMap::MM_PROT_EXEC))
                        break;
                    successors.insert(target_va);
                    ++nentries;
                }
                if (nentries>0) {
                    if (table_extent)
                        table_extent->insert(Extent(base_va, nentries*entry_size));
                    if (do_create) {
                        DataBlock *dblock = find_db_starting(base_va, nentries*entry_size);
                        append(bb, dblock, SgAsmBlock::BLK_JUMPTABLE);
                    }
                    if (debug)
                        fprintf(debug, "[jump table at 0x%08"PRIx64"+%zu*%zu]", base_va, nentries, entry_size);
                }
            }
        }
    }
    delete [] buf;
    return successors;
}
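
/* The scanning loop above assembles each 4-byte table entry in little-endian order, so entry bytes b0 b1 b2 b3 become
 * b0 | b1<<8 | b2<<16 | b3<<24. A standalone sketch with a hypothetical entry: */
#if 0
uint8_t entry[4] = {0x10, 0x20, 0x40, 0x00};        // bytes as they appear in memory
rose_addr_t target_va = 0;
for (size_t i=0; i<4; i++)
    target_va |= entry[i] << (i*8);                 // target_va == 0x00402010
#endif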

/* Runs the per-block analyses and caches their results in the basic block. */
void
Partitioner::update_analyses(BasicBlock *bb)
{
    assert(bb!=NULL && !bb->insns.empty());
    if (bb->valid_cache()) return;

    /* Successor analysis. */
    std::vector<SgAsmInstruction*> inodes;
    for (InstructionVector::const_iterator ii=bb->insns.begin(); ii!=bb->insns.end(); ++ii)
        inodes.push_back(isSgAsmInstruction(*ii));
    bb->cache.sucs = bb->insns.front()->node->get_successors(inodes, &(bb->cache.sucs_complete), &ro_map);

    /* Try to handle indirect jumps of the form "jmp ds:[BASE+REGISTER*WORDSIZE]". The trick is to assume that some kind of
     * jump table exists beginning at address BASE, and that the table contains only addresses of valid code. All we need to
     * do is look for the first entry in the table that doesn't point into an executable region of the disassembly memory
     * map. We use a "do" loop so the logic nesting doesn't get so deep: just break when we find that something doesn't match
     * what we expect. */
    if (!bb->cache.sucs_complete && bb->cache.sucs.empty()) {
        ExtentMap table_extent;
        Disassembler::AddressSet table_entries = discover_jump_table(bb, true, &table_extent);
        if (!table_entries.empty()) {
            bb->cache.sucs.insert(table_entries.begin(), table_entries.end());
            if (debug) {
                std::ostringstream ss;
                ss <<"[jump table at " <<table_extent <<"]";
                fprintf(debug, "%s", ss.str().c_str());
            }
        }
    }

    /* Call target analysis. A function call is any CALL-like instruction except when the call target is the fall-through
     * address and the instruction at the fall-through address pops the top of the stack (this is one way position
     * independent code loads the instruction pointer register into a general-purpose register). FIXME: For now we'll assume
     * that any call to the fall-through address is not a function call. */
    rose_addr_t fallthrough_va = bb->last_insn()->get_address() + bb->last_insn()->get_size();
    rose_addr_t target_va = NO_TARGET;
    bool looks_like_call = bb->insns.front()->node->is_function_call(inodes, &target_va, NULL);
    if (looks_like_call && target_va!=fallthrough_va) {
        bb->cache.is_function_call = true;
        bb->cache.call_target = target_va;
    } else {
        bb->cache.is_function_call = false;
        bb->cache.call_target = NO_TARGET;
    }

    /* Function return analysis */
    bb->cache.function_return = bb->insns.front()->node->is_function_return(inodes);

    bb->validate_cache();
}
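
/* A minimal sketch of how the cached analyses are consumed (hypothetical, from code with access to the partitioner's
 * internals): update_analyses() recomputes the cache only when it is stale, so reading cached results stays cheap until
 * the block changes. */
#if 0
partitioner.update_analyses(bb);                    // no-op if bb->valid_cache()
if (bb->cache.is_function_call)
    printf("calls 0x%08"PRIx64"\n", bb->cache.call_target);
#endif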

/* Returns true if the basic block appears to be a function call; if the call target can be determined it is also returned
 * through the optional target_va pointer. */
bool
Partitioner::is_function_call(BasicBlock *bb, rose_addr_t *target_va)
{
    update_analyses(bb); /*make sure cache is current*/
    if (target_va) *target_va = bb->cache.call_target;
    return bb->cache.is_function_call;
}

/* Returns the set of control flow successors for a basic block, following alias_for links and adding the fall-through
 * address of function calls when the callee may return. The optional "complete" argument is set to indicate whether the
 * returned set is known to contain all successors. */
Disassembler::AddressSet
Partitioner::successors(BasicBlock *bb, bool *complete)
{
    update_analyses(bb); /*make sure cache is current*/

    /* Follow alias_for links. */
    Disassembler::AddressSet retval;
    for (Disassembler::AddressSet::const_iterator si=bb->cache.sucs.begin(); si!=bb->cache.sucs.end(); ++si)
        retval.insert(canonic_block(*si));
    if (complete) *complete = bb->cache.sucs_complete;

    /* Run non-local analyses if necessary. These are never cached here in this block. */

    // If this block ends with what appears to be a function call then we should perhaps add the fall-through address as a
    // successor. Since most calls return, assume that this call may also return unless the target belongs to a function
    // that has not been proven to possibly return.
    if (bb->cache.is_function_call) {
        rose_addr_t fall_through_va = canonic_block(bb->last_insn()->get_address() + bb->last_insn()->get_size());
        rose_addr_t call_target_va = call_target(bb);
        if (call_target_va!=NO_TARGET) {
            Instruction *target_insn = find_instruction(call_target_va, true);
            BasicBlock *target_bb = target_insn ? find_bb_starting(call_target_va, false) : NULL;
            if (!target_insn) {
                // We know the call target, but could not obtain an instruction there. The target might be a dynamically
                // linked function that isn't mapped yet. Assume it may return since most of them can.
                retval.insert(fall_through_va);
            } else if (target_bb && target_bb->function) {
                // There is a basic block at the call target and the block belongs to a function already. This call may
                // return if the target function may return.
                if (target_bb->function->possible_may_return())
                    retval.insert(fall_through_va);
            } else if (target_bb) {
                // There is a basic block at the call target but it hasn't been assigned to a function yet. Assume that it
                // can return since most calls can.
                retval.insert(fall_through_va);
            } else {
                // We were able to disassemble an instruction at the call target, but no basic block exists there yet.
                // Assume that the call may return since most calls can.
                assert(target_insn);
                assert(!target_bb);
                retval.insert(fall_through_va);
            }
        } else {
            retval.insert(fall_through_va); // most calls may return, so assume this one can
        }
    }

    return retval;
}
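
/* A sketch of the fall-through rule implemented above, with hypothetical addresses: for a block ending in
 * "0x401000: call 0x402000" (a five-byte instruction), the successor set normally contains 0x402000 from the ordinary
 * successor analysis plus the fall-through 0x401005, unless 0x402000 belongs to a function already proven non-returning. */
#if 0
bool complete = false;
Disassembler::AddressSet sucs = partitioner.successors(bb, &complete);
#endif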

/* Returns the canonical call target address if the basic block is a function call, or NO_TARGET if it is not. */
rose_addr_t
Partitioner::call_target(BasicBlock *bb)
{
    update_analyses(bb); /*make sure cache is current*/
    if (bb->cache.call_target==NO_TARGET) return NO_TARGET;
    return canonic_block(bb->cache.call_target);
}

/* Returns true if the basic block at the specified virtual address appears to pop the return address from the top of the
 * stack without returning.
 *
 * FIXME: This is far from perfect: it analyzes only the first basic block; it may have incomplete information about where
 * the basic block ends due to not yet having discovered all incoming CFG edges; it doesn't consider cases where the return
 * address is popped but saved and restored later; etc. It also only handles x86 instructions at this time.
 * [RPM 2010-04-30] */
bool
Partitioner::pops_return_address(rose_addr_t va)
{
    using namespace BinaryAnalysis::InstructionSemantics;

    bool on_stack = true; /*assume return address stays on stack; prove otherwise*/

    /* Create the basic block if possible, but if we created it here then we should discard it below. */
    BasicBlock *bb = find_bb_containing(va, false);
    bool preexisting = bb!=NULL;
    if (!bb) bb = find_bb_containing(va);
    if (!bb) return false;
    try {

        SgAsmx86Instruction *last_insn = isSgAsmx86Instruction(bb->last_insn());

        typedef PartialSymbolicSemantics::Policy<> Policy;
        typedef X86InstructionSemantics<Policy, PartialSymbolicSemantics::ValueType> Semantics;
        Policy policy;
        policy.set_map(get_map());
        PartialSymbolicSemantics::ValueType<32> orig_retaddr;
        policy.writeMemory(x86_segreg_ss, policy.readRegister<32>("esp"), orig_retaddr, policy.true_());
        Semantics semantics(policy);

#if 0
        fputs("Partitioner::pops_return_address:\n", stderr);
#endif
        try {
            for (InstructionVector::iterator ii=bb->insns.begin(); ii!=bb->insns.end(); ++ii) {
                SgAsmx86Instruction *insn = isSgAsmx86Instruction(*ii);
                if (!insn) return false;
                if (insn==last_insn && insn->get_kind()==x86_ret) break;
                semantics.processInstruction(insn);
#if 0
                std::ostringstream s;
                s <<"Analysis for " <<unparseInstructionWithAddress(insn) <<std::endl
                  <<policy.get_state();
                fputs(s.str().c_str(), stderr);
#endif
            }
            on_stack = policy.on_stack(orig_retaddr);
            if (!on_stack && debug)
                fprintf(debug, "[B%08"PRIx64"#%zu discards return address]", va, bb->insns.size());
        } catch (const Semantics::Exception&) {
            /*void*/
        } catch (const Policy::Exception&) {
            /*void*/
        }

    } catch (...) {
        if (!preexisting)
            discard(bb);
        throw;
    }

    /* We don't want to have a basic block created just because we did some analysis. */
    if (!preexisting)
        discard(bb);

    /* True if the original return address is no longer on the stack. */
    return !on_stack;
}
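
/* For reference, the position-independent-code idiom that motivates this analysis (a sketch, not from this file): the
 * CALL pushes a return address that the next instruction immediately consumes, so the "call" is not a function call.
 *
 *     call next      ; pushes the address of "next"
 *   next:
 *     pop  ebx       ; ebx = EIP of "next"; the return address is discarded from the stack
 */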

/* Returns the virtual address where a basic block starts. */
rose_addr_t
Partitioner::BasicBlock::address() const
{
    assert(!insns.empty());
    return insns.front()->get_address();
}

/* Returns the virtual address where a data block starts. */
rose_addr_t
Partitioner::DataBlock::address() const
{
    assert(!nodes.empty());
    return nodes.front()->get_address();
}

/* Returns the function that owns a data block, either directly or through the data block's basic block. */
Partitioner::Function *
Partitioner::effective_function(DataBlock *dblock)
{
    if (dblock->function)
        return dblock->function;
    if (dblock->basic_block)
        return dblock->basic_block->function;
    return NULL;
}

/* Returns last instruction, the one that exits from the block. */
Partitioner::Instruction *
Partitioner::BasicBlock::last_insn() const
{
    assert(insns.size()>0);
    return insns.back();
}

/* Removes association between data blocks and basic blocks. */
void
Partitioner::BasicBlock::clear_data_blocks()
{
    for (std::set<DataBlock*>::iterator di=data_blocks.begin(); di!=data_blocks.end(); ++di)
        (*di)->basic_block = NULL;
    data_blocks.clear();
}

/* Release all basic blocks from a function. Do not delete the blocks. */
void
Partitioner::Function::clear_basic_blocks()
{
    for (BasicBlocks::iterator bi=basic_blocks.begin(); bi!=basic_blocks.end(); ++bi)
        bi->second->function = NULL;
    basic_blocks.clear();
}

/* Release all data blocks from a function. Do not delete the blocks. */
void
Partitioner::Function::clear_data_blocks()
{
    for (DataBlocks::iterator bi=data_blocks.begin(); bi!=data_blocks.end(); ++bi)
        bi->second->function = NULL;
    data_blocks.clear();
}

/* Move basic blocks from the other function to this one. */
void
Partitioner::Function::move_basic_blocks_from(Function *other)
{
    for (BasicBlocks::iterator bbi=other->basic_blocks.begin(); bbi!=other->basic_blocks.end(); ++bbi) {
        basic_blocks[bbi->first] = bbi->second;
        bbi->second->function = this;
    }
    other->basic_blocks.clear();

    heads.insert(other->heads.begin(), other->heads.end());
    other->heads.clear();
}

/* Move data blocks from the other function to this one. */
void
Partitioner::Function::move_data_blocks_from(Function *other)
{
    for (DataBlocks::iterator dbi=other->data_blocks.begin(); dbi!=other->data_blocks.end(); ++dbi) {
        data_blocks[dbi->first] = dbi->second;
        dbi->second->function = this;
    }
    other->data_blocks.clear();
}

/* Increase knowledge about the may-return property. */
void
Partitioner::Function::promote_may_return(SgAsmFunction::MayReturn new_value)
{
    switch (get_may_return()) {
        case SgAsmFunction::RET_UNKNOWN:
            set_may_return(new_value);
            break;
        case SgAsmFunction::RET_SOMETIMES:
            if (SgAsmFunction::RET_ALWAYS==new_value || SgAsmFunction::RET_NEVER==new_value)
                set_may_return(new_value);
            break;
        case SgAsmFunction::RET_ALWAYS:
        case SgAsmFunction::RET_NEVER:
            break;
    }
}

/* Print a short summary of function properties to the debug stream. */
void
Partitioner::Function::show_properties(FILE *debug) const
{
    if (debug) {
        std::string may_return_str = stringifySgAsmFunctionMayReturn(get_may_return(), "RET_");
        for (size_t i=0; i<may_return_str.size(); ++i)
            may_return_str[i] = tolower(may_return_str[i]);
        fprintf(debug, "{nbblocks=%zu, ndblocks=%zu, may-return=%s}",
                basic_blocks.size(), data_blocks.size(), may_return_str.c_str());
    }
}

/* Returns this function's basic block at the function entry address, or null if there is none. */
Partitioner::BasicBlock *
Partitioner::Function::entry_basic_block() const
{
    BasicBlocks::const_iterator bi = basic_blocks.find(entry_va);
    return bi==basic_blocks.end() ? NULL : bi->second;
}

/* Return partitioner to initial state */
void
Partitioner::clear()
{
    /* Delete all functions */
    for (Functions::iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
        fi->second->clear_basic_blocks();
        fi->second->clear_data_blocks();
        delete fi->second;
    }
    functions.clear();

    /* Delete all basic blocks. We don't need to call Partitioner::discard() to fix up ptrs because all functions that might
     * have pointed to this block have already been deleted. */
    for (BasicBlocks::iterator bi=basic_blocks.begin(); bi!=basic_blocks.end(); ++bi)
        delete bi->second;
    basic_blocks.clear();

    /* Delete all data blocks, but not the SgAsmStaticData nodes to which they point. */
    for (DataBlocks::iterator bi=data_blocks.begin(); bi!=data_blocks.end(); ++bi)
        delete bi->second;
    data_blocks.clear();

    /* Delete all block IPD configuration data. */
    for (BlockConfigMap::iterator bci=block_config.begin(); bci!=block_config.end(); ++bci)
        delete bci->second;
    block_config.clear();

    /* Delete all instructions by deleting the Partitioner::Instruction objects, but not the underlying SgAsmInstruction
     * objects because the latter might be used by whatever's calling the Partitioner. */
    for (InstructionMap::iterator ii=insns.begin(); ii!=insns.end(); ++ii)
        delete ii->second;
    insns.clear();

    /* Clear all disassembly failures from the cache. */
    clear_disassembler_errors();

    /* Clear statistics and code criteria. This object manages memory for its statistics, but the caller manages the memory
     * for the code criteria. */
    delete aggregate_mean; aggregate_mean = NULL;
    delete aggregate_variance; aggregate_variance = NULL;
    code_criteria = NULL; // owned by user
}

void
Partitioner::load_config(const std::string &filename) {
    if (filename.empty())
        return;
#ifdef _MSC_VER /* tps (06/23/2010) : Does not work under Windows */
    throw IPDParser::Exception("IPD parsing not supported on Windows platforms");
#else
    int fd = open(filename.c_str(), O_RDONLY);
    if (fd<0)
        throw IPDParser::Exception(strerror(errno), filename);
    struct stat sb;
    fstat(fd, &sb);
    char *config = new char[sb.st_size];
    ssize_t nread = read(fd, config, sb.st_size);
    if (nread<0 || nread<sb.st_size) {
        delete[] config;
        close(fd);
        throw IPDParser::Exception(strerror(errno), filename);
    }
    IPDParser(this, config, sb.st_size, filename).parse();
    delete[] config;
    close(fd);
#endif
}

/* Truncate a basic block so it contains only the instructions before the specified virtual address; the instruction at
 * that address and those after it are returned to the pool of instructions not owned by any block. */
void
Partitioner::truncate(BasicBlock *bb, rose_addr_t va)
{
    assert(bb);
    assert(bb==find_bb_containing(va));

    /* Find the cut point in the instruction vector. I.e., the first instruction to remove from the vector. */
    InstructionVector::iterator cut = bb->insns.begin();
    while (cut!=bb->insns.end() && (*cut)->get_address()!=va) ++cut;
    assert(cut!=bb->insns.begin()); /*we can't remove them all since basic blocks are never empty*/

    /* Remove instructions (from the cut point and beyond) and all the data blocks. */
    for (InstructionVector::iterator ii=cut; ii!=bb->insns.end(); ++ii) {
        Instruction *insn = *ii;
        assert(insn->bblock==bb);
        insn->bblock = NULL;
    }
    if (cut!=bb->insns.end()) {
        bb->insns.erase(cut, bb->insns.end());
        bb->clear_data_blocks();
    }
}

/* Append instruction to basic block */
void
Partitioner::append(BasicBlock *bb, Instruction *insn)
{
    assert(bb);
    assert(insn);
    assert(NULL==insn->bblock); /* insn must not have already belonged to a basic block */
    insn->bblock = bb;
    bb->insns.push_back(insn);
}

/* Append a data block to a basic block, updating the ownership links in both directions and recording the reason for the
 * association. */
void
Partitioner::append(BasicBlock *bb, DataBlock *db, unsigned reason)
{
    assert(bb!=NULL);
    assert(db!=NULL);
    db->reason |= reason;

    if (db->basic_block!=NULL)
        db->basic_block->data_blocks.erase(db);

    bb->data_blocks.insert(db);
    db->basic_block = bb;
}

/* Append a basic block to a function. If "keep" is true, the block's address is also recorded as one of the function's
 * "heads" so that later passes won't reassign the block to some other function. */
void
Partitioner::append(Function* f, BasicBlock *bb, unsigned reason, bool keep/*=false*/)
{
    assert(f);
    assert(bb);

    if (keep)
        f->heads.insert(bb->address());
    bb->reason |= reason;

    if (bb->function==f)
        return;

    assert(bb->function==NULL);
    bb->function = f;
    f->basic_blocks[bb->address()] = bb;

    /* If the block is a function return then mark the function as returning. On a transition from a non-returning function
     * to a returning function, we must mark all calling functions as pending so that the fall-through addresses of their
     * function calls to this function are eventually discovered. This includes recursive calls since we may have already
     * discovered the recursive call but not followed the fall-through address. Marking callers as "pending" happens in the
     * analyze_cfg() method, where we can handle all the calls at the same time (more efficient than doing one at a time
     * right here). */
    update_analyses(bb);
    if (bb->cache.function_return)
        f->promote_may_return(SgAsmFunction::RET_SOMETIMES);
}

/* Append a data block to a function. If "force" is set, the block is first removed from any function or basic block that
 * currently owns it. */
void
Partitioner::append(Function *func, DataBlock *block, unsigned reason, bool force)
{
    assert(func);
    assert(block);

    if (force) {
        if (block->function)
            remove(block->function, block);
        if (block->basic_block)
            remove(block->basic_block, block);
    }

    block->reason |= reason;
    if (block->function==func)
        return;

    assert(block->function==NULL);
    block->function = func;
    func->data_blocks[block->address()] = block;
}

/* Remove a basic block from a function. */
void
Partitioner::remove(Function *f, BasicBlock *bb)
{
    assert(f);
    assert(bb);
    assert(bb->function==f);
    bb->function = NULL;
    f->basic_blocks.erase(bb->address());
}

/* Remove a data block from a function. */
void
Partitioner::remove(Function *f, DataBlock *db)
{
    assert(f);
    assert(db);
    assert(db->function==f);
    db->function = NULL;
    f->data_blocks.erase(db->address());
}

/* Remove a data block from a basic block. */
void
Partitioner::remove(BasicBlock *bb, DataBlock *db)
{
    assert(bb!=NULL);
    if (db && db->basic_block==bb) {
        bb->data_blocks.erase(db);
        db->basic_block = NULL;
    }
}

/* Remove instruction from consideration. */
Partitioner::Instruction *
Partitioner::discard(Instruction *insn, bool discard_entire_block)
{
    if (insn) {
        rose_addr_t va = insn->get_address();
        BasicBlock *bb = find_bb_containing(va, false);
        if (bb) {
            if (discard_entire_block) {
                discard(bb);
            } else if (bb->insns.front()==insn) {
                discard(bb);
            } else {
                truncate(bb, va);
            }
        }
        insns.erase(va);
    }
    return NULL;
}

/* Delete block, returning its instructions back to the (implied) list of free instructions. */
Partitioner::BasicBlock *
Partitioner::discard(BasicBlock *bb)
{
    if (bb) {
        assert(NULL==bb->function);

        /* Remove instructions from the block, returning them to the (implied) list of free instructions. */
        for (InstructionVector::iterator ii=bb->insns.begin(); ii!=bb->insns.end(); ++ii) {
            Instruction *insn = *ii;
            assert(insn->bblock==bb);
            insn->bblock = NULL;
        }

        /* Remove the association between data blocks and this basic block. */
        bb->clear_data_blocks();

        /* Remove the block from the partitioner. */
        basic_blocks.erase(bb->address());
        delete bb;
    }
    return NULL;
}

/* Finds (or possibly creates) an instruction beginning at the specified address. */
Partitioner::Instruction *
Partitioner::find_instruction(rose_addr_t va, bool create/*=true*/)
{
    InstructionMap::iterator ii = insns.find(va);
    if (create && disassembler && ii==insns.end() && bad_insns.find(va)==bad_insns.end()) {
        Instruction *insn = NULL;
        try {
            insn = new Instruction(disassembler->disassembleOne(map, va, NULL));
            ii = insns.insert(std::make_pair(va, insn)).first;
        } catch (const Disassembler::Exception &e) {
            bad_insns.insert(std::make_pair(va, e));
        }
    }
    return ii==insns.end() ? NULL : ii->second;
}

/* Finds a basic block containing the instruction at the specified virtual address, optionally creating the block if it
 * doesn't exist yet. A newly created block grows greedily: instructions are appended until the naive control flow becomes
 * ambiguous or incomplete, a function call is reached while call targets are used as a function heuristic, or the next
 * instruction already belongs to some other block. */
Partitioner::BasicBlock *
Partitioner::find_bb_containing(rose_addr_t va, bool create/*=true*/)
{
    Instruction *insn = find_instruction(va);
    if (!insn)
        return NULL;
    if (!create || insn->bblock!=NULL)
        return insn->bblock;
    BasicBlock *bb = insn->bblock;
    if (!bb) {
        bb = new BasicBlock;
        basic_blocks.insert(std::make_pair(va, bb));
    }

    while (1) {
        append(bb, insn);

        /* Find address of next instruction, or whether this insn is the end of the block */
        va += insn->get_size();
        if (insn->terminates_basic_block()) { /*naively terminates?*/
            bool complete;
            const Disassembler::AddressSet& sucs = successors(bb, &complete);
            if ((func_heuristics & SgAsmFunction::FUNC_CALL_TARGET) && is_function_call(bb, NULL)) {
                /* When we are detecting functions based on x86 CALL instructions (or similar for other architectures) then
                 * the instruction after the CALL should never be part of this basic block. Otherwise allow the call to be
                 * part of the basic block initially and we'll split the block later if we need to. */
                break;
            } else if (allow_discont_blocks) {
                if (!complete || sucs.size()!=1)
                    break;
                va = *(sucs.begin());
            } else {
                if (!complete || sucs.size()!=1 || *(sucs.begin())!=va)
                    break;
            }
        }

        /* Get the next instruction */
        insn = find_instruction(va);
        if (!insn || insn->bblock)
            break;
    }
    return bb;
}

/* Finds (or creates) a basic block that begins at the specified virtual address, splitting an existing block in two if the
 * address falls in its middle. */
Partitioner::BasicBlock *
Partitioner::find_bb_starting(rose_addr_t va, bool create/*=true*/)
{
    BasicBlock *bb = find_bb_containing(va, create);
    if (!bb)
        return NULL;
    if (va==bb->address())
        return bb;
    if (!create)
        return NULL;
    if (debug)
        fprintf(debug, "[split from B%08"PRIx64"#%zu]", bb->address(), bb->insns.size());
    if (bb->function!=NULL)
        bb->function->pending = true;
    truncate(bb, va);
    bb = find_bb_containing(va);
    assert(bb!=NULL);
    assert(va==bb->address());
    return bb;
}

/* Follows a basic block's "alias_for" links (set via IPD configuration) to the canonical block address. */
rose_addr_t
Partitioner::canonic_block(rose_addr_t va)
{
    for (size_t i=0; i<100; i++) {
        BasicBlock *bb = find_bb_starting(va, false);
        if (!bb || !bb->cache.alias_for) return va;
        if (debug) fprintf(debug, "[B%08"PRIx64"->B%08"PRIx64"]", va, bb->cache.alias_for);
        va = bb->cache.alias_for;
    }
    assert(!"possible alias loop");
    return va;
}
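
/* A small sketch of alias chasing, assuming hypothetical blocks configured with IPD "alias_for" values: if the block at
 * 0x1000 is an alias for 0x2000 and the block at 0x2000 is an alias for 0x3000, the chain resolves to the final address
 * (bounded at 100 hops to catch accidental cycles). */
#if 0
rose_addr_t canonic_va = partitioner.canonic_block(0x1000);     // follows 0x1000 -> 0x2000 -> 0x3000
assert(canonic_va==0x3000);
#endif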

/* Finds an existing function definition. */
Partitioner::Function *
Partitioner::find_function(rose_addr_t entry_va)
{
    Functions::iterator fi = functions.find(entry_va);
    if (fi==functions.end()) return NULL;
    return fi->second;
}

/* Adds or updates a function definition. */
Partitioner::Function *
Partitioner::add_function(rose_addr_t entry_va, unsigned reasons, std::string name)
{
    Function *f = NULL;
    Functions::iterator fi = functions.find(entry_va);
    if (fi==functions.end()) {
        f = new Function(entry_va, reasons, name);
        functions[entry_va] = f;
    } else {
        f = fi->second;
        assert(f->entry_va==entry_va);
        if (reasons & SgAsmFunction::FUNC_MISCMASK)
            f->reason &= ~SgAsmFunction::FUNC_MISCMASK;
        f->reason |= reasons;
        if (name!="") f->name = name;
    }
    return f;
}

/* Do whatever's necessary to finish loading IPD configuration. */
void
Partitioner::mark_ipd_configuration()
{
    using namespace BinaryAnalysis::InstructionSemantics;

    for (BlockConfigMap::iterator bci=block_config.begin(); bci!=block_config.end(); ++bci) {
        rose_addr_t va = bci->first;
        BlockConfig *bconf = bci->second;

        BasicBlock *bb = find_bb_starting(va);
        if (!bb)
            throw Exception("cannot obtain IPD-specified basic block at " + StringUtility::addrToString(va));
        if (bb->insns.size()<bconf->ninsns)
            throw Exception("cannot obtain " + StringUtility::numberToString(bconf->ninsns) + "-instruction basic block at " +
                            StringUtility::addrToString(va) + " (only " + StringUtility::numberToString(bb->insns.size()) +
                            " available)");
        if (bb->insns.size()>bconf->ninsns)
            truncate(bb, bb->insns[bconf->ninsns]->get_address());

        /* Initial analysis, augmented by settings from the configuration. */
        update_analyses(bb);
        bb->cache.alias_for = bconf->alias_for;
        if (bconf->sucs_specified) {
            bb->cache.sucs = bconf->sucs;
            bb->cache.sucs_complete = bconf->sucs_complete;
        }
        if (!bconf->sucs_program.empty()) {
            /* "Execute" the program that will detect successors. We do this by interpreting the basic block to initialize
             * registers, loading the successor program, pushing some arguments onto the program's stack, interpreting the
             * program, extracting return values from memory, and unloading the program. */
            bool debug = false;
            char block_name_str[64];
            sprintf(block_name_str, "B%08"PRIx64, va);
            std::string block_name = block_name_str;
            if (debug) fprintf(stderr, "running successors program for %s\n", block_name_str);

            // FIXME: Use a copy (COW) version of the map so we don't need to modify the real map and so that the simulated
            // program can't accidentally modify the stuff being disassembled. [RPM 2012-05-07]
            MemoryMap *map = get_map();
            assert(map!=NULL);
            typedef PartialSymbolicSemantics::Policy<> Policy;
            typedef X86InstructionSemantics<Policy, PartialSymbolicSemantics::ValueType> Semantics;
            Policy policy;
            policy.set_map(map);
            Semantics semantics(policy);

            if (debug) fprintf(stderr, "  running semantics for the basic block...\n");
            for (InstructionVector::iterator ii=bb->insns.begin(); ii!=bb->insns.end(); ++ii) {
                SgAsmx86Instruction *insn = isSgAsmx86Instruction(*ii);
                assert(insn!=NULL);
                semantics.processInstruction(insn);
            }

            /* Load the program. Keep at least one unmapped byte between the program text, stack, and svec areas in order to
             * help with debugging. */
            if (debug) fprintf(stderr, "  loading the program...\n");

            /* Load the instructions to execute */
            rose_addr_t text_va = map->find_free(0, bconf->sucs_program.size(), 4096);
            MemoryMap::Segment text_sgmt(MemoryMap::ExternBuffer::create(&(bconf->sucs_program[0]), bconf->sucs_program.size()),
                                         0, MemoryMap::MM_PROT_RX, block_name + " successors program text");
            map->insert(Extent(text_va, bconf->sucs_program.size()), text_sgmt);

            /* Create a stack */
            static const size_t stack_size = 8192;
            rose_addr_t stack_va = map->find_free(text_va+bconf->sucs_program.size()+1, stack_size, 4096);
            MemoryMap::Segment stack_sgmt(MemoryMap::AnonymousBuffer::create(stack_size), 0,
                                          MemoryMap::MM_PROT_RW, block_name + " successors stack");
            map->insert(Extent(stack_va, stack_size), stack_sgmt);
            rose_addr_t stack_ptr = stack_va + stack_size;

            /* Create an area for the returned vector of successors */
            static const size_t svec_size = 8192;
            rose_addr_t svec_va = map->find_free(stack_va+stack_size+1, svec_size, 4096);
            MemoryMap::Segment svec_sgmt(MemoryMap::AnonymousBuffer::create(svec_size), 0,
                                         MemoryMap::MM_PROT_RW, block_name + " successors vector");
            map->insert(Extent(svec_va, svec_size), svec_sgmt);

            /* Decide on a "return" address. Eventually the successors program will execute a "RET" instruction that will
             * return to this address. We can choose something arbitrary as long as it doesn't conflict with anything else.
             * We'll use the first byte past the end of the successor program, which gives the added benefit that the
             * successor program doesn't actually have to even return -- it can just fall off the end. */
            rose_addr_t return_va = text_va + bconf->sucs_program.size();
            if (debug) {
                fprintf(stderr, "  memory map after program is loaded:\n");
                map->dump(stderr, "    ");
            }

            /* Push arguments onto the stack in reverse order. */
            if (debug) fprintf(stderr, "  setting up the call frame...\n");

            /* old stack pointer */
            stack_ptr -= 4;
            policy.writeMemory<32>(x86_segreg_ss, policy.number<32>(stack_ptr),
                                   policy.readRegister<32>("esp"), policy.true_());

            /* address past the basic block's last instruction */
            stack_ptr -= 4;
            policy.writeMemory<32>(x86_segreg_ss, policy.number<32>(stack_ptr),
                                   policy.number<32>(bb->insns.back()->get_address()+bb->insns.back()->get_size()),
                                   policy.true_());

            /* address of basic block's first instruction */
            stack_ptr -= 4;
            policy.writeMemory<32>(x86_segreg_ss, policy.number<32>(stack_ptr),
                                   policy.number<32>(bb->insns.front()->get_address()), policy.true_());

            /* size of svec in bytes */
            stack_ptr -= 4;
            policy.writeMemory<32>(x86_segreg_ss, policy.number<32>(stack_ptr),
                                   policy.number<32>(svec_size), policy.true_());

            /* address of svec */
            stack_ptr -= 4;
            policy.writeMemory<32>(x86_segreg_ss, policy.number<32>(stack_ptr),
                                   policy.number<32>(svec_va), policy.true_());

            /* return address for successors program */
            stack_ptr -= 4;
            policy.writeMemory<32>(x86_segreg_ss, policy.number<32>(stack_ptr),
                                   policy.number<32>(return_va), policy.true_());

            /* Adjust policy stack pointer */
            policy.writeRegister("esp", policy.number<32>(stack_ptr));

            /* Interpret the program */
            if (debug) fprintf(stderr, "  running the program...\n");
            assert(disassembler!=NULL);
            policy.writeRegister("eip", policy.number<32>(text_va));
            while (1) {
                rose_addr_t ip = policy.readRegister<32>("eip").known_value();
                if (ip==return_va) break;
                SgAsmx86Instruction *insn = isSgAsmx86Instruction(disassembler->disassembleOne(map, ip));
                if (debug) fprintf(stderr, "    0x%08"PRIx64": %s\n", ip, insn?unparseInstruction(insn).c_str():"<null>");
                assert(insn!=NULL);
                semantics.processInstruction(insn);
                assert(policy.readRegister<32>("eip").is_known());
            }

            /* Extract the list of successors. The number of successors is the first element of the list. */
            if (debug) fprintf(stderr, "  extracting program return values...\n");
            PartialSymbolicSemantics::ValueType<32> nsucs = policy.readMemory<32>(x86_segreg_ss, policy.number<32>(svec_va),
                                                                                  policy.true_());
            assert(nsucs.is_known());
            if (debug) fprintf(stderr, "  number of successors: %"PRId64"\n", nsucs.known_value());
            assert(nsucs.known_value()*4 <= svec_size-4); /*first entry is the size*/
            for (size_t i=0; i<nsucs.known_value(); i++) {
                PartialSymbolicSemantics::ValueType<32> suc_va = policy.readMemory<32>(x86_segreg_ss,
                                                                                       policy.number<32>(svec_va+4+i*4),
                                                                                       policy.true_());
                if (suc_va.is_known()) {
                    if (debug) fprintf(stderr, "    #%zu: 0x%08"PRIx64"\n", i, suc_va.known_value());
                    bb->cache.sucs.insert(suc_va.known_value());
                } else {
                    if (debug) fprintf(stderr, "    #%zu: unknown\n", i);
                    bb->cache.sucs_complete = false;
                }
            }

            /* Unmap the program */
            if (debug) fprintf(stderr, "  unmapping the program...\n");
            map->erase(text_sgmt);
            map->erase(stack_sgmt);
            map->erase(svec_sgmt);

            if (debug) fprintf(stderr, "  done.\n");
        }
    }
}
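
/* For reference, the call frame built above gives the successors program this stack when it begins executing (derived
 * from the pushes above; ESP points at the top):
 *
 *     esp+0:  return_va   address past the program text; reaching it (or falling off the end) stops interpretation
 *     esp+4:  svec_va     where the program stores the successor count followed by that many addresses
 *     esp+8:  svec_size   bytes available at svec_va
 *     esp+12: address of the basic block's first instruction
 *     esp+16: address just past the basic block's last instruction
 *     esp+20: ESP value left over from interpreting the basic block
 */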

/* Marks program entry addresses as functions. */
void
Partitioner::mark_entry_targets(SgAsmGenericHeader *fhdr)
{
    assert(fhdr!=NULL);
    SgRVAList entries = fhdr->get_entry_rvas();

    // Libraries don't have entry addresses
    if ((entries.empty() || (1==entries.size() && 0==entries[0].get_rva())) &&
        fhdr->get_exec_format()->get_purpose()==SgAsmGenericFormat::PURPOSE_LIBRARY) {
        return;
    }

    for (size_t i=0; i<entries.size(); i++) {
        rose_addr_t entry_va = entries[i].get_rva() + fhdr->get_base_va();
        if (find_instruction(entry_va))
            add_function(entry_va, SgAsmFunction::FUNC_ENTRY_POINT);
    }
}

/* Use the Frame Descriptor Entry Records of the ELF .eh_frame section to mark functions. */
void
Partitioner::mark_eh_frames(SgAsmGenericHeader *fhdr)
{
    SgAsmGenericSectionList *sections = fhdr->get_sections();
    for (size_t i=0; i<sections->get_sections().size(); i++) {
        SgAsmElfEHFrameSection *ehframe = isSgAsmElfEHFrameSection(sections->get_sections()[i]);
        if (ehframe!=NULL) {
            SgAsmElfEHFrameEntryCIList *ci_entries = ehframe->get_ci_entries();
            for (size_t j=0; j<ci_entries->get_entries().size(); j++) {
                SgAsmElfEHFrameEntryCI *cie = ci_entries->get_entries()[j];
                SgAsmElfEHFrameEntryFDList *fd_entries = cie->get_fd_entries();
                for (size_t k=0; k<fd_entries->get_entries().size(); k++) {
                    SgAsmElfEHFrameEntryFD *fde = fd_entries->get_entries()[k];
                    rose_addr_t target = fde->get_begin_rva().get_rva();
                    if (find_instruction(target))
                        add_function(target, SgAsmFunction::FUNC_EH_FRAME);
                }
            }
        }
    }
}

/* Adds each entry of the ELF procedure linkage table (.plt section) to the list of functions. */
void
Partitioner::mark_elf_plt_entries(SgAsmGenericHeader *fhdr)
{
    /* This function is ELF, x86 specific. */
    SgAsmElfFileHeader *elf = isSgAsmElfFileHeader(fhdr);
    if (!elf) return;

    /* Find important sections */
    SgAsmGenericSection *plt = elf->get_section_by_name(".plt");
    if (!plt || !plt->is_mapped()) return;
    SgAsmGenericSection *gotplt = elf->get_section_by_name(".got.plt");
    if (!gotplt || !gotplt->is_mapped()) return;

    /* Find all relocation sections */
    std::set<SgAsmElfRelocSection*> rsects;
    const SgAsmGenericSectionPtrList &sections = elf->get_sections()->get_sections();
    for (SgAsmGenericSectionPtrList::const_iterator si=sections.begin(); si!=sections.end(); ++si) {
        SgAsmElfRelocSection *reloc_section = isSgAsmElfRelocSection(*si);
        if (reloc_section)
            rsects.insert(reloc_section);
    }
    if (rsects.empty()) return;

    /* Look at each instruction in the .plt section. If the instruction is a computed jump to an address stored in the
     * .got.plt then we've found the beginning of a plt trampoline. */
    rose_addr_t plt_offset = 14; /* skip the first entry (PUSH ds:XXX; JMP ds:YYY; 0x00; 0x00)--the JMP is not a function*/
    while (plt_offset<plt->get_mapped_size()) {

        /* Find an x86 instruction */
        Instruction *insn = find_instruction(plt->get_mapped_actual_va()+plt_offset);
        if (!insn) {
            ++plt_offset;
            continue;
        }
        plt_offset += insn->get_size();
        SgAsmx86Instruction *insn_x86 = isSgAsmx86Instruction(insn);
        if (!insn_x86) continue;

        rose_addr_t gotplt_va = get_indirection_addr(insn_x86, elf->get_base_va()+gotplt->get_mapped_preferred_rva());
        if (gotplt_va < elf->get_base_va() + gotplt->get_mapped_preferred_rva() ||
            gotplt_va >= elf->get_base_va() + gotplt->get_mapped_preferred_rva() + gotplt->get_mapped_size()) {
            continue; /* jump is not indirect through the .got.plt section */
        }

        /* Find the relocation entry whose offset is the gotplt_va and use that entry's symbol for the function name. */
        std::string name;
        for (std::set<SgAsmElfRelocSection*>::iterator ri=rsects.begin(); ri!=rsects.end() && name.empty(); ++ri) {
            SgAsmElfRelocEntryList *entries = (*ri)->get_entries();
            SgAsmElfSymbolSection *symbol_section = isSgAsmElfSymbolSection((*ri)->get_linked_section());
            SgAsmElfSymbolList *symbols = symbol_section ? symbol_section->get_symbols() : NULL;
            for (size_t ei=0; ei<entries->get_entries().size() && name.empty() && symbols; ++ei) {
                SgAsmElfRelocEntry *rel = entries->get_entries()[ei];
                if (rel->get_r_offset()==gotplt_va) {
                    unsigned long symbol_idx = rel->get_sym();
                    if (symbol_idx < symbols->get_symbols().size()) {
                        SgAsmElfSymbol *symbol = symbols->get_symbols()[symbol_idx];
                        name = symbol->get_name()->get_string() + "@plt";
                    }
                }
            }
        }

        Function *plt_func = add_function(insn->get_address(), SgAsmFunction::FUNC_IMPORT, name);

        /* FIXME: Assume that most PLT functions return. We make this assumption for now because the PLT table contains an
         * indirect jump through the .got.plt data area and we don't yet do static analysis of the data. Because of
         * that, all the PLT functions will contain only a basic block with the single indirect jump, and no return
         * (e.g., x86 RET or RETF) instruction, and therefore the function would not normally be marked as returning.
         * [RPM 2010-05-11] */
        if ("abort@plt"!=name && "execl@plt"!=name && "execlp@plt"!=name && "execv@plt"!=name && "execvp@plt"!=name &&
            "exit@plt"!=name && "_exit@plt"!=name && "fexecve@plt"!=name &&
            "longjmp@plt"!=name && "__longjmp@plt"!=name && "siglongjmp@plt"!=name) {
            plt_func->promote_may_return(SgAsmFunction::RET_SOMETIMES);
        } else {
            plt_func->promote_may_return(SgAsmFunction::RET_NEVER);
        }
    }
}
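
/* Shape of the trampolines this loop recognizes, sketched for a typical 32-bit ELF .plt (exact bytes vary by linker; the
 * reserved first entry, 14 bytes here, is skipped via plt_offset=14):
 *
 *   PLT0:   push ds:[GOTPLT+4]        ; not a function
 *           jmp  ds:[GOTPLT+8]
 *   PLTn:   jmp  ds:[GOTPLT+4*(n+2)]  ; recognized here and named "symbol@plt"
 *           push reloc_index
 *           jmp  PLT0
 */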

/* Use symbol tables to determine function entry points. */
void
Partitioner::mark_func_symbols(SgAsmGenericHeader *fhdr)
{
    SgAsmGenericSectionList *sections = fhdr->get_sections();
    for (size_t i=0; i<sections->get_sections().size(); i++) {

        /* If this is a symbol table of some sort, then get the list of symbols. */
        std::vector<SgAsmGenericSymbol*> symbols;
        if (isSgAsmElfSymbolSection(sections->get_sections()[i])) {
            SgAsmElfSymbolList *elf_symbols = isSgAsmElfSymbolSection(sections->get_sections()[i])->get_symbols();
            for (size_t j=0; j<elf_symbols->get_symbols().size(); j++) {
                symbols.push_back(elf_symbols->get_symbols()[j]);
            }
        } else if (isSgAsmCoffSymbolTable(sections->get_sections()[i])) {
            SgAsmCoffSymbolList *coff_symbols = isSgAsmCoffSymbolTable(sections->get_sections()[i])->get_symbols();
            for (size_t j=0; j<coff_symbols->get_symbols().size(); j++) {
                symbols.push_back(coff_symbols->get_symbols()[j]);
            }
        }

        for (size_t j=0; j<symbols.size(); j++) {
            SgAsmGenericSymbol *symbol = symbols[j];
            if (symbol->get_def_state()==SgAsmGenericSymbol::SYM_DEFINED &&
                symbol->get_type()==SgAsmGenericSymbol::SYM_FUNC &&
                symbol->get_value()!=0) {
                rose_addr_t value = fhdr->get_base_va() + symbol->get_value();
                SgAsmGenericSection *section = symbol->get_bound();

                // Add a function at the symbol's value. If the symbol is bound to a section and the section is mapped at a
                // different address than it expected to be mapped, then adjust the symbol's value by the same amount.
                rose_addr_t va_1 = value;
                if (section!=NULL && section->is_mapped() &&
                    section->get_mapped_preferred_va()!=section->get_mapped_actual_va()) {
                    va_1 += section->get_mapped_actual_va() - section->get_mapped_preferred_va();
                }
                if (find_instruction(va_1))
                    add_function(va_1, SgAsmFunction::FUNC_SYMBOL, symbol->get_name()->get_string());

                // Sometimes weak symbol values are offsets from a section (this code handles that), but other times the
                // value is used directly (the code above handled that case).
                if (section && symbol->get_binding()==SgAsmGenericSymbol::SYM_WEAK)
                    value += section->get_mapped_actual_va();
                if (find_instruction(value))
                    add_function(value, SgAsmFunction::FUNC_SYMBOL, symbol->get_name()->get_string());
            }
        }
    }
}

/* Adds PE exports as function entry points. */
void
Partitioner::mark_export_entries(SgAsmGenericHeader *fhdr)
{
    SgAsmGenericSectionList *sections = fhdr->get_sections();
    for (size_t i=0; i<sections->get_sections().size(); ++i) {
        if (SgAsmPEExportSection *export_section = isSgAsmPEExportSection(sections->get_sections()[i])) {
            const SgAsmPEExportEntryPtrList &exports = export_section->get_exports()->get_exports();
            for (SgAsmPEExportEntryPtrList::const_iterator ei=exports.begin(); ei!=exports.end(); ++ei) {
                rose_addr_t va = (*ei)->get_export_rva().get_va();
                if (find_instruction(va))
                    add_function(va, SgAsmFunction::FUNC_EXPORT, (*ei)->get_name()->get_string());
            }
        }
    }
}

/* Tries to match "(mov rdi,rdi)?; push rbp; mov rbp,rsp" (or the 32-bit equivalent). */
Partitioner::InstructionMap::const_iterator
Partitioner::pattern1(const InstructionMap& insns, InstructionMap::const_iterator first, Disassembler::AddressSet &exclude)
{
    InstructionMap::const_iterator ii = first;
    Disassembler::AddressSet matches;

    /* Look for optional "mov rdi, rdi"; if found, advance ii iterator to fall-through instruction */
    do {
        SgAsmx86Instruction *insn = isSgAsmx86Instruction(ii->second);
        if (!insn || insn->get_kind()!=x86_mov)
            break;
        const SgAsmExpressionPtrList &opands = insn->get_operandList()->get_operands();
        if (opands.size()!=2)
            break;
        SgAsmx86RegisterReferenceExpression *rre = isSgAsmx86RegisterReferenceExpression(opands[0]);
        if (!rre ||
            rre->get_descriptor().get_major()!=x86_regclass_gpr ||
            rre->get_descriptor().get_minor()!=x86_gpr_di)
            break;
        rre = isSgAsmx86RegisterReferenceExpression(opands[1]);
        if (!rre ||
            rre->get_descriptor().get_major()!=x86_regclass_gpr ||
            rre->get_descriptor().get_minor()!=x86_gpr_di)
            break;
        matches.insert(ii->first);
        ii = insns.find(ii->first + insn->get_size());
    } while (0);

    /* Look for "push rbp" */
    {
        if (ii==insns.end())
            return insns.end();
        SgAsmx86Instruction *insn = isSgAsmx86Instruction(ii->second);
        if (!insn || insn->get_kind()!=x86_push)
            return insns.end();
        const SgAsmExpressionPtrList &opands = insn->get_operandList()->get_operands();
        if (opands.size()!=1)
            return insns.end();
        SgAsmx86RegisterReferenceExpression *rre = isSgAsmx86RegisterReferenceExpression(opands[0]);
        if (!rre ||
            rre->get_descriptor().get_major()!=x86_regclass_gpr ||
            rre->get_descriptor().get_minor()!=x86_gpr_bp)
            return insns.end();
        matches.insert(ii->first);
        ii = insns.find(ii->first + insn->get_size());
    }

    /* Look for "mov rbp,rsp" */
    {
        if (ii==insns.end())
            return insns.end();
        SgAsmx86Instruction *insn = isSgAsmx86Instruction(ii->second);
        if (!insn || insn->get_kind()!=x86_mov)
            return insns.end();
        const SgAsmExpressionPtrList &opands = insn->get_operandList()->get_operands();
        if (opands.size()!=2)
            return insns.end();
        SgAsmx86RegisterReferenceExpression *rre = isSgAsmx86RegisterReferenceExpression(opands[0]);
        if (!rre ||
            rre->get_descriptor().get_major()!=x86_regclass_gpr ||
            rre->get_descriptor().get_minor()!=x86_gpr_bp)
            return insns.end();
        rre = isSgAsmx86RegisterReferenceExpression(opands[1]);
        if (!rre ||
            rre->get_descriptor().get_major()!=x86_regclass_gpr ||
            rre->get_descriptor().get_minor()!=x86_gpr_sp)
            return insns.end();
        matches.insert(ii->first);
    }

    exclude.insert(matches.begin(), matches.end());
    return first;
}
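
/* The sequence pattern1() matches is the classic frame-pointer prologue, optionally preceded by the two-byte
 * "mov edi,edi" pad that some compilers emit as a hot-patch point (a sketch; byte encodings shown for 32-bit code):
 *
 *   8b ff     mov edi, edi     ; optional hot-patch pad
 *   55        push ebp
 *   8b ec     mov ebp, esp
 */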
1483 
1484 #if 0 /*commented out in Partitioner::mark_func_patterns()*/
1485 /* Tries to match "nop;nop;nop" followed by something that's not a nop. */
1486 Partitioner::InstructionMap::const_iterator
1487 Partitioner::pattern2(const InstructionMap& insns, InstructionMap::const_iterator first, Disassembler::AddressSet &exclude)
1488 {
1489  InstructionMap::const_iterator ii = first;
1490  Disassembler::AddressSet matches;
1491 
1492  /* Look for three "nop" instructions */
1493  for (size_t i=0; i<3; i++) {
1494  SgAsmx86Instruction *nop = isSgAsmx86Instruction(ii->second);
1495  if (!nop) return insns.end();
1496  if (nop->get_kind()!=x86_nop) return insns.end();
1497  if (nop->get_operandList()->get_operands().size()!=0) return insns.end(); /*only zero-arg NOPs allowed*/
1498  matches.insert(ii->first);
1499  ii = insns.find(ii->first + nop->get_size());
1500  if (ii==insns.end()) return insns.end();
1501  }
1502 
1503  /* Look for something that's not a "nop"; this is the function entry point. */
1504  SgAsmx86Instruction *notnop = isSgAsmx86Instruction(ii->second);
1505  if (!notnop) return insns.end();
1506  if (notnop->get_kind()==x86_nop) return insns.end();
1507  matches.insert(ii->first);
1508 
1509  exclude.insert(matches.begin(), matches.end());
1510  return ii;
1511 }
1512 #endif

#if 0 /* commented out in Partitioner::mark_func_patterns() */
/* Tries to match "leave;ret" followed by one or more "nop" followed by a non-nop */
Partitioner::InstructionMap::const_iterator
Partitioner::pattern3(const InstructionMap& insns, InstructionMap::const_iterator first, Disassembler::AddressSet &exclude)
{
    InstructionMap::const_iterator ii = first;
    Disassembler::AddressSet matches;

    /* leave; ret; nop */
    for (size_t i=0; i<3; i++) {
        SgAsmx86Instruction *insn = isSgAsmx86Instruction(ii->second);
        if (!insn) return insns.end();
        if ((i==0 && insn->get_kind()!=x86_leave) ||
            (i==1 && insn->get_kind()!=x86_ret) ||
            (i==2 && insn->get_kind()!=x86_nop))
            return insns.end();
        matches.insert(ii->first);
        ii = insns.find(ii->first + insn->get_size());
        if (ii==insns.end()) return insns.end();
    }

    /* Zero or more "nop" instructions */
    while (1) {
        SgAsmx86Instruction *insn = isSgAsmx86Instruction(ii->second);
        if (!insn) return insns.end();
        if (insn->get_kind()!=x86_nop) break;
        matches.insert(ii->first);
        ii = insns.find(ii->first + insn->get_size());
        if (ii==insns.end()) return insns.end();
    }

    /* This must be something that's not a "nop", but make sure it's an x86 instruction anyway. */
    SgAsmx86Instruction *insn = isSgAsmx86Instruction(ii->second);
    if (!insn) return insns.end();
    matches.insert(ii->first);

    exclude.insert(matches.begin(), matches.end());
    return ii;
}
#endif

/* Scans for byte and instruction patterns that look like function prologues and marks the matching addresses as function
 * entry points. */
void
Partitioner::mark_func_patterns()
{
    // Create functions when we see certain patterns of bytes
    struct T1: ByteRangeCallback {
        Partitioner *p;
        T1(Partitioner *p): p(p) {}
        virtual bool operator()(bool enabled, const Args &args) /*override*/ {
            assert(args.restrict_map!=NULL);
            uint8_t buf[4096];
            static const size_t patternsz=16;
            assert(patternsz<sizeof buf);
            if (enabled) {
                rose_addr_t va = args.range.first();
                while (va<=args.range.last()) {
                    size_t nbytes = std::min(args.range.last()+1-va, (rose_addr_t)sizeof buf);
                    size_t nread = args.restrict_map->read(buf, va, nbytes);
                    for (size_t i=0; i<nread; ++i) {
                        // "55 8b ec" is x86 "push ebp; mov ebp, esp"
                        if (i+3<nread && 0x55==buf[i+0] && 0x8b==buf[i+1] && 0xec==buf[i+2]) {
                            p->add_function(va+i, SgAsmFunction::FUNC_PATTERN);
                            i+=2;
                        }
                    }
                    va += nread;
                }
            }
            return enabled;
        }
    } t1(this);
    MemoryMap *mm = get_map();
    if (mm)
        scan_unassigned_bytes(&t1, mm);

    // Create functions when we see certain patterns of instructions. Note that this will only work if we've already
    // disassembled the instructions.
    Disassembler::AddressSet exclude;
    InstructionMap::const_iterator found;

    for (InstructionMap::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
        if (exclude.find(ii->first)==exclude.end() && (found=pattern1(insns, ii, exclude))!=insns.end())
            add_function(found->first, SgAsmFunction::FUNC_PATTERN);
    }
#if 0 /* Disabled because NOPs sometimes legitimately appear inside functions */
    for (InstructionMap::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
        if (exclude.find(ii->first)==exclude.end() && (found=pattern2(insns, ii, exclude))!=insns.end())
            add_function(found->first, SgAsmFunction::FUNC_PATTERN);
    }
#endif
#if 0 /* Disabled because NOPs sometimes follow "leave;ret" for functions with multiple returns. */
    for (InstructionMap::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
        if (exclude.find(ii->first)==exclude.end() && (found=pattern3(insns, ii, exclude))!=insns.end())
            add_function(found->first, SgAsmFunction::FUNC_PATTERN);
    }
#endif
}

/* Make all CALL/FARCALL targets functions. This is a naive approach that won't work for some obfuscated software. A more
 * thorough approach considers only those calls that are reachable. A CALL whose target is the address following the CALL
 * instruction is not counted as a function call. */
void
Partitioner::mark_call_insns()
{
    for (InstructionMap::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
        std::vector<SgAsmInstruction*> iv;
        iv.push_back(ii->second->node);
        rose_addr_t target_va=NO_TARGET;
        if (ii->second->node->is_function_call(iv, &target_va, NULL) && target_va!=NO_TARGET &&
            target_va!=ii->first + ii->second->get_size()) {
            add_function(target_va, SgAsmFunction::FUNC_CALL_TARGET, "");
        }
    }
}

/* Scan through ranges of contiguous instructions */
void
Partitioner::scan_contiguous_insns(InstructionMap insns, InsnRangeCallbacks &cblist,
                                   Instruction *prev, Instruction *end)
{
    while (!insns.empty()) {
        Instruction *first = insns.begin()->second;
        rose_addr_t va = first->get_address();
        InstructionMap::iterator ii = insns.find(va);
        InstructionVector contig;
        while (ii!=insns.end()) {
            contig.push_back(ii->second);
            va += ii->second->get_size();
            ii = insns.find(va);
        }
        cblist.apply(true, InsnRangeCallback::Args(this, prev, first, end, contig.size()));
        for (size_t i=0; i<contig.size(); i++)
            insns.erase(contig[i]->get_address());
    }
}

/* Scan through all unassigned instructions */
void
Partitioner::scan_unassigned_insns(InsnRangeCallbacks &cblist)
{
    if (cblist.empty())
        return;

    /* We can't iterate over the instruction list while invoking callbacks because one of them might change the instruction
     * list. Therefore, iterate over a copy of the list. Instructions are never deleted by the partitioner (only added), so
     * this is safe to do. */
    InstructionMap all = this->insns;
    InstructionMap range;
    Instruction *prev = NULL;
    for (InstructionMap::iterator ai=all.begin(); ai!=all.end(); ++ai) {
        BasicBlock *bb = find_bb_containing(ai->first, false);
        Function *func = bb ? bb->function : NULL;
        if (func) {
            if (!range.empty()) {
                scan_contiguous_insns(range, cblist, prev, ai->second);
                range.clear();
            }
            prev = ai->second;
        } else {
            range.insert(*ai);
        }
    }
    if (!range.empty())
        scan_contiguous_insns(range, cblist, prev, NULL);
}

/* Similar to scan_unassigned_insns except only invokes callbacks when the "prev" and "end" instructions belong to different
 * functions or one of them doesn't exist. */
void
Partitioner::scan_interfunc_insns(InsnRangeCallbacks &cblist)
{
    if (cblist.empty())
        return;

    struct Filter: public InsnRangeCallback {
        virtual bool operator()(bool enabled, const Args &args) {
            if (enabled) {
                if (!args.insn_prev || !args.insn_end)
                    return true;
                BasicBlock *bb_lt = args.partitioner->find_bb_containing(args.insn_prev->get_address(), false);
                BasicBlock *bb_rt = args.partitioner->find_bb_containing(args.insn_end->get_address(), false);
                assert(bb_lt && bb_lt->function); // because we're invoked from scan_unassigned_insns
                assert(bb_rt && bb_rt->function); // ditto
                enabled = bb_lt->function != bb_rt->function;
            }
            return enabled;
        }
    } filter;
    InsnRangeCallbacks cblist2 = cblist;
    cblist2.prepend(&filter);
    scan_unassigned_insns(cblist2);
}

/* Similar to scan_unassigned_insns except only invokes callbacks when "prev" and "end" instructions belong to the same
 * function. */
void
Partitioner::scan_intrafunc_insns(InsnRangeCallbacks &cblist)
{
    if (cblist.empty())
        return;

    struct Filter: public InsnRangeCallback {
        virtual bool operator()(bool enabled, const Args &args) {
            if (enabled) {
                if (!args.insn_prev || !args.insn_end)
                    return false;
                BasicBlock *bb_lt = args.partitioner->find_bb_containing(args.insn_prev->get_address(), false);
                BasicBlock *bb_rt = args.partitioner->find_bb_containing(args.insn_end->get_address(), false);
                assert(bb_lt && bb_lt->function); // because we're invoked from scan_unassigned_insns
                assert(bb_rt && bb_rt->function); // ditto
                enabled = bb_lt->function == bb_rt->function;
            }
            return enabled;
        }
    } filter;
    InsnRangeCallbacks cblist2 = cblist;
    cblist2.prepend(&filter);
    scan_unassigned_insns(cblist2);
}
1737 
1738 void
1739 Partitioner::scan_unassigned_bytes(ByteRangeCallbacks &cblist, MemoryMap *restrict_map)
1740 {
1741  if (cblist.empty())
1742  return;
1743 
1744  /* Get range map for addresses assigned to functions (function instructions and data w/ function pointers). */
1745  FunctionRangeMap assigned;
1746  function_extent(&assigned);
1747 
1748  /* Unassigned ranges are the inverse of everything assigned. Then further restrict the unassigned range map according to
1749  * the supplied memory map. */
1750  ExtentMap unassigned = assigned.invert<ExtentMap>();
1751  if (restrict_map)
1752  unassigned.erase_ranges(restrict_map->va_extents().invert<ExtentMap>());
1753 
1754  /* Traverse the unassigned map, invoking the callbacks for each range. */
1755  for (ExtentMap::iterator ri=unassigned.begin(); ri!=unassigned.end(); ++ri)
1756  cblist.apply(true, ByteRangeCallback::Args(this, restrict_map, assigned, ri->first));
1757 }
1758 
1759 void
1760 Partitioner::scan_intrafunc_bytes(ByteRangeCallbacks &cblist, MemoryMap *restrict_map)
1761 {
1762  if (cblist.empty())
1763  return;
1764 
1765  struct Filter: public ByteRangeCallback {
1766  virtual bool operator()(bool enabled, const Args &args) {
1767  if (enabled) {
1768  if (args.range.first()<=Extent::minimum() || args.range.last()>=Extent::maximum())
1769  return false; // nothing before and/or after
1770 
1771  /* Find the closest function before this range. */
1772  FunctionRangeMap::const_iterator prev = args.ranges.find_prior(args.range.first()-1);
1773  if (prev==args.ranges.end())
1774  return false; // nothing before this range
1775 
1776  /* Find the closest function above this range. */
1777  FunctionRangeMap::const_iterator next = args.ranges.lower_bound(args.range.last()+1);
1778  if (next==args.ranges.end())
1779  return false; // nothing after this range
1780 
1781  /* Continue only if this range is sandwiched between two extents of the same function. */
1782  enabled = prev->second.get()==next->second.get();
1783  }
1784  return enabled;
1785  }
1786  } filter;
1787  ByteRangeCallbacks cblist2 = cblist;
1788  cblist2.prepend(&filter);
1789  scan_unassigned_bytes(cblist2, restrict_map);
1790 }
1791 
1792 void
1793 Partitioner::scan_interfunc_bytes(ByteRangeCallbacks &cblist, MemoryMap *restrict_map)
1794 {
1795  if (cblist.empty())
1796  return;
1797 
1798  struct Filter: public ByteRangeCallback {
1799  virtual bool operator()(bool enabled, const Args &args) {
1800  if (enabled) {
1801  if (args.range.first()<=Extent::minimum() || args.range.last()>=Extent::maximum())
1802  return true; // nothing before and/or after
1803 
1804  /* Find the closest function before this range. */
1805  FunctionRangeMap::const_iterator prev = args.ranges.find_prior(args.range.first()-1);
1806  if (prev==args.ranges.end())
1807  return true; // nothing before this range
1808 
1809  /* Find the closest function above this range. */
1810  FunctionRangeMap::const_iterator next = args.ranges.lower_bound(args.range.last()+1);
1811  if (next==args.ranges.end())
1812  return true; // nothing after this range
1813 
1814  /* Continue only if this range is between two different functions. */
1815  enabled = prev->second.get()!=next->second.get();
1816  }
1817  return enabled;
1818  }
1819  } filter;
1820  ByteRangeCallbacks cblist2 = cblist;
1821  cblist2.prepend(&filter);
1822  scan_unassigned_bytes(cblist2, restrict_map);
1823 }
1824 
1825 bool
1826 Partitioner::FindDataPadding::operator()(bool enabled, const Args &args)
1827 {
1828  if (!enabled)
1829  return false;
1830  Partitioner *p = args.partitioner;
1831  Extent range = args.range;
1832 
1833  /* What is the maximum pattern length in bytes? */
1834  if (patterns.empty())
1835  return true;
1836  size_t max_psize = patterns[0].size();
1837  for (size_t pi=1; pi<patterns.size(); ++pi)
1838  max_psize = std::max(max_psize, patterns[pi].size());
1839 
1840  /* What is the previous function? The one to which padding is to be attached. */
1841  if (range.first()<=Extent::minimum())
1842  return true;
1843  FunctionRangeMap::const_iterator prev = begins_contiguously ?
1844  args.ranges.find(range.first()-1) :
1845  args.ranges.find_prior(range.first()-1);
1846  if (prev==args.ranges.end())
1847  return true;
1848  Function *func = prev->second.get();
1849  assert(func!=NULL);
1850 
1851  /* Do we need to be contiguous with a following function? This only checks whether the incoming range ends at the
1852  * beginning of a function. We'll check below whether a padding sequence also ends at the end of this range. */
1853  if (ends_contiguously) {
1854  if (max_psize*maximum_nrep < range.size())
1855  return true;
1856  if (range.last()>=Extent::maximum())
1857  return true;
1858  FunctionRangeMap::const_iterator next = args.ranges.find(range.last()+1);
1859  if (next==args.ranges.end())
1860  return true;
1861  }
1862 
1863  /* To keep things simple, we read the entire range (which might not be contiguous in the MemoryMap) into a contiguous
1864  * buffer. However, that means we had better not try to read huge, anonymous ranges. Also handle the case of a short
1865  * read, although this shouldn't happen if the caller supplied the correct memory map to the scan_*_bytes() method. */
1866  if (range.size() > maximum_range_size)
1867  return true;
1868  MemoryMap *map = args.restrict_map ? args.restrict_map : &p->ro_map;
1869  SgUnsignedCharList buf = map->read(range.first(), range.size());
1870  if (ends_contiguously && buf.size()<range.size())
1871  return true;
1872  range.resize(buf.size());
1873 
1874  /* There might be more than one sequence of padding bytes. Look for all of them, but constrained according to
1875  * begins_contiguously and ends_contiguously. */
1876  size_t nblocks = 0; // number of blocks added to function
1877  while (!range.empty()) {
1878  if (begins_contiguously && range.first()>args.range.first())
1879  return true;
1880  for (size_t pi=0; pi<patterns.size(); ++pi) {
1881  size_t psize = patterns[pi].size();
1882  assert(psize>0);
1883  size_t nrep = 0;
1884  for (size_t offset=range.first()-args.range.first();
1885  offset+psize<=buf.size() && nrep<maximum_nrep;
1886  offset+=psize, ++nrep) {
1887  if (memcmp(&buf[offset], &patterns[pi][0], psize))
1888  break;
1889  }
1890  if (nrep>0 && nrep>=minimum_nrep && (!ends_contiguously || nrep*psize==range.size())) {
1891  /* Found a matching repeated pattern. Add data block to function. */
1892  DataBlock *dblock = p->find_db_starting(range.first(), nrep*psize);
1893  assert(dblock!=NULL);
1894  p->append(func, dblock, SgAsmBlock::BLK_PADDING);
1895  ++nblocks;
1896  ++nfound;
1897  if (p->debug) {
1898  if (1==nblocks)
1899  fprintf(p->debug, "Partitioner::FindDataPadding: for F%08"PRIx64": added", func->entry_va);
1900  fprintf(p->debug, " D%08"PRIx64, range.first());
1901  }
1902  range.first(range.first()+nrep*psize-1); // will be incremented after break
1903  break;
1904  }
1905  }
1906  if (!range.empty())
1907  range.first(range.first()+1);
1908  }
1909  if (p->debug && nblocks>0)
1910  fprintf(p->debug, "\n");
1911 
1912  return true;
1913 }
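
/* Illustrative sketch, not part of the original source: a typical use of FindDataPadding is to search the gaps between
 * functions for runs of 0x90 (x86 NOP) or 0xCC (x86 INT3) filler bytes. This file doesn't show how the members are
 * initialized, so the direct assignments below (and SgUnsignedCharList behaving like std::vector<unsigned char>) are
 * assumptions:
 *
 *     FindDataPadding pad;
 *     pad.patterns.push_back(SgUnsignedCharList(1, 0x90));   // one-byte NOP pattern
 *     pad.patterns.push_back(SgUnsignedCharList(1, 0xcc));   // one-byte INT3 pattern
 *     pad.minimum_nrep = 2;                                  // require at least two repetitions
 *     pad.begins_contiguously = true;                        // padding must start right after a function
 *     pad.ends_contiguously = true;                          // ...and run up to the next one
 *
 *     ByteRangeCallbacks cblist;
 *     cblist.prepend(&pad);
 *     partitioner->scan_interfunc_bytes(cblist, NULL);       // NULL: fall back to the partitioner's ro_map
 */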
1914 
1915 bool
1916 Partitioner::FindData::operator()(bool enabled, const Args &args)
1917 {
1918  if (!enabled)
1919  return false;
1920  Partitioner *p = args.partitioner;
1921 
1922  /* We must have a function immediately before this range, one to which this range's data can be attached. */
1923  if (args.range.first()<=Extent::minimum())
1924  return true;
1925  FunctionRangeMap::const_iterator prev = args.ranges.find(args.range.first()-1);
1926  if (prev==args.ranges.end())
1927  return true;
1928  Function *func = prev->second.get();
1929  assert(func!=NULL);
1930 
1931  /* Don't append data to non-functions. */
1932  if (0!=(func->reason & excluded_reasons))
1933  return true;
1934 
1935  /* Padding ranges are computed once and cached. */
1936  if (NULL==padding_ranges) {
1937  padding_ranges = new DataRangeMap;
1938  p->padding_extent(padding_ranges);
1939  }
1940 
1941  /* Don't append data if the previous thing is padding. */
1942  if (padding_ranges->find(args.range.first()-1)!=padding_ranges->end())
1943  return true;
1944 
1945  /* Create a data block and add it to the previous function. */
1946  DataBlock *dblock = p->find_db_starting(args.range.first(), args.range.size());
1947  assert(dblock!=NULL);
1948  p->append(func, dblock, SgAsmBlock::BLK_FINDDATA);
1949  ++nfound;
1950  if (p->debug)
1951  fprintf(p->debug, "Partitioner::FindData: for F%08"PRIx64": added D%08"PRIx64"\n",
1952  func->entry_va, args.range.first());
1953  return true;
1954 }
1955 
1956 /* Create functions or data for inter-function padding instruction sequences. Returns true if other padding callbacks
1957  * should also run; returns false if the padding found here consumed the entire instruction range and the others should be skipped. */
1958 bool
1959 Partitioner::FindInsnPadding::operator()(bool enabled, const Args &args)
1960 {
1961  if (!enabled)
1962  return false;
1963  if (!args.insn_prev)
1964  return true;
1965  assert(args.ninsns>0);
1966  assert(args.insn_prev!=NULL);
1967  assert(args.insn_begin!=NULL);
1968 
1969  if (begins_contiguously &&
1970  args.insn_begin->get_address()!=args.insn_prev->get_address()+args.insn_prev->get_size())
1971  return true;
1972 
1973  /* The preceding function. We'll add the padding as data to this function, unless we're creating explicit padding
1974  * functions. */
1975  Partitioner *p = args.partitioner;
1976  Function *prev_func = NULL;
1977  {
1978  BasicBlock *last_block = p->find_bb_containing(args.insn_prev->get_address(), false);
1979  assert(last_block!=NULL);
1980  prev_func = last_block->function;
1981  }
1982 
1983  /* Loop over the inter-function instructions and accumulate contiguous ranges of padding. */
1984  bool retval = true;
1985  InstructionVector padding;
1986  rose_addr_t va = args.insn_begin->get_address();
1987  Instruction *insn = p->find_instruction(va);
1988  for (size_t i=0; i<args.ninsns && insn!=NULL; i++) {
1989 
1990  /* Does this instruction match? */
1991  bool matches = false;
1992  assert(insn!=NULL); // callback is being invoked over instructions
1993  BasicBlock *bb = p->find_bb_containing(va, false);
1994  if (bb && bb->function)
1995  break; // insn is already assigned to a function
1996 
1997  SgAsmx86Instruction *insn_x86 = isSgAsmx86Instruction(insn);
1998  if (!matches && insn_x86) {
1999  if (x86_kinds.find(insn_x86->get_kind())!=x86_kinds.end())
2000  matches = true;
2001  }
2002 
2003  for (size_t j=0; !matches && j<byte_patterns.size(); j++) {
2004  if (byte_patterns[j]==insn->get_raw_bytes())
2005  matches = true;
2006  }
2007 
2008 
2009  /* Advance to next instruction, or null. We do this inside the loop so we only need one copy of the code that inserts
2010  * the basic blocks for padding. */
2011  va += insn->get_size();
2012  if (matches)
2013  padding.push_back(insn);
2014  insn = i+1<args.ninsns ? p->find_instruction(va) : NULL;
2015 
2016  /* Do we have padding to insert, and are we at the end of that padding? If not, then continue looping. */
2017  if ((matches && insn) || padding.empty())
2018  continue; // try to grab more padding instructions
2019  if (begins_contiguously &&
2020  padding.front()->get_address()!=args.insn_prev->get_address()+args.insn_prev->get_size())
2021  return true; // no point in even continuing the loop, since we're already past the first instruction now.
2022  if (ends_contiguously) {
2023  if (!matches) {
2024  padding.clear();
2025  continue;
2026  } else if (args.insn_end) {
2027  if (padding.back()->get_address()+padding.back()->get_size() != args.insn_end->get_address()) {
2028  padding.clear();
2029  continue;
2030  }
2031  } else if (i+1<args.ninsns) {
2032  padding.clear();
2033  continue;
2034  }
2035  }
2036 
2037  /* Make sure we got enough bytes. We can subtract first from last since we know they're contiguous in memory. */
2038  if (padding.back()->get_address()+padding.back()->get_size() - padding.front()->get_address() < minimum_size) {
2039  padding.clear();
2040  continue;
2041  }
2042 
2043  /* If we get here, then we have padding instructions. Either create a data block to hold the padding, or a new
2044  * function to hold the padding. When creating a function, the basic blocks are added as CFG heads, which causes the
2045  * blocks to remain with the function even though they might not be reachable by the CFG starting from the function's
2046  * entry point. This is especially true for padding like x86 INT3 instructions, which have no known CFG successors and
2047  * occupy singleton basic blocks. */
2048  ++nfound;
2049  assert(!padding.empty());
2050  if (add_as_data) {
2051  assert(prev_func!=NULL);
2052  rose_addr_t begin_va = padding.front()->get_address();
2053  rose_addr_t end_va = padding.back()->get_address() + padding.back()->get_size();
2054  assert(end_va>begin_va);
2055  size_t size = end_va - begin_va;
2056  DataBlock *dblock = p->find_db_starting(begin_va, size);
2057  assert(dblock!=NULL);
2058  p->append(prev_func, dblock, SgAsmBlock::BLK_PADDING);
2059  for (size_t i=0; i<padding.size(); i++)
2060  p->discard(padding[i]);
2061  if (p->debug)
2062  fprintf(p->debug, "Partitioner::FindInsnPadding: for F%08"PRIx64": added D%08"PRIx64"\n",
2063  prev_func->entry_va, begin_va);
2064  } else {
2065  Function *new_func = p->add_function(padding.front()->get_address(), SgAsmFunction::FUNC_PADDING);
2066  p->find_bb_starting(padding.front()->get_address()); // split first block if necessary
2067  p->find_bb_starting(va); // split last block if necessary
2068  if (p->debug)
2069  fprintf(p->debug, "Partitioner::FindInsnPadding: for F%08"PRIx64": added", new_func->entry_va);
2070  for (size_t i=0; i<padding.size(); i++) {
2071  BasicBlock *bb = p->find_bb_containing(padding[i]->get_address());
2072  if (bb && !bb->function) {
2073  p->append(new_func, bb, SgAsmBlock::BLK_PADDING, true/*head of CFG subgraph*/);
2074  if (p->debug)
2075  fprintf(p->debug, " B%08"PRIx64"#%zu", bb->address(), bb->insns.size());
2076  }
2077  }
2078  if (p->debug)
2079  fprintf(p->debug, "\n");
2080  }
2081 
2082  retval = padding.size()!=args.ninsns; // allow other callbacks to run only if we didn't suck up all the instructions
2083  padding.clear();
2084  }
2085  return retval;
2086 }
2087 
2093 Partitioner::DataBlock *
2094 Partitioner::find_db_starting(rose_addr_t start_va, size_t size/*=0*/)
2095 {
2096  DataBlock *db = NULL;
2097  DataBlocks::iterator dbi = data_blocks.find(start_va);
2098  if (dbi!=data_blocks.end()) {
2099  db = dbi->second;
2100  if (0==size)
2101  return db; /* caller doesn't care about the size, only whether the block is present. */
2102 
2103  /* Check whether the block contains all the addresses we want. They might not all be in the first node. */
2104  DataRangeMap want; want.insert(Extent(start_va, size));
2105  DataRangeMap have; datablock_extent(db, &have);
2106  want.erase_ranges(have);
2107  if (want.empty())
2108  return db;
2109  }
2110  if (0==size)
2111  return NULL;
2112 
2113  /* Create a new SgAsmStaticData node to represent the entire address range and add it to the (possibly new) data block.
2114  * When adding to an existing block, the new SgAsmStaticData node will overlap with at least the initial node, and possibly
2115  * others. */
2116  if (!db) {
2117  db = new DataBlock;
2118  data_blocks[start_va] = db;
2119  }
2120 
2121  SgUnsignedCharList raw_bytes = map->read(start_va, size, MemoryMap::MM_PROT_NONE);
2122  assert(raw_bytes.size()==size);
2123  SgAsmStaticData *datum = new SgAsmStaticData;
2124  datum->set_address(start_va);
2125  datum->set_raw_bytes(raw_bytes);
2126  db->nodes.push_back(datum);
2127 
2128  return db;
2129 }
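
/* Illustrative sketch, not part of the original source: the callbacks in this file use find_db_starting() as a
 * get-or-create operation and then hand the block to a function, e.g.:
 *
 *     DataBlock *db = partitioner->find_db_starting(va, nbytes);   // nbytes>0 forces creation if absent
 *     assert(db!=NULL);
 *     partitioner->append(func, db, SgAsmBlock::BLK_FINDDATA);
 */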
2130 
2131 bool
2132 Partitioner::FindFunctionFragments::operator()(bool enabled, const Args &args)
2133 {
2134  if (!enabled)
2135  return false;
2136  Partitioner *p = args.partitioner;
2137 
2138  /* Compute and cache the extents of all known functions. */
2139  if (!function_extents) {
2140  function_extents = new FunctionRangeMap;
2141  p->function_extent(function_extents);
2142  }
2143 
2144  /* Compute and cache code criteria. */
2145  if (!code_criteria) {
2146  RegionStats *mean = p->aggregate_statistics();
2147  RegionStats *variance = p->get_aggregate_variance();
2148  code_criteria = p->new_code_criteria(mean, variance, threshold);
2149  }
2150 
2151  /* This range must begin contiguously with a valid function. */
2152  if (args.range.first()<=Extent::minimum())
2153  return true;
2154  FunctionRangeMap::iterator prev = function_extents->find(args.range.first()-1);
2155  if (prev==function_extents->end())
2156  return true;
2157  Function *func = prev->second.get();
2158  if (0!=(func->reason & excluded_reasons))
2159  return true;
2160 
2161  /* Should this range end contiguously with the same function? Perhaps we should relax this and only require that the
2162  * preceding function has an address after this range? [RPM 2011-11-25] */
2163  if (require_intrafunction) {
2164  if (args.range.last()>=Extent::maximum())
2165  return true;
2166  FunctionRangeMap::iterator next = function_extents->find(args.range.last()+1);
2167  if (next==function_extents->end() || next->second.get()!=func)
2168  return true;
2169  }
2170 
2171  /* If the preceding function is interleaved with another then how can we know that the instructions in question should
2172  * actually belong to this function? If we're interleaved with one other function, then we could very easily be
2173  * interleaved with additional functions and this address region could belong to any of them. */
2174  if (require_noninterleaved && !p->is_contiguous(func, false))
2175  return true;
2176 
2177  /* Bail unless the region statistically looks like code. */
2178  ExtentMap pending;
2179  pending.insert(args.range);
2180  RegionStats *stats = p->region_statistics(pending);
2181  double raw_vote;
2182  if (!code_criteria->satisfied_by(stats, &raw_vote))
2183  return true;
2184 
2185  /* Get the list of basic blocks for the instructions in this range and their address extents. Bail if a basic block
2186  * extends beyond the address ranges we're considering, rather than splitting the block. Also bail if we encounter two or
2187  * more instructions that overlap since that's a pretty strong indication that this isn't code. */
2188  std::set<BasicBlock*> bblocks;
2189  while (!pending.empty()) {
2190  rose_addr_t va = pending.min();
2191  BasicBlock *bb = va==args.range.first() ? p->find_bb_starting(va) : p->find_bb_containing(va);
2192  if (!bb || bb->function)
2193  return true;
2194  if (bblocks.find(bb)==bblocks.end()) {
2195  bblocks.insert(bb);
2196  for (InstructionVector::iterator ii=bb->insns.begin(); ii!=bb->insns.end(); ++ii) {
2197  Extent ie((*ii)->get_address(), (*ii)->get_size());
2198  if (!pending.contains(ie))
2199  return true;
2200  pending.erase(ie);
2201  }
2202  }
2203  }
2204 
2205  /* All looks good. Add the basic blocks to the preceding function. */
2206  for (std::set<BasicBlock*>::iterator bi=bblocks.begin(); bi!=bblocks.end(); ++bi) {
2207  (*bi)->code_likelihood = raw_vote;
2208  p->append(func, *bi, SgAsmBlock::BLK_FRAGMENT, true/*CFG head*/);
2209  ++nfound;
2210  }
2211 
2212  return true;
2213 }
2214 
2215 /* Create functions for any basic blocks that consist of only a JMP to another function. */
2216 bool
2217 Partitioner::FindThunks::operator()(bool enabled, const Args &args)
2218 {
2219  if (!enabled)
2220  return false;
2221 
2222  Partitioner *p = args.partitioner;
2223  rose_addr_t va = args.insn_begin->get_address();
2224  rose_addr_t next_va = 0;
2225  for (size_t i=0; i<args.ninsns; i++, va=next_va) {
2226  Instruction *insn = p->find_instruction(va);
2227  assert(insn);
2228  next_va = va + insn->get_size();
2229 
2230  /* Instruction must be an x86 JMP */
2231  SgAsmx86Instruction *insn_x86 = isSgAsmx86Instruction(insn);
2232  if (!insn_x86 || (insn_x86->get_kind()!=x86_jmp && insn_x86->get_kind()!=x86_farjmp))
2233  continue;
2234 
2235  /* Instruction must not be in the middle of an existing basic block. */
2236  BasicBlock *bb = p->find_bb_containing(va, false);
2237  if (bb && bb->address()!=va)
2238  continue;
2239 
2240  if (validate_targets) {
2241  /* Instruction must have a single successor */
2242  bool complete;
2243  Disassembler::AddressSet succs = insn->get_successors(&complete);
2244  if (!complete || 1!=succs.size())
2245  continue;
2246  rose_addr_t target_va = *succs.begin();
2247 
2248  /* The target (single successor) must be a known function which is not padding. */
2249  Functions::iterator fi = p->functions.find(target_va);
2250  if (fi==p->functions.end() || 0!=(fi->second->reason & SgAsmFunction::FUNC_PADDING))
2251  continue;
2252  }
2253 
2254  /* Create the basic block for the JMP instruction. This block must be a single instruction, which it should be since
2255  * we already checked that its only successor is another function. */
2256  if (!bb)
2257  bb = p->find_bb_starting(va);
2258  assert(bb!=NULL);
2259  assert(1==bb->insns.size());
2260  Function *thunk = p->add_function(va, SgAsmFunction::FUNC_THUNK);
2261  p->append(thunk, bb, SgAsmBlock::BLK_ENTRY_POINT);
2262  ++nfound;
2263 
2264  if (p->debug)
2265  fprintf(p->debug, "Partitioner::FindThunks: found F%08"PRIx64"\n", va);
2266  }
2267 
2268  return true;
2269 }
2270 
2271 bool
2272 Partitioner::FindInterPadFunctions::operator()(bool enabled, const Args &args)
2273 {
2274  if (!enabled)
2275  return false;
2276  Partitioner *p = args.partitioner;
2277 
2278  /* Compute the padding ranges once and cache them in this object. */
2279  if (!padding_ranges) {
2280  padding_ranges = new DataRangeMap;
2281  p->padding_extent(padding_ranges);
2282  }
2283 
2284  /* Range must be immediately preceded by a padding block. */
2285  if (args.range.first()<=Extent::minimum())
2286  return true;
2287  if (padding_ranges->find(args.range.first()-1) == padding_ranges->end())
2288  return true;
2289 
2290  /* Range must be immediately followed by a padding block. */
2291  if (args.range.last()>=Extent::maximum())
2292  return true;
2293  DataRangeMap::iterator next = padding_ranges->find(args.range.last()+1);
2294  if (next==padding_ranges->end())
2295  return true;
2296  DataBlock *next_dblock = next->second.get();
2297 
2298  /* Create a new function and move the following padding to the new function. */
2299  Function *new_func = p->add_function(args.range.first(), SgAsmFunction::FUNC_INTERPADFUNC);
2300  p->append(new_func, next_dblock, SgAsmBlock::BLK_PADDING, true/*force*/);
2301  ++nfound;
2302 
2303  if (p->debug)
2304  fprintf(p->debug, "Partitioner::FindInterPadFunctions: added F%08"PRIx64"\n", new_func->entry_va);
2305  return true;
2306 }
2307 
2308 bool
2309 Partitioner::FindThunkTables::operator()(bool enabled, const Args &args)
2310 {
2311  if (!enabled)
2312  return false;
2313  Partitioner *p = args.partitioner;
2314  Extent range = args.range;
2315 
2316  while (!range.empty()) {
2317 
2318  /* Find a single, contiguous thunk table. */
2319  InstructionMap thunks; // each thunk is a single JMP instruction
2320  bool in_table = false;
2321  rose_addr_t va;
2322  for (va=range.first(); va<=range.last() && (in_table || thunks.empty()); va++) {
2323  if (begins_contiguously && !in_table && va>args.range.first())
2324  return true;
2325 
2326  /* Must be a JMP instruction. */
2327  Instruction *insn = p->find_instruction(va);
2328  SgAsmx86Instruction *insn_x86 = isSgAsmx86Instruction(insn);
2329  if (!insn_x86 || (insn_x86->get_kind()!=x86_jmp && insn_x86->get_kind()!=x86_farjmp)) {
2330  in_table = false;
2331  continue;
2332  }
2333 
2334  /* Instruction must not be part of a larger basic block. Be careful not to create basic blocks unnecessarily. */
2335  BasicBlock *bb = p->find_bb_containing(va, false);
2336  if (bb && bb->insns.size()>1) {
2337  in_table = false;
2338  continue;
2339  }
2340  if (!bb)
2341  bb = p->find_bb_starting(va);
2342  assert(bb!=NULL);
2343 
2344  if (validate_targets) {
2345  /* Find successors of the JMP instruction. */
2346  Disassembler::AddressSet succs;
2347  bool complete;
2348  if (bb->insns.size()>1) {
2349  succs.insert(bb->insns[1]->get_address());
2350  complete = true;
2351  } else {
2352  succs = p->successors(bb, &complete);
2353  }
2354 
2355  /* If the successor is known, it should point to another instruction. */
2356  bool points_to_insn = true;
2357  for (Disassembler::AddressSet::iterator si=succs.begin(); si!=succs.end() && points_to_insn; ++si)
2358  points_to_insn = NULL != p->find_instruction(*si);
2359  if (!points_to_insn) {
2360  in_table = false;
2361  continue;
2362  }
2363  }
2364 
2365  /* This is a thunk. Save it and skip ahead to the start of the following instruction. */
2366  in_table = true;
2367  thunks[insn->get_address()] = insn;
2368  va += insn->get_size() - 1; // also incremented by loop
2369  }
2370 
2371  /* This is only a thunk table if we found enough thunks and, when required, the table ends at the end of the range. */
2372  if (thunks.size()>=minimum_nthunks && (!ends_contiguously || va==args.range.last()+1)) {
2373  for (InstructionMap::iterator ii=thunks.begin(); ii!=thunks.end(); ++ii) {
2374  Function *thunk = p->add_function(ii->first, SgAsmFunction::FUNC_THUNK);
2375  BasicBlock *bb = p->find_bb_starting(ii->first);
2376  p->append(thunk, bb, SgAsmBlock::BLK_ENTRY_POINT);
2377  ++nfound;
2378  if (p->debug)
2379  fprintf(p->debug, "Partitioner::FindThunkTable: thunk F%08"PRIx64"\n", thunk->entry_va);
2380  }
2381  }
2382 
2383  range.first(va);
2384  }
2385  return true;
2386 }
2387 
2395 bool
2396 Partitioner::is_thunk(Function *func)
2397 {
2398  assert(func);
2399  if (1!=func->basic_blocks.size())
2400  return false;
2401 
2402  BasicBlock *bb = func->basic_blocks.begin()->second;
2403  if (1!=bb->insns.size())
2404  return false;
2405 
2406  Instruction *insn = bb->insns.front();
2407  SgAsmx86Instruction *insn_x86 = isSgAsmx86Instruction(insn);
2408  if (!insn_x86 || (insn_x86->get_kind()!=x86_jmp && insn_x86->get_kind()!=x86_farjmp))
2409  return false;
2410 
2411  bool complete;
2412  Disassembler::AddressSet succs = successors(bb, &complete);
2413  if (!complete || 1!=succs.size())
2414  return false;
2415 
2416  rose_addr_t target_va = *succs.begin();
2417  Functions::iterator fi = functions.find(target_va);
2418  Function *target_func = fi==functions.end() ? NULL : fi->second;
2419  if (!target_func)
2420  return false;
2421 
2422  if (0!=(target_func->reason & SgAsmFunction::FUNC_LEFTOVERS) ||
2423  0!=(target_func->reason & SgAsmFunction::FUNC_PADDING))
2424  return false;
2425 
2426  return true;
2427 }
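
/* Illustrative example, not part of the original source: the only shape accepted above is a function consisting of a
 * single one-instruction basic block whose sole, fully-known successor is the entry point of another known function
 * that is neither padding nor leftovers:
 *
 *     thunk_A:                  ; one basic block, one instruction
 *         jmp function_B        ; exactly one known successor: another function's entry point
 */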
2428 
2429 bool
2430 Partitioner::FindPostFunctionInsns::operator()(bool enabled, const Args &args)
2431 {
2432  if (!enabled)
2433  return false;
2434  if (!args.insn_prev || args.insn_begin->get_address()!=args.insn_prev->get_address()+args.insn_prev->get_size())
2435  return true;
2436  Partitioner *p = args.partitioner;
2437  BasicBlock *bb = p->find_bb_containing(args.insn_prev->get_address(), false);
2438  if (!bb || !bb->function)
2439  return true;
2440  Function *func = bb->function;
2441  if (0!=(func->reason & SgAsmFunction::FUNC_PADDING) ||
2442  0!=(func->reason & SgAsmFunction::FUNC_THUNK))
2443  return true; // don't append instructions to certain "functions"
2444 
2445  size_t nadded = 0;
2446  rose_addr_t va = args.insn_begin->get_address();
2447  for (size_t i=0; i<args.ninsns; i++) {
2448  bb = p->find_bb_containing(va);
2449  assert(bb!=NULL); // because we know va is an instruction
2450  if (!bb->function) {
2451  if (p->debug) {
2452  if (0==nadded)
2453  fprintf(p->debug, "Partitioner::PostFunctionBlocks: for F%08"PRIx64": added", func->entry_va);
2454  fprintf(p->debug, " B%08"PRIx64"#%zu", bb->address(), bb->insns.size());
2455  }
2456  p->append(func, bb, SgAsmBlock::BLK_POSTFUNC, true/*head of CFG subgraph*/);
2457  func->pending = true;
2458  ++nadded;
2459  ++nfound;
2460  }
2461 
2462  Instruction *insn = p->find_instruction(va);
2463  assert(insn!=NULL);
2464  va += insn->get_size();
2465  }
2466  if (p->debug && nadded)
2467  fprintf(p->debug, "\n");
2468 
2469  return true;
2470 }
2471 
2472 /* class method */
2473 rose_addr_t
2474 Partitioner::value_of(SgAsmValueExpression *e)
2475 {
2476  if (!e) {
2477  return 0;
2478  } else if (isSgAsmIntegerValueExpression(e)) {
2479  return isSgAsmIntegerValueExpression(e)->get_absolute_value();
2480  } else {
2481  return 0;
2482  }
2483 }
2484 
2485 /* class method */
2486 rose_addr_t
2487 Partitioner::get_indirection_addr(SgAsmInstruction *g_insn, rose_addr_t offset)
2488 {
2489  rose_addr_t retval = 0;
2490 
2491  SgAsmx86Instruction *insn = isSgAsmx86Instruction(g_insn);
2492  if (!insn ||
2493  !x86InstructionIsUnconditionalBranch(insn) ||
2494  1!=insn->get_operandList()->get_operands().size())
2495  return retval;
2496 
2497  SgAsmMemoryReferenceExpression *mref = isSgAsmMemoryReferenceExpression(insn->get_operandList()->get_operands()[0]);
2498  if (!mref)
2499  return retval;
2500 
2501  SgAsmExpression *mref_addr = mref->get_address();
2502  if (isSgAsmBinaryExpression(mref_addr)) {
2503  SgAsmBinaryExpression *mref_bin = isSgAsmBinaryExpression(mref_addr);
2504  SgAsmx86RegisterReferenceExpression *reg = isSgAsmx86RegisterReferenceExpression(mref_bin->get_lhs());
2505  SgAsmValueExpression *val = isSgAsmValueExpression(mref_bin->get_rhs());
2506  if (reg!=NULL && val!=NULL) {
2507  if (reg->get_descriptor().get_major()==x86_regclass_ip) {
2508  retval = value_of(val) + insn->get_address() + insn->get_size();
2509  } else if (reg->get_descriptor().get_major()==x86_regclass_gpr) {
2510  retval = value_of(val) + offset;
2511  }
2512  }
2513  } else if (isSgAsmValueExpression(mref_addr)) {
2514  retval = value_of(isSgAsmValueExpression(mref_addr));
2515  }
2516 
2517  return retval; /*calculated value, or defaults to zero*/
2518 }
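
/* Worked example, not part of the original source: for a PC-relative indirect jump such as
 *     401000: ff 25 04 20 00 00    jmp QWORD PTR [rip+0x2004]
 * the register is the instruction pointer, so the result is 0x2004 + 0x401000 (address) + 6 (size) = 0x40300a.
 * For a base-register form such as "jmp DWORD PTR [ebx+0x10]" the result is 0x10 plus the caller-supplied offset
 * (e.g., the .got.plt base passed in by mark_elf_plt_entries() below), and a plain absolute operand such as
 * "jmp DWORD PTR ds:0x804a00c" yields 0x804a00c directly. */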
2519 
2524 void
2525 Partitioner::mark_elf_plt_entries(SgAsmGenericHeader *fhdr)
2526 {
2527  /* This function is ELF, x86 specific. [FIXME RPM 2009-02-06] */
2528  SgAsmElfFileHeader *elf = isSgAsmElfFileHeader(fhdr);
2529  if (!elf) return;
2530 
2531  /* Find important sections */
2532  SgAsmGenericSection *plt = elf->get_section_by_name(".plt");
2533  if (!plt || !plt->is_mapped()) return;
2534  SgAsmGenericSection *gotplt = elf->get_section_by_name(".got.plt");
2535  if (!gotplt || !gotplt->is_mapped()) return;
2536 
2537  /* Find all relocation sections */
2538  std::set<SgAsmElfRelocSection*> rsects;
2539  for (SgAsmGenericSectionPtrList::iterator si=elf->get_sections()->get_sections().begin();
2540  si!=elf->get_sections()->get_sections().end();
2541  si++) {
2542  SgAsmElfRelocSection *reloc_section = isSgAsmElfRelocSection(*si);
2543  if (reloc_section)
2544  rsects.insert(reloc_section);
2545  }
2546  if (rsects.empty()) return;
2547 
2548  /* Process each .plt trampoline */
2549  for (Functions::iterator fi=functions.begin(); fi!=functions.end(); fi++) {
2550  rose_addr_t func_addr = fi->first;
2551 
2552  if (fi->second->name!="")
2553  continue; /* function already has a name */
2554 
2555  if (func_addr < elf->get_base_va() + plt->get_mapped_preferred_rva() ||
2556  func_addr >= elf->get_base_va() + plt->get_mapped_preferred_rva() + plt->get_mapped_size())
2557  continue; /* function is not in the .plt section */
2558 
2559  /* Sometimes the first instruction of a basic block cannot be disassembled and the basic block will have a different
2560  * starting address than its first instruction. If that basic block is also the start of a function then the
2561  * function also will have no initial instruction. */
2562  Instruction *insn = find_instruction(func_addr);
2563  if (NULL==insn)
2564  continue;
2565 
2566  /* The target in the ".plt" section will be an indirect (through the .got.plt section) jump to the actual dynamically
2567  * linked function (or to the dynamic linker itself). The .got.plt address is what we're really interested in. */
2568  SgAsmx86Instruction *insn_x86 = isSgAsmx86Instruction(insn);
2569  assert(insn_x86!=NULL);
2570  rose_addr_t gotplt_va = get_indirection_addr(insn_x86, elf->get_base_va() + gotplt->get_mapped_preferred_rva());
2571 
2572  if (gotplt_va < elf->get_base_va() + gotplt->get_mapped_preferred_rva() ||
2573  gotplt_va >= elf->get_base_va() + gotplt->get_mapped_preferred_rva() + gotplt->get_mapped_size())
2574  continue; /* PLT entry doesn't dereference a value in the .got.plt section */
2575 
2576  /* Find the relocation entry whose offset is gotplt_va and use that entry's symbol for the function name. */
2577  for (std::set<SgAsmElfRelocSection*>::iterator ri=rsects.begin(); ri!=rsects.end() && fi->second->name==""; ri++) {
2578  SgAsmElfRelocEntryList *entries = (*ri)->get_entries();
2579  SgAsmElfSymbolSection *symbol_section = isSgAsmElfSymbolSection((*ri)->get_linked_section());
2580  if (symbol_section) {
2581  SgAsmElfSymbolList *symbols = symbol_section->get_symbols();
2582  for (size_t ei=0; ei<entries->get_entries().size() && fi->second->name==""; ei++) {
2583  SgAsmElfRelocEntry *rel = entries->get_entries()[ei];
2584  if (rel->get_r_offset()==gotplt_va) {
2585  unsigned long symbol_idx = rel->get_sym();
2586  assert(symbol_idx < symbols->get_symbols().size());
2587  SgAsmElfSymbol *symbol = symbols->get_symbols()[symbol_idx];
2588  fi->second->name = symbol->get_name()->get_string() + "@plt";
2589  }
2590  }
2591  }
2592  }
2593  }
2594 }
2595 
2603 void
2604 Partitioner::name_import_entries(SgAsmGenericHeader *fhdr)
2605 {
2606  /* This function is PE x86 specific */
2607  SgAsmPEFileHeader *pe = isSgAsmPEFileHeader(fhdr);
2608  if (!pe)
2609  return;
2610 
2611  /* Build an index mapping memory addresses to function names. The addresses are the virtual address through which an
2612  * indirect jump would go when calling an imported function. */
2613  struct ImportIndexBuilder: public AstSimpleProcessing {
2614  typedef std::map<rose_addr_t, std::string> Index;
2615  typedef Index::iterator iterator;
2616  Partitioner *partitioner;
2617  Index index;
2618  SgAsmGenericHeader *fhdr;
2619  ImportIndexBuilder(Partitioner *partitioner, SgAsmGenericHeader *fhdr): partitioner(partitioner), fhdr(fhdr) {
2620  traverse(fhdr, preorder);
2621  }
2622  void visit(SgNode *node) {
2623  SgAsmPEImportItem *import = isSgAsmPEImportItem(node);
2624  if (import) {
2625  std::string name = import->get_name()->get_string();
2626  rose_addr_t va = import->get_bound_rva().get_va();
2627  if (va!=0 && !name.empty())
2628  index[va] = name;
2629  }
2630  }
2631  } imports(this, fhdr);
2632 
2633  /* Look for functions whose first instruction is an indirect jump through one of the memory addresses we indexed above. */
2634  for (Functions::iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
2635  Function *func = fi->second;
2636  if (!func->name.empty())
2637  continue;
2638  Instruction *insn = find_instruction(func->entry_va);
2639  SgAsmx86Instruction *insn_x86 = isSgAsmx86Instruction(insn);
2640  if (!insn_x86 ||
2641  (insn_x86->get_kind()!=x86_jmp && insn_x86->get_kind()!=x86_farjmp) ||
2642  1!=insn_x86->get_operandList()->get_operands().size())
2643  continue;
2644  SgAsmMemoryReferenceExpression *mre = isSgAsmMemoryReferenceExpression(insn_x86->get_operandList()->get_operands()[0]);
2645  if (!mre)
2646  continue;
2647  SgAsmValueExpression *base = isSgAsmValueExpression(mre->get_address());
2648  if (!base)
2649  continue;
2650  rose_addr_t base_va = value_of(base);
2651 
2652  ImportIndexBuilder::iterator found = imports.index.find(base_va);
2653  if (found==imports.index.end())
2654  continue;
2655  func->name = found->second + "@import";
2656  if (debug)
2657  fprintf(debug, "Partitioner::name_import_entries: F%08"PRIx64": named \"%s\"\n", func->entry_va, func->name.c_str());
2658  }
2659 }
2660 
2662 void
2663 Partitioner::find_pe_iat_extents(SgAsmGenericHeader *hdr)
2664 {
2665  SgAsmGenericSectionPtrList iat_sections = hdr->get_sections_by_name("Import Address Table");
2666  for (size_t i=0; i<iat_sections.size(); ++i) {
2667  if (-1==iat_sections[i]->get_id() && iat_sections[i]->is_mapped())
2668  pe_iat_extents.insert(Extent(iat_sections[i]->get_mapped_actual_va(), iat_sections[i]->get_mapped_size()));
2669  }
2670 }
2671 
2672 /* Seed function starts based on criteria other than control flow graph. */
2673 void
2674 Partitioner::pre_cfg(SgAsmInterpretation *interp)
2675 {
2676  if (debug) {
2677  fprintf(debug, "Function reasons referenced by Partitioner debugging output:\n%s",
2678  SgAsmFunction::reason_key(" ").c_str());
2679  }
2680 
2681  mark_ipd_configuration(); /*seed partitioner based on IPD configuration information*/
2682 
2683  if (interp) {
2684  const SgAsmGenericHeaderPtrList &headers = interp->get_headers()->get_headers();
2685  for (size_t i=0; i<headers.size(); i++) {
2686  find_pe_iat_extents(headers[i]);
2687  if (func_heuristics & SgAsmFunction::FUNC_ENTRY_POINT)
2688  mark_entry_targets(headers[i]);
2689  if (func_heuristics & SgAsmFunction::FUNC_EH_FRAME)
2690  mark_eh_frames(headers[i]);
2691  if (func_heuristics & SgAsmFunction::FUNC_SYMBOL)
2692  mark_func_symbols(headers[i]);
2693  if (func_heuristics & SgAsmFunction::FUNC_IMPORT)
2694  mark_elf_plt_entries(headers[i]);
2695  if (func_heuristics & SgAsmFunction::FUNC_EXPORT)
2696  mark_export_entries(headers[i]);
2697  }
2698  }
2699  if (func_heuristics & SgAsmFunction::FUNC_PATTERN)
2700  mark_func_patterns();
2701  if (func_heuristics & SgAsmFunction::FUNC_CALL_INSN)
2702  mark_call_insns();
2703 
2704 
2705 
2706  /* Run user-defined function detectors, making sure that the basic block starts are up-to-date for each call. */
2707  if (func_heuristics & SgAsmFunction::FUNC_USERDEF) {
2708  if (interp) {
2709  const SgAsmGenericHeaderPtrList &headers = interp->get_headers()->get_headers();
2710  for (size_t i=0; i<user_detectors.size(); i++) {
2711  for (size_t j=0; j<=headers.size(); j++) {
2712  SgAsmGenericHeader *hdr = 0==j ? NULL : headers[j-1];
2713  user_detectors[i](this, hdr);
2714  }
2715  }
2716  } else {
2717  for (size_t i=0; i<user_detectors.size(); i++) {
2718  user_detectors[i](this, NULL);
2719  }
2720  }
2721  }
2722 }
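
/* Illustrative sketch, not part of the original source: the user_detectors invoked above are callables taking a
 * Partitioner pointer and a header pointer (NULL on the header-independent call). How a detector is registered is not
 * shown in this file, so that part is omitted; the body below uses only API appearing elsewhere here, and the address
 * and name are hypothetical:
 *
 *     static void find_my_functions(Partitioner *p, SgAsmGenericHeader *hdr) {
 *         if (hdr!=NULL)
 *             return;                     // run once, independent of any particular header
 *         p->add_function(0x08048123, SgAsmFunction::FUNC_USERDEF, "my_handler");
 *     }
 */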
2723 
2741 void
2742 Partitioner::discover_first_block(Function *func)
2743 {
2744  if (debug) {
2745  fprintf(debug, "1st block %s F%08"PRIx64" \"%s\": B%08"PRIx64" ",
2746  SgAsmFunction::reason_str(true, func->reason).c_str(),
2747  func->entry_va, func->name.c_str(), func->entry_va);
2748  }
2749  BasicBlock *bb = find_bb_containing(func->entry_va);
2750 
2751  /* If this function's entry block collides with some other function, then truncate that other function's block and
2752  * subsume part of it into this function. Mark the other function as pending because its block may have new
2753  * successors now. */
2754  if (bb && func->entry_va!=bb->address()) {
2755  assert(bb->function!=func);
2756  if (debug) fprintf(debug, "[split from B%08"PRIx64, bb->address());
2757  if (bb->function) {
2758  if (debug) fprintf(debug, " in F%08"PRIx64" \"%s\"", bb->function->entry_va, bb->function->name.c_str());
2759  bb->function->pending = true;
2760  }
2761  if (debug) fprintf(debug, "] ");
2762  truncate(bb, func->entry_va);
2763  bb = find_bb_containing(func->entry_va);
2764  assert(bb!=NULL);
2765  assert(func->entry_va==bb->address());
2766  } else if (bb && bb->function!=NULL && bb->function!=func) {
2767  if (debug) fprintf(debug, "[removing B%08"PRIx64" from F%08"PRIx64"]", func->entry_va, bb->function->entry_va);
2768  bb->function->pending = true;
2769  remove(bb->function, bb);
2770  }
2771 
2772  if (bb) {
2773  append(func, bb, SgAsmBlock::BLK_ENTRY_POINT);
2774  if (debug)
2775  fprintf(debug, "#%zu ", bb->insns.size());
2776  } else if (debug) {
2777  fprintf(debug, "no instruction at function entry address ");
2778  }
2779  if (debug) {
2780  func->show_properties(debug);
2781  fputc('\n', debug);
2782  }
2783 }
2784 
2789 void
2790 Partitioner::discover_blocks(Function *f, rose_addr_t va, unsigned reason)
2791 {
2792  if (debug) fprintf(debug, " B%08"PRIx64, va);
2793  Instruction *insn = find_instruction(va);
2794  if (!insn) return; /* No instruction at this address. */
2795  rose_addr_t target_va = NO_TARGET; /*target of function call instructions (e.g., x86 CALL and FARCALL)*/
2796 
2797  /* This block might be the entry address of a function even before that function has any basic blocks assigned to it. This
2798  * can happen when a new function was discovered during the current pass. It can't happen for functions discovered in a
2799  * previous pass since we would have called discover_first_block() by now for any such functions. */
2800  Functions::iterator fi = functions.find(va);
2801  if (fi!=functions.end() && fi->second!=f) {
2802  if (debug) fprintf(debug, "[entry \"%s\"]", fi->second->name.c_str());
2803  return;
2804  }
2805 
2806  /* Find basic block at address, creating it if necessary. */
2807  BasicBlock *bb = find_bb_starting(va);
2808  assert(bb!=NULL);
2809  if (debug) fprintf(debug, "#%zu", bb->insns.size());
2810 
2811  /* If the current function has been somehow marked as pending then we might as well give up discovering its blocks because
2812  * some of its blocks' successors may have changed. This can happen, for instance, if the create_bb() called above had to
2813  * split one of this function's blocks. */
2814  if (f->pending) {
2815  if (debug) fprintf(debug, " abandon");
2816  throw AbandonFunctionDiscovery();
2817  }
2818 
2819  /* Don't reprocess blocks for this function. However, we need to reprocess the first block because it was added by
2820  * discover_first_block(), which is not recursive. Care should be taken so none of the recursive calls below are invoked
2821  * for the first block, or we'll have infinite recursion! */
2822  if (bb->function==f && bb->address()!=f->entry_va)
2823  return;
2824 
2825  if (bb->function && bb->function!=f) {
2826  if (va==bb->function->entry_va) {
2827  /* This is a call to some other existing function. Do not add it to the current function. */
2828  if (debug) fprintf(debug, "[entry \"%s\"]", bb->function->name.c_str());
2829  } else {
2830  /* This block belongs internally to some other function. Since ROSE requires that blocks be owned by exactly one
2831  * function (the function/block relationship is an edge in the abstract syntax tree), we have to remove this block
2832  * from the other function. We'll mark both the other function and this function as being in conflict and try
2833  * again later.
2834  *
2835  * However, there is a special case we need to watch out for: the case when the block in conflict is no longer
2836  * reachable from the original function due to having made changes to other blocks in the original function. For
2837  * instance, consider the following sequence of events:
2838  * F000 contains B000 (the entry block) and B010
2839  * B000 has 10 instructions, and ends with a call to F100 which returns
2840  * B010 is the fall-through address of B000
2841  * We then begin to discover F005 whose entry address is the fifth instruction of B000, so
2842  * B000 is split into B000 containing the first five instructions and B005 containing the second five
2843  * F000 is marked as pending due to the splitting of its B000 block
2844  * B005 is added to F005 as its entry block
2845  * B005 calls F100 which returns to B010, so we want to add B010 to F005
2846  * So we have a conflict:
2847  * B010 belongs to F000 because we never removed it, but we need B010 also in F005.
2848  * In this example, the only CFG edge to B010 inside F000 was the fall-through edge from the call to F100, which
2849  * no longer exists in F000. Unfortunately we have no way of knowing (short of doing a CFG analysis in F000) that
2850  * the last edge was removed. Even if we did a CFG analysis, we may be working with incomplete information (F000
2851  * might not be fully discovered yet).
2852  *
2853  * The way we handle this special case is as follows:
2854  * If the original function (F000 in the example) is marked as pending then the blocks it currently owns might
2855  * not actually belong to the function anymore. Therefore we will not create a new FUNC_GRAPH function for the
2856  * block in conflict, but rather mark both functions as pending and abandon until the next pass. Otherwise we
2857  * assume the block in conflict really is in conflict and we'll create a FUNC_GRAPH function. */
2858  if (functions.find(va)==functions.end() && !bb->function->pending) {
2859  add_function(va, SgAsmFunction::FUNC_GRAPH);
2860  if (debug) fprintf(debug, "[conflict F%08"PRIx64" \"%s\"]", bb->function->entry_va, bb->function->name.c_str());
2861  } else if (debug) {
2862  fprintf(debug, "[possible conflict F%08"PRIx64" \"%s\"]", bb->function->entry_va, bb->function->name.c_str());
2863  }
2864  bb->function->pending = f->pending = true;
2865  if (debug) fprintf(debug, " abandon");
2866  throw AbandonFunctionDiscovery();
2867  }
2868  } else if ((target_va=call_target(bb))!=NO_TARGET) {
2869  /* Call to a known target */
2870  if (debug) fprintf(debug, "[call F%08"PRIx64"]", target_va);
2871 
2872  append(f, bb, reason);
2873  BasicBlock *target_bb = find_bb_containing(target_va);
2874 
2875  /* Optionally create or add reason flags to called function. */
2876  Function *new_function = NULL;
2877  if ((func_heuristics & SgAsmFunction::FUNC_CALL_TARGET)) {
2878  new_function = add_function(target_va, SgAsmFunction::FUNC_CALL_TARGET);
2879  } else if (find_function(target_va)!=NULL) {
2880  find_function(target_va)->reason |= SgAsmFunction::FUNC_CALL_TARGET;
2881  }
2882 
2883  /* If the call target is part of a function (the current function or some other) and it's not the entry block then we
2884  * might need to rediscover the blocks of that function. We don't need to rediscover the blocks of that function if
2885  * that function is the current function and should remain in the current function (i.e., we didn't create a new
2886  * function). */
2887  if (target_bb && target_bb->function && target_va!=target_bb->function->entry_va &&
2888  (target_bb->function!=f || new_function!=NULL))
2889  target_bb->function->pending = true;
2890 
2891  /* Discovery continues at the successors. */
2892  const Disassembler::AddressSet &suc = successors(bb);
2893  for (Disassembler::AddressSet::const_iterator si=suc.begin(); si!=suc.end(); ++si) {
2894  if (*si!=f->entry_va)
2895  discover_blocks(f, *si, reason);
2896  }
2897 
2898  } else {
2899  append(f, bb, reason);
2900  const Disassembler::AddressSet& suc = successors(bb);
2901  for (Disassembler::AddressSet::const_iterator si=suc.begin(); si!=suc.end(); ++si) {
2902  if (*si!=f->entry_va)
2903  discover_blocks(f, *si, reason);
2904  }
2905  }
2906 }
2907 
2908 void
2909 Partitioner::discover_blocks(Function *f, unsigned reason)
2910 {
2911  Disassembler::AddressSet heads = f->heads;
2912  heads.insert(f->entry_va);
2913  for (Disassembler::AddressSet::iterator hi=heads.begin(); hi!=heads.end(); ++hi)
2914  discover_blocks(f, *hi, reason);
2915 }
2916 
2917 bool
2918 Partitioner::is_pe_dynlink_thunk(Instruction *insn)
2919 {
2920  SgAsmx86Instruction *insn_x86 = insn ? isSgAsmx86Instruction(insn->node) : NULL;
2921  if (!insn_x86 || x86_jmp!=insn_x86->get_kind() || insn_x86->get_operandList()->get_operands().size()!=1)
2922  return false; // not a thunk: wrong instruction
2923  SgAsmMemoryReferenceExpression *mref = isSgAsmMemoryReferenceExpression(insn_x86->get_operandList()->get_operands()[0]);
2924  SgAsmIntegerValueExpression *addr = mref ? isSgAsmIntegerValueExpression(mref->get_address()) : NULL;
2925  if (!addr)
2926  return false; // not a dynamic linking thunk: wrong addressing mode
2927  return pe_iat_extents.contains(Extent(addr->get_absolute_value(), 4));
2928 }
2929 
2930 bool
2931 Partitioner::is_pe_dynlink_thunk(BasicBlock *bb)
2932 {
2933  return bb && bb->insns.size()==1 && is_pe_dynlink_thunk(bb->insns.front());
2934 }
2935 
2936 bool
2937 Partitioner::is_pe_dynlink_thunk(Function *func)
2938 {
2939  return func && func->basic_blocks.size()==1 && is_pe_dynlink_thunk(func->entry_basic_block());
2940 }
2941 
2942 void
2943 Partitioner::analyze_cfg(SgAsmBlock::Reason reason)
2944 {
2945  for (size_t pass=1; true; pass++) {
2946  if (debug) fprintf(debug, "\n========== Partitioner::analyze_cfg() pass %zu ==========\n", pass);
2947  progress(debug, "Partitioner: starting %s pass %zu: "
2948  "%zu function%s, %zu insn%s, %zu block%s\n",
2949  stringifySgAsmBlockReason(reason, "BLK_").c_str(), pass,
2950  functions.size(), 1==functions.size()?"":"s", insns.size(), 1==insns.size()?"":"s",
2951  basic_blocks.size(), 1==basic_blocks.size()?"":"s");
2952 
2953  /* Analyze function return characteristics. */
2954  for (BasicBlocks::iterator bi=basic_blocks.begin(); bi!=basic_blocks.end(); ++bi) {
2955  BasicBlock *bb = bi->second;
2956  if (!bb->function)
2957  continue;
2958 
2959  rose_addr_t target_va = NO_TARGET; /*call target*/
2960  bool iscall = is_function_call(bb, &target_va);
2961  rose_addr_t return_va = canonic_block(bb->last_insn()->get_address() + bb->last_insn()->get_size()); // fall through
2962  BasicBlock *return_bb = NULL, *target_bb = NULL; /* computed only if needed since they might split basic blocks */
2963  bool succs_complete;
2964  Disassembler::AddressSet succs = successors(bb, &succs_complete);
2965 
2966  if (iscall && target_va!=NO_TARGET &&
2967  NULL!=(return_bb=find_bb_starting(return_va)) &&
2968  NULL!=(target_bb=find_bb_starting(target_va, false)) &&
2969  target_bb->function && target_bb->function->possible_may_return()) {
2970  if (return_bb->function && return_va==target_bb->function->entry_va && !bb->function->possible_may_return()) {
2971  /* This handles the case when function A's return from B falls through into B. In this case, since B
2972  * returns then A also returns. We mark A as returning.
2973  * function_A:
2974  * ...
2975  * CALL function_B
2976  * function_B:
2977  * RET
2978  */
2980  if (debug) {
2981  fprintf(debug, " Function F%08"PRIx64" may return by virtue of call fall-through at B%08"PRIx64"\n",
2982  bb->function->entry_va, bb->address());
2983  }
2984  }
2985  } else if (!bb->function->possible_may_return() && !is_function_call(bb, NULL) && succs_complete) {
2986  for (Disassembler::AddressSet::iterator si=succs.begin();
2987  si!=succs.end() && !bb->function->possible_may_return();
2988  ++si) {
2989  target_va = *si;
2990  target_bb = target_va!=0 ? find_bb_starting(target_va, false) : NULL;
2991  if (target_bb && target_bb->function && target_bb->function!=bb->function &&
2992  target_va==target_bb->function->entry_va && target_bb->function->possible_may_return()) {
2993  /* The block bb isn't a function call, but branches to the entry point of another function. If that
2994  * function returns then so does this one. This handles situations like:
2995  * function_A:
2996  * ...
2997  * JMP function_B
2998  * ...
2999  * function_B:
3000  * RET
3001  * We don't need to set function_A->pending because the reachability of the instruction after its JMP
3002  * won't change regardless of whether the "called" function returns (i.e., the return is to the caller
3003  * of function_A, not to function_A itself.) */
3005  if (debug) {
3006  fprintf(debug, " F%08"PRIx64" may return by virtue of branching to function F%08"PRIx64
3007  " which may return\n", bb->function->entry_va, target_bb->function->entry_va);
3008  }
3009  }
3010  }
3011  } else if (!bb->function->possible_may_return() && !is_function_call(bb, NULL) && !succs_complete) {
3012  /* If the basic block's successor is not known, then we must assume that it branches to something that could
3013  * return. */
3015  if (debug) {
3016  fprintf(debug, " F%08"PRIx64" may return by virtue of incomplete successors\n",
3017  bb->function->entry_va);
3018  }
3019  }
3020 
3021  // PE dynamic linking thunks are typically placed in the .text section and consist of an indirect jump through one
3022  // of the import address tables. If we didn't dynamically link in ROSE, then the IATs probably don't hold valid
3023  // function addresses, in which case we can't determine if the thunk returns. Therefore, when this situation
3024  // happens, we assume that the imported function returns.
3025  if (!bb->function->possible_may_return() && is_pe_dynlink_thunk(bb->function)) {
3026  bool invalid_callee_va = !succs_complete;
3027  for (Disassembler::AddressSet::iterator si=succs.begin(); !invalid_callee_va && si!=succs.end(); ++si)
3028  invalid_callee_va = NULL==find_instruction(*si);
3029  if (invalid_callee_va) {// otherwise we can just analyze the linked-in code
3031  }
3032  }
3033  }
3034 
3035  /* Which functions did we think didn't return but now think they might return? */
3036  Disassembler::AddressSet might_now_return;
3037  for (Functions::iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
3038  Function *func = fi->second;
3039  if (func->changed_may_return() && func->possible_may_return()) {
3040  if (debug)
3041  fprintf(debug, "%s F%08"PRIx64, might_now_return.empty()?"newly returning functions:":"", func->entry_va);
3042  might_now_return.insert(func->entry_va);
3043  func->commit_may_return();
3044  }
3045  }
3046  if (debug && !might_now_return.empty())
3047  fprintf(debug, "\n");
3048 
3049  /* If we previously thought a function didn't return, but now we think it might return, we need to mark as pending all
3050  * callers if the return address in that caller isn't already part of the caller function. There's no need to do this
3051  * fairly expensive loop if we didn't transition any functions from does-not-return to may-return. We use the
3052  * might_now_return set rather than looking up functions with find_function() because the former is probably faster,
3053  * especially if we have lots of functions but only a few transitioned from does-not-return to may-return, which is the
3054  * common case. */
3055  if (!might_now_return.empty()) {
3056  for (BasicBlocks::iterator bi=basic_blocks.begin(); bi!=basic_blocks.end(); ++bi) {
3057  BasicBlock *bb = bi->second;
3058  if (bb->function && !bb->function->pending) {
3059  Disassembler::AddressSet succs = successors(bb, NULL);
3060  for (Disassembler::AddressSet::iterator si=succs.begin(); si!=succs.end(); ++si) {
3061  if (might_now_return.find(*si)!=might_now_return.end()) {
3062  // This is a call from a basic block (bb) to a function that we now think might return. If the
3063  // return-to block is not already part of the calling function, then we should mark the calling
3064  // function as pending.
3065  rose_addr_t return_va = canonic_block(bb->last_insn()->get_address() + bb->last_insn()->get_size());
3066  BasicBlock *return_bb = find_bb_starting(return_va, false); // do not create the block
3067  if (return_bb && return_bb->function!=bb->function) {
3068  bb->function->pending = true;
3069  if (debug) {
3070  Function *called_func = find_function(*si); // don't call this unless debugging (performance)
3071  assert(called_func!=NULL);
3072  fprintf(debug,
3073  "newreturn %s F%08"PRIx64" \"%s\" returns to B%08"PRIx64" in F%08"PRIx64"\n",
3074  SgAsmFunction::reason_str(true, called_func->reason).c_str(), called_func->entry_va,
3075  called_func->name.c_str(), return_bb->address(), bb->function->entry_va);
3076  }
3077  }
3078  }
3079  }
3080  }
3081  }
3082  }
3083 
3084  /* Get a list of functions we need to analyze because they're marked as pending. */
3085  std::vector<Function*> pending;
3086  for (Functions::iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
3087  assert(fi->second->entry_va==fi->first);
3088  if (fi->second->pending) {
3089  fi->second->clear_basic_blocks();
3090  fi->second->pending = false; /*might be set back to true by discover_blocks() in loop below*/
3091  pending.push_back(fi->second);
3092  }
3093  }
3094 
3095  if (pending.size()==0) {
3096  if (debug)
3097  fprintf(debug, "finished for %s", stringifySgAsmBlockReason(reason, "BLK_").c_str());
3098  break;
3099  }
3100 
3101  /* Make sure all functions have an initial basic block if possible. */
3102  for (size_t i=0; i<pending.size(); ++i)
3103  discover_first_block(pending[i]);
3104 
3105  /* (Re)discover each function's blocks starting with the function entry point */
3106  for (size_t i=0; i<pending.size(); ++i) {
3107  if (debug) {
3108  fprintf(debug, "analyzing %s F%08"PRIx64" \"%s\" pass %zu: ",
3109  SgAsmFunction::reason_str(true, pending[i]->reason).c_str(),
3110  pending[i]->entry_va, pending[i]->name.c_str(), pass);
3111  }
3112  try {
3113  discover_blocks(pending[i], reason);
3114  } catch (const AbandonFunctionDiscovery&) {
3115  /* thrown when discover_blocks() decides it needs to start over on a function */
3116  }
3117  if (debug) {
3118  fputc(' ', debug);
3119  pending[i]->show_properties(debug);
3120  fputc('\n', debug);
3121  }
3122  }
3123  }
3124 }
3125 
3126 size_t
3127 Partitioner::detach_thunks()
3128 {
3129  size_t retval = 0;
3130  Functions functions = this->functions; // so iterators remain valid inside the loop
3131  for (Functions::iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
3132  while (detach_thunk(fi->second))
3133  ++retval;
3134  }
3135  return retval;
3136 }
3137 
3138 bool
3139 Partitioner::detach_thunk(Function *func)
3140 {
3141  /* Don't split a function that has no more than one instruction. */
3142  if (func->basic_blocks.empty())
3143  return false;
3144  BasicBlock *entry_bb = find_bb_starting(func->entry_va, false);
3145  if (NULL==entry_bb || entry_bb->function!=func)
3146  return false;
3147  if (func->basic_blocks.size()==1 && entry_bb->insns.size()==1)
3148  return false;
3149 
3150  /* Don't split a function whose first instruction is not an x86 JMP. */
3151  SgAsmx86Instruction *insn_x86 = isSgAsmx86Instruction(entry_bb->insns[0]);
3152  if (!insn_x86 || (insn_x86->get_kind()!=x86_jmp && insn_x86->get_kind()!=x86_farjmp))
3153  return false;
3154 
3155  /* The JMP must have a single target. */
3156  rose_addr_t second_va = 0;
3157  if (entry_bb->insns.size()>1) {
3158  second_va = entry_bb->insns[1]->get_address();
3159  } else {
3160  bool complete;
3161  Disassembler::AddressSet succs = successors(entry_bb, &complete);
3162  if (!complete || succs.size()!=1)
3163  return false;
3164  second_va = *(succs.begin());
3165  }
3166 
3167  /* The JMP target must be an instruction in the same function. */
3168  if (BasicBlock *target_bb = find_bb_containing(second_va)) {
3169  if (target_bb->function!=func)
3170  return false;
3171  } else {
3172  return false;
3173  }
3174 
3175  /* Don't split the function if the first instruction is a successor of any of the function's blocks. */
3176  for (BasicBlocks::iterator bi=func->basic_blocks.begin(); bi!=func->basic_blocks.end(); ++bi) {
3177  BasicBlock *bb = bi->second;
3178  Disassembler::AddressSet succs = successors(bb, NULL);
3179  if (std::find(succs.begin(), succs.end(), func->entry_va) != succs.end())
3180  return false;
3181  }
3182 
3183  /* Create a new function to hold everything but the entry instruction */
3184  if (debug)
3185  fprintf(debug, "Partitioner::detach_thunk: detaching thunk F%08"PRIx64" from body F%08"PRIx64"\n",
3186  func->entry_va, second_va);
3187  Function *new_func = add_function(second_va, func->reason);
3188  new_func->name = func->name;
3189  new_func->set_may_return(func->get_may_return());
3190 
3191     /* Adjust the old function, which now represents the thunk. */
3192     func->reason = SgAsmFunction::FUNC_THUNK;
3193     func->pending = false;
3194  if (!func->name.empty() && std::string::npos==func->name.find("-thunk"))
3195  func->name += "-thunk";
3196 
3197  /* Transfer all instructions (except the thunk itself) to new_func. */
3198  new_func->heads = func->heads;
3199  func->heads.clear();
3200  new_func->heads.erase(func->entry_va);
3201  BasicBlocks bblocks = func->basic_blocks;
3202  for (BasicBlocks::iterator bi=bblocks.begin(); bi!=bblocks.end(); ++bi) {
3203  if (bi->first==func->entry_va) {
3204  BasicBlock *new_bb = find_bb_starting(second_va);
3205  assert(new_bb!=NULL);
3206  if (new_bb->function==func) {
3207  remove(func, new_bb);
3208  append(new_func, new_bb, SgAsmBlock::BLK_ENTRY_POINT);
3209  } else if (new_bb->function==new_func) {
3210  /*void*/
3211  } else {
3212  assert(NULL==new_bb->function);
3213  append(new_func, new_bb, SgAsmBlock::BLK_ENTRY_POINT);
3214  }
3216  } else {
3217  BasicBlock *bb = bi->second;
3218  if (bb->function!=new_func) {
3219  assert(bb->function==func);
3220  remove(func, bb);
3221  append(new_func, bb, bb->reason);
3222  }
3223  }
3224  }
3225 
3226  /* Transfer all data blocks to new_func. */
3227  DataBlocks dblocks = func->data_blocks;
3228  for (DataBlocks::iterator di=dblocks.begin(); di!=dblocks.end(); ++di) {
3229  DataBlock *dblock = di->second;
3230  remove(func, dblock);
3231  append(new_func, dblock, dblock->reason);
3232  }
3233  return true;
3234 }
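/* Editorial sketch (not part of the original Partitioner.C): the shape detach_thunk() tests for is an
 * entry block that is nothing but an unconditional JMP into the body of the same function. The toy
 * types below are illustrative stand-ins for Function/BasicBlock, assuming only standard C++. */
#include <map>
struct ToyThunkInsn { bool is_unconditional_jmp; unsigned long target_va; };
struct ToyThunkFunc { unsigned long entry_va; std::map<unsigned long, ToyThunkInsn> insns; };
static bool toy_looks_like_thunk(const ToyThunkFunc &f)
{
    std::map<unsigned long, ToyThunkInsn>::const_iterator entry = f.insns.find(f.entry_va);
    if (entry==f.insns.end() || !entry->second.is_unconditional_jmp)
        return false;                                            /* entry must be a lone JMP */
    return f.insns.find(entry->second.target_va)!=f.insns.end(); /* target must be in this function */
}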
3235 
3236 /* Moves padding blocks to correct functions. */
3237 void
3238 Partitioner::adjust_padding()
3239 {
3240  /* Compute two maps: one for non-padding bytes belonging to functions, and one for the padding bytes. */
3241  FunctionRangeMap nonpadding_ranges;
3242  function_extent(&nonpadding_ranges);
3243  DataRangeMap padding_ranges;
3244  padding_extent(&padding_ranges);
3245  for (DataRangeMap::iterator pi=padding_ranges.begin(); pi!=padding_ranges.end(); ++pi)
3246  nonpadding_ranges.erase(pi->first);
3247 
3248  /* For each padding block, find the closest prior function and make that the owner of the padding. */
3249  for (DataRangeMap::iterator pi=padding_ranges.begin(); pi!=padding_ranges.end(); ++pi) {
3250  DataBlock *padding = pi->second.get();
3251  FunctionRangeMap::iterator npi = nonpadding_ranges.find_prior(pi->first.first());
3252  if (npi==nonpadding_ranges.end())
3253  continue;
3254  Function *func = npi->second.get();
3255  if (func!=effective_function(padding))
3256  append(func, padding, padding->reason, true/*force*/);
3257  }
3258 }
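/* Editorial sketch (not in the original source): the find_prior() lookup used above can be modeled on
 * a std::map keyed by range start -- upper_bound() finds the first entry strictly after the key, and
 * stepping back one entry yields the closest entry at or before it. Standard C++ only. */
#include <map>
template<class Map>
static typename Map::const_iterator find_prior_sketch(const Map &m, const typename Map::key_type &key)
{
    typename Map::const_iterator it = m.upper_bound(key);        /* first entry with key' > key */
    if (it==m.begin())
        return m.end();                                          /* nothing at or before key */
    return --it;                                                 /* closest entry with key' <= key */
}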
3259 
3260 /* Merge function fragments when possible. */
3261 void
3262 Partitioner::merge_function_fragments()
3263 {
3264  // Find connected components of the control flow graph, but only considering function fragments. We do this in a single
3265  // pass, and at the end of this loop each function fragment, F, will have group number group_number[traversal_number[F]].
3266  typedef std::map<Function*, size_t> TravNumMap;
3267  TravNumMap traversal_number; // which DFS traversal first visited the function?
3268  std::vector<size_t> group_number; // group number for each traversal (indexed by traversal number)
3269  for (Functions::iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
3270  if (SgAsmFunction::FUNC_GRAPH!=fi->second->reason)
3271  continue; // only consider functions that are strictly fragments
3272  if (traversal_number.find(fi->second)!=traversal_number.end())
3273  continue; // we already visited this function
3274 
3275  size_t tnum = group_number.size();
3276  group_number.push_back(tnum);
3277  traversal_number[fi->second] = tnum;
3278 
3279  // Depth first search considering only function fragments
3280  std::vector<Function*> dfs_functions;
3281  dfs_functions.push_back(fi->second);
3282  while (!dfs_functions.empty()) {
3283  Function *source_func = dfs_functions.back(); dfs_functions.pop_back();
3284  for (BasicBlocks::iterator bi=source_func->basic_blocks.begin(); bi!=source_func->basic_blocks.end(); ++bi) {
3285  BasicBlock *source_bb = bi->second;
3286  Disassembler::AddressSet succs = successors(source_bb);
3287  for (Disassembler::AddressSet::iterator si=succs.begin(); si!=succs.end(); ++si) {
3288  BasicBlock *target_bb = find_bb_starting(*si, false); // do not create the block
3289  Function *target_func = target_bb ? target_bb->function : NULL;
3290  if (target_func && target_func!=source_func && SgAsmFunction::FUNC_GRAPH==target_func->reason) {
3291  bool inserted = traversal_number.insert(std::make_pair(target_func, tnum)).second;
3292  if (inserted) {
3293  dfs_functions.push_back(target_func);
3294  } else {
3295  group_number[traversal_number[target_func]] = tnum;
3296  }
3297  }
3298  }
3299  }
3300  }
3301  }
3302 
3303  /* Reorganize so that we have lists of function fragments by group number. */
3304  typedef std::vector<std::vector<Function*> > FragmentIndex;
3305  FragmentIndex fragment_index(group_number.size(), std::vector<Function*>());
3306  for (Functions::iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
3307  TravNumMap::iterator tn_found = traversal_number.find(fi->second);
3308  if (tn_found!=traversal_number.end()) {
3309  size_t gnum = group_number[tn_found->second];
3310  fragment_index[gnum].push_back(fi->second);
3311  }
3312  }
3313 
3314  /* Find the non-fragment predecessors of each fragment group. A fragment group can be merged into another function only if
3315  * the fragment group has a single predecessor. */
3316  std::vector<Function*> parent(fragment_index.size(), NULL);
3317  for (Functions::iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
3318  Function *source_func = fi->second;
3319  if (SgAsmFunction::FUNC_GRAPH!=source_func->reason) {
3320  bool multi_parents = false;
3321  for (BasicBlocks::iterator bi=source_func->basic_blocks.begin();
3322  bi!=source_func->basic_blocks.end() && !multi_parents;
3323  ++bi) {
3324  Disassembler::AddressSet succs = successors(bi->second);
3325  for (Disassembler::AddressSet::iterator si=succs.begin(); si!=succs.end() && !multi_parents; ++si) {
3326  BasicBlock *target_bb = find_bb_starting(*si, false/*do not create*/);
3327  Function *target_func = target_bb ? target_bb->function : NULL;
3328  TravNumMap::iterator tn_found = target_func ? traversal_number.find(target_func) : traversal_number.end();
3329  size_t gnum = tn_found!=traversal_number.end() ? group_number[tn_found->second] : (size_t)(-1);
3330  if (gnum!=(size_t)(-1)) {
3331  /* source_func (non-fragment) branches to fragment group number <gnum> */
3332  if (parent[gnum]) {
3333  parent[gnum] = NULL;
3334  fragment_index[gnum].clear(); // multiple non-fragment predecessors of this group; discard group
3335  multi_parents = true;
3336  } else {
3337  parent[gnum] = source_func;
3338  }
3339  }
3340  }
3341  }
3342  }
3343  }
3344 
3345  /* Merge functions */
3346  if (debug)
3347  fprintf(debug, "Partitioner::merge_function_fragments...\n");
3348  for (size_t gnum=0; gnum<fragment_index.size(); ++gnum) {
3349  if (parent[gnum]!=NULL && !fragment_index[gnum].empty()) {
3350  if (debug) {
3351  fprintf(debug, "fragments %s F%08"PRIx64" \"%s\" merging",
3352  SgAsmFunction::reason_str(true, parent[gnum]->reason).c_str(),
3353  parent[gnum]->entry_va, parent[gnum]->name.c_str());
3354  }
3355  for (std::vector<Function*>::iterator fi=fragment_index[gnum].begin(); fi!=fragment_index[gnum].end(); ++fi) {
3356  if (debug)
3357  fprintf(debug, " F0x%08"PRIx64, (*fi)->entry_va);
3358  merge_functions(parent[gnum], *fi); *fi = NULL;
3359  parent[gnum]->reason &= ~SgAsmFunction::FUNC_GRAPH;
3360  }
3361  if (debug) {
3362  fputc(' ', debug);
3363  parent[gnum]->show_properties(debug);
3364  fputc('\n', debug);
3365  }
3366  }
3367  }
3368 }
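/* Editorial sketch (not in the original source): the connected-component trick above in miniature.
 * Each depth-first traversal gets a number; reaching a node that an earlier traversal already visited
 * folds that traversal's group into the current one, and the final group of a node is
 * group[trav[node]]. Toy integer nodes and a multimap edge list stand in for functions and their CFG
 * successors; standard C++ only. */
#include <map>
#include <set>
#include <vector>
static std::map<int, size_t> toy_group_fragments(const std::multimap<int, int> &edges, const std::set<int> &nodes)
{
    std::map<int, size_t> trav;                  /* node -> traversal that first reached it */
    std::vector<size_t> group;                   /* traversal number -> group number */
    for (std::set<int>::const_iterator ni=nodes.begin(); ni!=nodes.end(); ++ni) {
        if (trav.find(*ni)!=trav.end())
            continue;
        size_t tnum = group.size();
        group.push_back(tnum);
        trav[*ni] = tnum;
        std::vector<int> stack(1, *ni);
        while (!stack.empty()) {
            int u = stack.back(); stack.pop_back();
            std::pair<std::multimap<int,int>::const_iterator, std::multimap<int,int>::const_iterator>
                range = edges.equal_range(u);
            for (std::multimap<int,int>::const_iterator ei=range.first; ei!=range.second; ++ei) {
                if (trav.insert(std::make_pair(ei->second, tnum)).second) {
                    stack.push_back(ei->second);  /* first visit: node joins this traversal */
                } else {
                    group[trav[ei->second]] = tnum; /* seen before: merge its group into ours */
                }
            }
        }
    }
    std::map<int, size_t> result;
    for (std::map<int, size_t>::const_iterator ti=trav.begin(); ti!=trav.end(); ++ti)
        result[ti->first] = group[ti->second];
    return result;
}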
3369 
3370 void
3371 Partitioner::merge_functions(Function *parent, Function *other)
3372 {
3373  parent->reason |= other->reason;
3374 
3375  if (parent->name.empty()) {
3376  parent->name = other->name;
3377  } else if (!other->name.empty() && 0!=parent->name.compare(other->name)) {
3378  parent->name += "+" + other->name;
3379  }
3380 
3381  parent->move_basic_blocks_from(other);
3382  parent->move_data_blocks_from(other);
3383 
3384  if (other->pending)
3385  parent->pending = true;
3386 
3387  parent->promote_may_return(other->get_may_return());
3388 
3389  functions.erase(other->entry_va);
3390  delete other;
3391 }
3392 
3394 void
3395 Partitioner::name_pe_dynlink_thunks(SgAsmInterpretation *interp)
3396 {
3397  // AST visitor finds PE Import Items and adds their address/name pair to a map.
3398  struct AddrName: AstSimpleProcessing {
3399  typedef std::map<rose_addr_t, std::string> NameMap;
3400  NameMap names;
3401  SgAsmGenericHeader *hdr;
3402  AddrName(SgAsmInterpretation *interp) {
3403  if (interp) {
3404  const SgAsmGenericHeaderPtrList &hdrs = interp->get_headers()->get_headers();
3405  for (SgAsmGenericHeaderPtrList::const_iterator hi=hdrs.begin(); hi!=hdrs.end(); ++hi) {
3406  hdr = *hi;
3407  traverse(hdr, preorder);
3408  }
3409  }
3410  }
3411  void visit(SgNode *node) {
3412  if (SgAsmPEImportItem *import_item = isSgAsmPEImportItem(node)) {
3413  std::string name = import_item->get_name()->get_string();
3414  if (!name.empty() && !import_item->get_by_ordinal()) {
3415  // Add a name for both the absolute virtual address and the relative virtual address. The IAT will contain
3416  // relative addresses unless BinaryLoader applied fixups.
3417  rose_addr_t va = import_item->get_hintname_rva().get_va();
3418  names[va] = name;
3419  rose_addr_t rva = va - hdr->get_base_va();
3420  names[rva] = name;
3421  }
3422  }
3423  }
3424  std::string operator()(rose_addr_t va) const {
3425  NameMap::const_iterator found = names.find(va);
3426  return found==names.end() ? std::string() : found->second;
3427  }
3428  } names(interp);
3429 
3430  // Identify PE dynamic linking thunks and give them the name of the imported function to which they branch. FIXME: we
3431  // might want to change the name slightly because otherwise the thunk will have the same name as the linked-in function
3432  // if/after BinaryLoader does the linking. In contrast, the ELF executables typically place their dynamic linking
3433     // thunks in a ".plt" section (Procedure Linkage Table) and we name the thunks so that if the linked-in function is
3434  // named "printf", the thunk is named "printf@plt".
3435  for (Functions::iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
3436  Function *func = fi->second;
3437  if (is_pe_dynlink_thunk(func)) {
3439  if (func->name.empty()) {
3440  BasicBlock *bb = func->entry_basic_block();
3441  bool complete;
3442  Disassembler::AddressSet succs = successors(bb, &complete);
3443  if (complete && 1==succs.size())
3444  func->name = names(*succs.begin());
3445  }
3446  }
3447  }
3448 }
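/* Editorial sketch (not in the original source): the naming rule above in miniature -- a thunk whose
 * control flow has exactly one known successor takes the import name recorded for that address, and
 * anything ambiguous stays unnamed. Standard C++ stand-ins only. */
#include <map>
#include <set>
#include <string>
static std::string toy_thunk_name(const std::set<unsigned long> &succs, bool complete,
                                  const std::map<unsigned long, std::string> &import_names)
{
    if (!complete || 1!=succs.size())
        return std::string();                                /* ambiguous: leave the thunk unnamed */
    std::map<unsigned long, std::string>::const_iterator found = import_names.find(*succs.begin());
    return found==import_names.end() ? std::string() : found->second;
}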
3449 
3450 void
3451 Partitioner::post_cfg(SgAsmInterpretation *interp)
3452 {
3453  /* Obtain aggregate statistics over all the functions, and cache them. These statistics describe code, so we want to do
3454  * this before we add data blocks to the functions. Any statistics already cached should be considered outdated. */
3455  clear_aggregate_statistics();
3456  RegionStats *mean = aggregate_statistics();
3457  RegionStats *variance = get_aggregate_variance();
3458  if (debug) {
3459  std::ostringstream s;
3460  s <<"=== Mean ===\n" <<*mean <<"\n"
3461  <<"=== Variance ===\n" <<*variance <<"\n";
3462  fputs(s.str().c_str(), debug);
3463  }
3464 
3465  /* A memory map that contains only the executable regions. I.e., those that might contain instructions. */
3466  MemoryMap exe_map = *map;
3467  exe_map.prune(MemoryMap::MM_PROT_EXEC);
3468 
3469  /* Add unassigned intra-function blocks to the surrounding function. This needs to come before detecting inter-function
3470  * padding, otherwise it will also try to add the stuff between the true function and its following padding. */
3471     if (func_heuristics & SgAsmFunction::FUNC_INTRABLOCK) {
3472         FindFunctionFragments fff;
3473         fff.require_noninterleaved = true;
3474  fff.require_intrafunction = true;
3475  fff.threshold = 0; // treat all intra-function regions as code
3476  scan_unassigned_bytes(&fff, &exe_map);
3477  }
3478 
3479  /* Detect inter-function padding */
3480  if (func_heuristics & SgAsmFunction::FUNC_PADDING) {
3481  FindDataPadding cb;
3482  cb.minimum_nrep = 2;
3483  cb.maximum_nrep = 1024*1024;
3484  cb.begins_contiguously = false;
3485  cb.ends_contiguously = false;
3486 
3487  SgUnsignedCharList pattern;
3488         pattern.push_back(0x90); /* x86 NOP */
3489  cb.patterns.push_back(pattern);
3490 
3491  pattern.clear();
3492  pattern.push_back(0xcc); /* x86 INT3 */
3493  cb.patterns.push_back(pattern);
3494 
3495  /* Scan only executable regions of memory. */
3496  MemoryMap exe_map = *map;
3497  exe_map.prune(MemoryMap::MM_PROT_EXEC);
3498  scan_interfunc_bytes(&cb, &exe_map);
3499  }
3500 
3501  /* Find thunks. First use FindThunkTables, which has a more relaxed definition of a "thunk" but requires some minimum
3502  * number of consecutive thunks in order to trigger. Then use FindThunks, which has a strict definition including that the
3503  * thunk target must be a function. By running the latter in a loop, we can find thunks that branch to other thunks. */
3504  if (func_heuristics & SgAsmFunction::FUNC_THUNK) {
3505  FindThunkTables find_thunk_tables;
3506  find_thunk_tables.minimum_nthunks = 3; // at least this many JMPs per table
3507  find_thunk_tables.validate_targets = false;
3508  scan_unassigned_bytes(&find_thunk_tables, &exe_map);
3509  for (size_t npasses=0; npasses<5; ++npasses) {
3510  FindThunks find_thunks;
3511  scan_unassigned_insns(&find_thunks);
3512  if (0==find_thunks.nfound)
3513  break;
3514  }
3515  }
3516 
3517  /* Find functions that we missed between inter-function padding. */
3518  if (func_heuristics & SgAsmFunction::FUNC_MISCMASK) {
3519  FindInterPadFunctions find_interpad_functions;
3520  scan_unassigned_bytes(&find_interpad_functions, &exe_map);
3521  }
3522 
3523  /* Find code fragments that appear after a function. */
3524     if (func_heuristics & SgAsmFunction::FUNC_INTRABLOCK) {
3525         FindFunctionFragments fff;
3526         fff.require_noninterleaved = false;
3527  fff.require_intrafunction = false;
3528  fff.threshold = 0.7;
3529  scan_unassigned_bytes(&fff, &exe_map);
3530  }
3531 
3532  /* Run another analysis of the CFG because we may need to fix some things up after having added more blocks from the
3533  * post-cfg analyses we did above. If nothing happened above, then analyze_cfg() should be fast. */
3534  analyze_cfg(SgAsmBlock::BLK_GRAPH2);
3535 
3536  /* Split thunks off from their jumped-to function. Not really necessary, but the result is more like other common
3537  * disassemblers and also more closely matches what would happen if we had debugging information in the executable. This
3538  * should only run after analyze_cfg() because it assumes that a function's blocks have all been discovered -- it does some
3539  * analysis on the function's internal control flow. */
3540  if (0!=(func_heuristics & SgAsmFunction::FUNC_THUNK) && detach_thunks()>0)
3541  analyze_cfg(SgAsmBlock::BLK_GRAPH3);
3542 
3543  /* Find thunks again. We might have more things satisfying the relatively strict thunk definition. */
3544  if (func_heuristics & SgAsmFunction::FUNC_THUNK) {
3545  for (size_t npasses=0; npasses<5; ++npasses) {
3546  FindThunks find_thunks;
3547  scan_unassigned_insns(&find_thunks);
3548  if (0==find_thunks.nfound)
3549  break;
3550  }
3551  }
3552 
3553  /* Append data to the end(s) of each normal function. */
3554  FindData find_data;
3555  scan_unassigned_bytes(&find_data, &ro_map);
3556 
3557  /* Make sure padding is back where it belongs. */
3558  adjust_padding();
3559 
3560  /* Merge extra functions that we might have created. Sometimes it's possible that we break a function into too many parts,
3561  * and we can recombine those parts now. */
3562  merge_function_fragments();
3563 
3564  /* Give existing functions names from symbol tables. Don't create more functions. */
3565  if (interp && 0!=(func_heuristics & SgAsmFunction::FUNC_IMPORT)) {
3566  name_pe_dynlink_thunks(interp);
3567  const SgAsmGenericHeaderPtrList &headers = interp->get_headers()->get_headers();
3568  for (size_t i=0; i<headers.size(); i++) {
3569  name_plt_entries(headers[i]); // give names to ELF .plt trampolines
3570  name_import_entries(headers[i]); // give names to PE import thunks
3571  }
3572  }
3573 
3574  /* Add the BLK_CFGHEAD reason to all blocks that are also in the function's CFG head list. We do this once here rather
3575  * than searching the heads list in each pass of analyze_cfg(). */
3576  for (BasicBlocks::iterator bi=basic_blocks.begin(); bi!=basic_blocks.end(); ++bi) {
3577  BasicBlock *bb = bi->second;
3578  if (bb->function && 0==(bb->function->reason & SgAsmFunction::FUNC_LEFTOVERS) &&
3579             bb->function->heads.find(bb->address())!=bb->function->heads.end())
3580             bb->reason |= SgAsmBlock::BLK_CFGHEAD;
3581     }
3582 
3583  progress(debug,
3584  "Partitioner completed: %zu function%s, %zu insn%s, %zu block%s\n",
3585  functions.size(), 1==functions.size()?"":"s", insns.size(), 1==insns.size()?"":"s",
3586  basic_blocks.size(), 1==basic_blocks.size()?"":"s");
3587 }
3588 
3589 size_t
3590 Partitioner::function_extent(FunctionRangeMap *extents/*in,out*/)
3591 {
3592  size_t retval = 0;
3593  for (Functions::iterator fi=functions.begin(); fi!=functions.end(); ++fi)
3594  retval += function_extent(fi->second, extents);
3595  return retval;
3596 }
3597 
3598 size_t
3599 Partitioner::function_extent(Function *func,
3600                              FunctionRangeMap *extents/*out*/,
3601  rose_addr_t *lo_addr_ptr/*out*/, rose_addr_t *hi_addr_ptr/*out*/)
3602 {
3603  size_t nnodes=0;
3604  rose_addr_t lo_addr=0, hi_addr=0;
3605  std::set<DataBlock*> my_dblocks;
3606 
3607  for (BasicBlocks::iterator bi=func->basic_blocks.begin(); bi!=func->basic_blocks.end(); ++bi) {
3608  /* Find the extents for all the instructions in this basic block. */
3609  BasicBlock *bb = bi->second;
3610  for (InstructionVector::iterator ii=bb->insns.begin(); ii!=bb->insns.end(); ++ii) {
3611  rose_addr_t start = (*ii)->get_address();
3612  size_t size = (*ii)->get_size();
3613  if (0==nnodes++) {
3614  lo_addr = start;
3615  hi_addr = start + size;
3616  } else {
3617  lo_addr = std::min(lo_addr, start);
3618  hi_addr = std::max(hi_addr, start + size);
3619  }
3620  if (extents)
3621  extents->insert(Extent(start, size), func);
3622  }
3623 
3624  /* Gather data blocks associated with this basic block, but only if they aren't explicitly assigned to a function. */
3625  for (std::set<DataBlock*>::iterator di=bb->data_blocks.begin(); di!=bb->data_blocks.end(); ++di) {
3626  if (NULL==(*di)->function)
3627  my_dblocks.insert(*di);
3628  }
3629  }
3630 
3631  /* Gather the data blocks associated with this function. */
3632  for (DataBlocks::iterator bi=func->data_blocks.begin(); bi!=func->data_blocks.end(); ++bi)
3633  my_dblocks.insert(bi->second);
3634 
3635  /* Add the extents of all this function's data blocks. */
3636  for (std::set<DataBlock*>::iterator di=my_dblocks.begin(); di!=my_dblocks.end(); ++di) {
3637  DataBlock *dblock = *di;
3638  DataRangeMap data_extents;
3639  DataRangeMap *data_extents_ptr = extents ? &data_extents : NULL;
3640  rose_addr_t lo, hi;
3641  size_t n = datablock_extent(dblock, data_extents_ptr, &lo, &hi);
3642  if (n>0) {
3643  if (0==nnodes) {
3644  lo_addr = lo;
3645  hi_addr = hi;
3646  } else {
3647  lo_addr = std::min(lo_addr, lo);
3648  hi_addr = std::max(hi_addr, hi);
3649  }
3650  nnodes += n;
3651  if (extents) {
3652  for (DataRangeMap::iterator di2=data_extents.begin(); di2!=data_extents.end(); ++di2)
3653  extents->insert(di2->first, func);
3654  }
3655  }
3656  }
3657 
3658  /* Return values */
3659  if (lo_addr_ptr)
3660  *lo_addr_ptr = lo_addr;
3661  if (hi_addr_ptr)
3662  *hi_addr_ptr = hi_addr;
3663  return nnodes;
3664 }
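/* Editorial sketch (not in the original source): the lo/hi accumulation pattern used throughout the
 * extent functions, reduced to its essentials over (start,size) pairs. Standard C++ only. */
#include <algorithm>
#include <utility>
#include <vector>
static bool toy_extent(const std::vector<std::pair<unsigned long, unsigned long> > &ranges,
                       unsigned long &lo /*out*/, unsigned long &hi /*out*/)
{
    bool first = true;
    for (size_t i=0; i<ranges.size(); ++i) {
        unsigned long start = ranges[i].first, end = start + ranges[i].second;
        if (first) {
            lo = start;                                      /* first range seeds both bounds */
            hi = end;
            first = false;
        } else {
            lo = std::min(lo, start);                        /* later ranges only widen them */
            hi = std::max(hi, end);
        }
    }
    return !first;                                           /* false if there were no ranges */
}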
3665 
3666 size_t
3667 Partitioner::padding_extent(DataRangeMap *extents/*in,out*/)
3668 {
3669  size_t nblocks = 0;
3670  for (DataBlocks::const_iterator di=data_blocks.begin(); di!=data_blocks.end(); ++di) {
3671  DataBlock *dblock = di->second;
3672  if (0!=(dblock->reason & SgAsmBlock::BLK_PADDING) && NULL!=effective_function(dblock)) {
3673  datablock_extent(dblock, extents);
3674  ++nblocks;
3675  }
3676  }
3677  return nblocks;
3678 }
3679 
3680 size_t
3681 Partitioner::datablock_extent(DataRangeMap *extents/*in,out*/)
3682 {
3683  size_t nblocks = 0;
3684  for (DataBlocks::const_iterator di=data_blocks.begin(); di!=data_blocks.end(); ++di) {
3685  DataBlock *dblock = di->second;
3686  if (NULL!=effective_function(dblock)) {
3687  datablock_extent(dblock, extents);
3688  ++nblocks;
3689  }
3690  }
3691  return nblocks;
3692 }
3693 
3694 size_t
3695 Partitioner::datablock_extent(DataBlock *db,
3696                               DataRangeMap *extents/*in,out*/,
3697  rose_addr_t *lo_addr_ptr/*out*/, rose_addr_t *hi_addr_ptr/*out*/)
3698 {
3699  if (db->nodes.empty()) {
3700  if (lo_addr_ptr)
3701  *lo_addr_ptr = 0;
3702  if (hi_addr_ptr)
3703  *hi_addr_ptr = 0;
3704  } else {
3705  rose_addr_t start = db->nodes.front()->get_address();
3706  size_t size = db->nodes.front()->get_size();
3707  if (lo_addr_ptr)
3708  *lo_addr_ptr = start;
3709  if (hi_addr_ptr)
3710  *hi_addr_ptr = start+size;
3711  if (extents)
3712  extents->insert(Extent(start, size), db);
3713  }
3714 
3715  for (size_t i=1; i<db->nodes.size(); i++) {
3716  SgAsmStaticData *node = db->nodes[i];
3717  rose_addr_t start = node->get_address();
3718  size_t size = node->get_size();
3719 
3720  if (lo_addr_ptr)
3721  *lo_addr_ptr = std::min(*lo_addr_ptr, start);
3722  if (hi_addr_ptr)
3723  *hi_addr_ptr = std::max(*hi_addr_ptr, start+size);
3724  if (extents)
3725  extents->insert(Extent(start, size), db);
3726  }
3727  return db->nodes.size();
3728 }
3729 
3730 /* A function is contiguous if none of the bytes between its extents belong to any other function. */
3731 bool
3732 Partitioner::is_contiguous(Function *func, bool strict)
3733 {
3734  FunctionRangeMap extents;
3735  rose_addr_t lo_addr, hi_addr;
3736  if (0==function_extent(func, &extents, &lo_addr, &hi_addr) || 1==extents.size())
3737  return true;
3738  if (strict)
3739  return false;
3740 
3741  /* Check for instructions belonging to other functions. */
3742  const rose_addr_t max_insn_size = 16; /* FIXME: This is a kludge, but should work 99% of the time [RPM 2011-09-16] */
3743  InstructionMap::iterator ii = insns.lower_bound(std::max(lo_addr,max_insn_size)-max_insn_size);
3744  for (/*void*/; ii!=insns.end() && ii->first<hi_addr; ++ii) {
3745  if (ii->first>=lo_addr) {
3746  BasicBlock *bb = find_bb_containing(ii->first, false);
3747  if (bb && bb->function && bb->function!=func)
3748  return false;
3749  }
3750  }
3751 
3752  /* Check for data belonging to other functions.
3753  * FIXME: we could use a faster method of doing this! [RPM 2011-09-29] */
3754  for (DataBlocks::iterator dbi=data_blocks.begin(); dbi!=data_blocks.end(); ++dbi) {
3755  DataBlock *block = dbi->second;
3756  Function *block_func = effective_function(block);
3757  if (block_func!=NULL && block_func!=func) {
3758  for (size_t i=0; i<block->nodes.size(); i++) {
3759  if (block->nodes[i]->get_address() < hi_addr &&
3760  block->nodes[i]->get_address() + block->nodes[i]->get_size() > lo_addr)
3761  return false;
3762  }
3763  }
3764  }
3765 
3766  return true;
3767 }
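/* Editorial sketch (not in the original source): the non-strict contiguity test above boils down to
 * "no byte in [lo,hi) has a different owner". With owners indexed by address, that is one ordered
 * scan. Toy integer owner ids stand in for Function pointers; standard C++ only. */
#include <map>
static bool toy_is_contiguous(const std::map<unsigned long, int> &owner_by_va,
                              unsigned long lo, unsigned long hi, int func_id)
{
    std::map<unsigned long, int>::const_iterator it = owner_by_va.lower_bound(lo);
    for (/*void*/; it!=owner_by_va.end() && it->first<hi; ++it) {
        if (it->second!=func_id)
            return false;                                    /* a foreign byte interrupts the extent */
    }
    return true;
}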
3768 
3769 /* Update CFG edge nodes. */
3770 void
3771 Partitioner::fixup_cfg_edges(SgNode *ast)
3772 {
3773  typedef std::map<rose_addr_t, SgAsmBlock*> BlockMap;
3774 
3775  /* Build a map from address to SgAsmBlock so we can do lookups quickly. */
3776  struct BlockMapBuilder: public SgSimpleProcessing {
3777  BlockMap *block_map;
3778  BlockMapBuilder(SgNode *ast, BlockMap *block_map): block_map(block_map) {
3779  traverse(ast, preorder);
3780  }
3781  void visit(SgNode *node) {
3782  SgAsmBlock *block = isSgAsmBlock(node);
3783  if (block!=NULL) {
3784  const SgAsmStatementPtrList &stmts = block->get_statementList();
3785  SgAsmInstruction *insn = stmts.empty() ? NULL : isSgAsmInstruction(stmts.front());
3786  if (insn)
3787  block_map->insert(std::make_pair(insn->get_address(), block));
3788  }
3789  }
3790  };
3791 
3792  /* Now add block pointers to the successor targets. */
3793  struct TargetPopulator: public SgSimpleProcessing {
3794  const BlockMap &block_map;
3795  TargetPopulator(SgNode *ast, const BlockMap &block_map): block_map(block_map) {
3796  traverse(ast, preorder);
3797  }
3798  void visit(SgNode *node) {
3799  SgAsmBlock *block = isSgAsmBlock(node);
3800  if (block) {
3801  for (size_t i=0; i<block->get_successors().size(); i++) {
3802  SgAsmIntegerValueExpression *target = block->get_successors()[i];
3803  if (target && NULL==target->get_base_node()) {
3804  BlockMap::const_iterator bi=block_map.find(target->get_absolute_value());
3805  if (bi!=block_map.end())
3806  target->make_relative_to(bi->second);
3807  }
3808  }
3809  }
3810  }
3811  };
3812 
3813  BlockMap block_map;
3814  BlockMapBuilder(ast, &block_map);
3815  TargetPopulator(ast, block_map);
3816 }
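/* Editorial sketch (not in the original source): the two-pass fixup above in miniature -- pass one
 * indexes blocks by starting address, pass two resolves numeric successor targets to block pointers,
 * silently skipping targets with no known block. Toy types; standard C++ only. */
#include <map>
#include <vector>
struct ToyCfgBlock {
    unsigned long va;
    std::vector<unsigned long> succ_vas;                     /* successors by address */
    std::vector<ToyCfgBlock*> succ_blocks;                   /* successors by pointer, filled below */
};
static void toy_fixup_cfg_edges(std::vector<ToyCfgBlock> &blocks)
{
    std::map<unsigned long, ToyCfgBlock*> by_va;
    for (size_t i=0; i<blocks.size(); ++i)
        by_va[blocks[i].va] = &blocks[i];                    /* pass 1: address -> block */
    for (size_t i=0; i<blocks.size(); ++i) {                 /* pass 2: resolve each target */
        for (size_t j=0; j<blocks[i].succ_vas.size(); ++j) {
            std::map<unsigned long, ToyCfgBlock*>::iterator found = by_va.find(blocks[i].succ_vas[j]);
            if (found!=by_va.end())
                blocks[i].succ_blocks.push_back(found->second);
        }
    }
}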
3817 
3818 /* Make pointers relative to what they point into. */
3819 void
3820 Partitioner::fixup_pointers(SgNode *ast, SgAsmInterpretation *interp)
3821 {
3822 
3823  struct FixerUpper: public AstPrePostProcessing {
3824  Partitioner *p;
3825  SgAsmInterpretation *interp;
3826  SgAsmInstruction *insn;
3827  SgAsmGenericSectionPtrList mapped_sections;
3828  DataRangeMap static_data;
3829 
3830  FixerUpper(Partitioner *p, SgAsmInterpretation *interp)
3831  : p(p), interp(interp), insn(NULL) {}
3832 
3833  void atTraversalStart() {
3834  /* Get a list of all memory-mapped sections in the interpretation. */
3835  if (interp) {
3836  const SgAsmGenericHeaderPtrList &headers = interp->get_headers()->get_headers();
3837  for (SgAsmGenericHeaderPtrList::const_iterator hi=headers.begin(); hi!=headers.end(); ++hi) {
3838  if ((*hi)->is_mapped())
3839  mapped_sections.push_back(*hi);
3840  SgAsmGenericSectionPtrList file_sections = (*hi)->get_mapped_sections();
3841  mapped_sections.insert(mapped_sections.end(), file_sections.begin(), file_sections.end());
3842  }
3843  }
3844 
3845  /* Get a list of all static data blocks */
3846  p->datablock_extent(&static_data);
3847  }
3848 
3849  void preOrderVisit(SgNode *node) {
3850  if (!insn) {
3851  insn = isSgAsmInstruction(node);
3852             } else if (isSgAsmIntegerValueExpression(node)) {
3853                 SgAsmIntegerValueExpression *ival = isSgAsmIntegerValueExpression(node);
3854 
3855  /* Don't monkey with constants that are already relative to some other node. These are things that have been
3856  * already fixed up by other methods. */
3857  if (ival->get_base_node()!=NULL)
3858  return;
3859  rose_addr_t va = ival->get_absolute_value();
3860 
3861  /* If this constant is a code pointer, then make the pointer relative to the instruction it points to. If that
3862  * instruction is the entry instruction of a function, then point to the function instead. A value is
3863  * considered a code pointer only if it points to an existing instruction that's contained in a basic block,
3864  * and that basic block is part of a valid function. This constraint weeds out pointers to code that was
3865  * disassembled but later discarded. */
3866  Instruction *target_insn = p->find_instruction(va, false/*do not create*/);
3867  if (target_insn && target_insn->bblock && target_insn->bblock->function &&
3868  0==(target_insn->bblock->function->reason & SgAsmFunction::FUNC_LEFTOVERS)) {
3869  SgAsmFunction *target_func = SageInterface::getEnclosingNode<SgAsmFunction>(target_insn->node);
3870  if (target_func && target_func->get_entry_va()==target_insn->get_address()) {
3871  ival->make_relative_to(target_func);
3872  } else {
3873  ival->make_relative_to(target_insn->node);
3874  }
3875  return;
3876  }
3877 
3878  /* If this constant points into a static data block, then make it relative to that block. */
3879  DataRangeMap::iterator dbi = static_data.find(va);
3880  if (dbi!=static_data.end()) {
3881  DataBlock *dblock = dbi->second.get();
3882  for (size_t i=0; i<dblock->nodes.size(); ++i) {
3883  SgAsmStaticData *sd = dblock->nodes[i];
3884  if (va>=sd->get_address() && va<sd->get_address()+sd->get_size()) {
3885  ival->make_relative_to(sd);
3886  return;
3887  }
3888  }
3889  }
3890 
3891  /* If this constant points into a non-executable data segment, then make the pointer relative to that data
3892                  * segment. */
3893                 SgAsmGenericSection *section = SgAsmGenericFile::best_section_by_va(mapped_sections, va);
3894                 if (section && !section->get_mapped_xperm()) {
3895  ival->make_relative_to(section);
3896  return;
3897  }
3898  }
3899  }
3900 
3901  void postOrderVisit(SgNode *node) {
3902  if (isSgAsmInstruction(node))
3903  insn = NULL;
3904  }
3905  };
3906 
3907  FixerUpper(this, interp).traverse(ast);
3908 }
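/* Editorial sketch (not in the original source): the effect of make_relative_to() in miniature -- a
 * constant stored as (base object, offset) keeps pointing at the same place inside its base even if
 * the base object is later relocated. Toy types; standard C++ only. */
struct ToyBaseObject { unsigned long va; };
struct ToyRelValue {
    ToyBaseObject *base;                                     /* NULL for absolute values */
    unsigned long offset;
    unsigned long get() const { return (base ? base->va : 0) + offset; }
    void make_relative_to(ToyBaseObject *b, unsigned long absolute) {
        base = b;
        offset = absolute - b->va;                           /* value follows the base from now on */
    }
};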
3909 
3910 /* Build the global block containing all functions. */
3911 SgAsmBlock *
3912 Partitioner::build_ast(SgAsmInterpretation *interp)
3913 {
3914  /* Build a function to hold all the unassigned instructions. Update documentation if changing the name of
3915  * this generated function! We do this by traversing the instructions and obtaining a basic block for each one. If the
3916  * basic block doesn't belong to a function yet, we add it to this special one. Note that we cannot traverse the list of
3917  * instructions directly because creating the basic block might cause additional instructions to be created.
3918  *
3919  * Do not include the instruction in the leftovers functions if that instruction is completely overlapped by the bytes of
3920  * an existing, non-leftovers function. */
3921  Function *catchall = NULL;
3922  if ((func_heuristics & SgAsmFunction::FUNC_LEFTOVERS)) {
3923 
3924  /* List of all bytes occupied by functions. */
3925  FunctionRangeMap existing;
3926  function_extent(&existing);
3927 
3928  /* Repeatedly add unassigned instructions to the leftovers function. */
3929  bool process_instructions;
3930  do {
3931  process_instructions = false;
3932  InstructionMap insns_copy = insns;
3933  for (InstructionMap::iterator ii=insns_copy.begin(); ii!=insns_copy.end(); ++ii) {
3934  rose_addr_t va = ii->first;
3935  size_t size = ii->second->get_size();
3936  if (!existing.contains(Extent(va, size))) {
3937  BasicBlock *bb = find_bb_containing(ii->first);
3938  assert(bb!=NULL);
3939  if (!bb->function) {
3940  if (!catchall)
3941  catchall = add_function(ii->first, SgAsmFunction::FUNC_LEFTOVERS, "***uncategorized blocks***");
3942  append(catchall, bb, SgAsmBlock::BLK_LEFTOVERS);
3943  process_instructions = true;
3944  }
3945  }
3946  }
3947  } while (process_instructions);
3948  }
3949 
3950  /* Build the AST */
3951  SgAsmBlock *retval = new SgAsmBlock;
3952  for (Functions::const_iterator fi=functions.begin(); fi!=functions.end(); ++fi) {
3953  SgAsmFunction *func_decl = build_ast(fi->second);
3954  if (!func_decl) continue;
3955  retval->get_statementList().push_back(func_decl);
3956  func_decl->set_parent(retval);
3957  }
3958 
3959  /* Return catchall blocks to the free pool */
3960  if (catchall) {
3961  catchall->clear_basic_blocks();
3962  catchall->clear_data_blocks();
3963  functions.erase(catchall->entry_va);
3964  delete catchall;
3965  }
3966 
3967  /* Make pointers relative to the thing into which they point. */
3968  fixup_cfg_edges(retval);
3969  fixup_pointers(retval, interp);
3970  return retval;
3971 }
3972 
3973 /* Build a function node containing all basic blocks and data blocks of the function. */
3974 SgAsmFunction *
3975 Partitioner::build_ast(Function *f)
3976 {
3977  if (f->basic_blocks.empty()) {
3978  if (debug)
3979  fprintf(debug, "function F%08"PRIx64" \"%s\" has no basic blocks!\n", f->entry_va, f->name.c_str());
3980  return NULL;
3981  }
3982 
3983  /* Get the list of basic blocks and data blocks. We'll want them to be added to the function in order of their starting
3984  * address, with basic blocks and data blocks interleaved. */
3985  typedef std::multimap<rose_addr_t, SgAsmStatement*> NodeMap;
3986  std::set<DataBlock*> my_data_blocks;
3987  NodeMap nodes;
3988  BasicBlock *first_basic_block = NULL;
3989  for (BasicBlocks::iterator bi=f->basic_blocks.begin(); bi!=f->basic_blocks.end(); ++bi) {
3990  BasicBlock *bblock = bi->second;
3991  if (!first_basic_block)
3992  first_basic_block = bblock;
3993 
3994  /* The instructions for this basic block */
3995  SgAsmStatement *node = build_ast(bblock);
3996  nodes.insert(std::make_pair(bblock->address(), node));
3997 
3998  /* The data associated with this basic block */
3999  for (std::set<DataBlock*>::iterator di=bblock->data_blocks.begin(); di!=bblock->data_blocks.end(); ++di) {
4000  DataBlock *dblock = *di;
4001  Function *dblock_func = effective_function(dblock);
4002  if (dblock_func==f)
4003  my_data_blocks.insert(dblock);
4004  }
4005  }
4006 
4007  for (DataBlocks::iterator di=f->data_blocks.begin(); di!=f->data_blocks.end(); ++di) {
4008  DataBlock *dblock = di->second;
4009  assert(dblock->function==f);
4010  my_data_blocks.insert(dblock);
4011  }
4012 
4013  for (std::set<DataBlock*>::iterator di=my_data_blocks.begin(); di!=my_data_blocks.end(); ++di) {
4014  DataBlock *dblock = *di;
4015  SgAsmBlock *ast_block = build_ast(dblock);
4016  nodes.insert(std::make_pair(dblock->address(), ast_block));
4017  }
4018 
4019  /* Create the AST function node. */
4020  SgAsmFunction *retval = new SgAsmFunction;
4021  retval->set_entry_va(f->entry_va);
4022  retval->set_name(f->name);
4023  retval->set_address(first_basic_block->address());
4024 
4025  /* Set the SgAsmFunction::can_return property. If we've never indicated that a function might return then assume it
4026      * doesn't return. We're all done with analysis now, so it must not return. */
4027     if (SgAsmFunction::RET_UNKNOWN==f->get_may_return()) {
4028         retval->set_may_return(SgAsmFunction::RET_NEVER);
4029     } else {
4030  retval->set_may_return(f->get_may_return());
4031  }
4032 
4033  for (NodeMap::iterator ni=nodes.begin(); ni!=nodes.end(); ++ni) {
4034  retval->get_statementList().push_back(ni->second);
4035  ni->second->set_parent(retval);
4036  }
4037 
4038  unsigned reasons = f->reason;
4039  if (0==(reasons & SgAsmFunction::FUNC_DISCONT)) {
4040  FunctionRangeMap extent;
4041  function_extent(f, &extent);
4042  if (extent.nranges()>1)
4043  reasons |= SgAsmFunction::FUNC_DISCONT;
4044  }
4045  retval->set_reason(reasons);
4046  return retval;
4047 }
4048 
4049 /* Build a basic block node containing all instructions for the basic block. */
4050 SgAsmBlock *
4051 Partitioner::build_ast(BasicBlock *block)
4052 {
4053  SgAsmBlock *retval = new SgAsmBlock;
4054  retval->set_id(block->address());
4055  retval->set_address(block->address());
4056  retval->set_reason(block->reason);
4057  retval->set_code_likelihood(block->code_likelihood);
4058 
4059  for (InstructionVector::const_iterator ii=block->insns.begin(); ii!=block->insns.end(); ++ii) {
4060  Instruction *insn = *ii;
4061  retval->get_statementList().push_back(insn->node);
4062  insn->node->set_parent(retval);
4063  }
4064 
4065  /* Cache block successors so other layers don't have to constantly compute them. We fill in the successor
4066  * SgAsmIntegerValueExpression objects with only the address and not pointers to blocks since we don't have all the blocks
4067  * yet. The pointers will be initialized in the no-argument version build_ast() higher up on the stack. */
4068  bool complete;
4069  Disassembler::AddressSet successor_addrs = successors(block, &complete);
4070     for (Disassembler::AddressSet::iterator si=successor_addrs.begin(); si!=successor_addrs.end(); ++si) {
4071         SgAsmIntegerValueExpression *value = new SgAsmIntegerValueExpression(*si);
4072         value->set_parent(retval);
4073  retval->get_successors().push_back(value);
4074  }
4075  retval->set_successors_complete(complete);
4076  return retval;
4077 }
4078 
4079 /* Build a data block node for each data block. */
4080 SgAsmBlock *
4081 Partitioner::build_ast(DataBlock *block)
4082 {
4083  SgAsmBlock *retval = new SgAsmBlock;
4084  retval->set_id(block->address());
4085  retval->set_address(block->address());
4086  retval->set_reason(block->reason);
4087 
4088  for (std::vector<SgAsmStaticData*>::const_iterator ni=block->nodes.begin(); ni!=block->nodes.end(); ++ni) {
4089  retval->get_statementList().push_back(*ni);
4090  assert(NULL==(*ni)->get_parent());
4091  (*ni)->set_parent(retval);
4092  }
4093  return retval;
4094 }
4095 
4096 /* Top-level function to run the partitioner in passive mode. */
4097 SgAsmBlock *
4098 Partitioner::partition(SgAsmInterpretation *interp, const Disassembler::InstructionMap &insns, MemoryMap *map)
4099 {
4100  disassembler = NULL;
4101  add_instructions(insns);
4102 
4103  MemoryMap *old_map = get_map();
4104  MemoryMap old_ro_map = ro_map;
4105  if (!map && !old_map)
4106  throw Exception("no memory map");
4107  if (map)
4108  set_map(map);
4109 
4110  SgAsmBlock *retval = NULL;
4111  try {
4112  pre_cfg(interp);
4113  analyze_cfg(SgAsmBlock::BLK_GRAPH1);
4114  post_cfg(interp);
4115  retval = build_ast(interp);
4116  set_map(old_map, &old_ro_map);
4117  } catch (...) {
4118  set_map(old_map, &old_ro_map);
4119  throw;
4120  }
4121 
4122  return retval;
4123 }
4124 
4125 /* Top-level function to run the partitioner in active mode. */
4126 SgAsmBlock *
4127 Partitioner::partition(SgAsmInterpretation *interp, Disassembler *d, MemoryMap *m)
4128 {
4129  assert(d!=NULL);
4130  disassembler = d;
4131  assert(m!=NULL);
4132  set_map(m);
4133  pre_cfg(interp);
4134  analyze_cfg(SgAsmBlock::BLK_GRAPH1);
4135  post_cfg(interp);
4136  return build_ast(interp);
4137 }
4138 
4139 void
4140 Partitioner::add_instructions(const Disassembler::InstructionMap &insns)
4141 {
4142  for (Disassembler::InstructionMap::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
4143  Instruction *insn = new Instruction(ii->second);
4144  this->insns.insert(std::make_pair(ii->first, insn));
4145  }
4146 }
4147 
4148 Disassembler::InstructionMap
4149 Partitioner::get_instructions() const
4150 {
4151     Disassembler::InstructionMap retval;
4152     for (InstructionMap::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
4153  SgAsmInstruction *insn = ii->second->node;
4154  retval.insert(std::make_pair(ii->first, insn));
4155  }
4156  return retval;
4157 }
4158 
4159 // class method
4160 void
4161 Partitioner::disassembleInterpretation(SgAsmInterpretation *interp)
4162 {
4163  assert(interp!=NULL);
4164  if (interp->get_global_block())
4165  return;
4166 
4167  // Map segments into virtual memory if this hasn't been done yet
4168  MemoryMap *map = interp->get_map();
4169  if (map==NULL) {
4170  map = new MemoryMap;
4171  interp->set_map(map);
4172  BinaryLoader *loader = BinaryLoader::lookup(interp)->clone();
4173  loader->remap(interp);
4174  }
4175 
4176  // Obtain a disassembler based on the type of file we're disassembling and configure it according to the
4177  // -rose:disassembler_search switches stored in the enclosing SgFile node.
4178  Disassembler *disassembler = Disassembler::lookup(interp);
4179  if (!disassembler)
4180  throw std::runtime_error("no valid disassembler for this interpretation");
4181  disassembler = disassembler->clone(); // so we can change settings without affecting the registry
4182  SgFile *file = SageInterface::getEnclosingNode<SgFile>(interp);
4183  assert(file!=NULL);
4184  disassembler->set_search(file->get_disassemblerSearchHeuristics());
4185 
4186  // Obtain a partitioner to organize instructions into basic blocks and basic blocks into functions.
4187  Partitioner *partitioner = new Partitioner();
4188  partitioner->set_search(file->get_partitionerSearchHeuristics());
4189  partitioner->load_config(file->get_partitionerConfigurationFileName());
4190 
4191  // Decide what to disassemble. Include at least the entry addresses.
4192  Disassembler::AddressSet worklist;
4193  const SgAsmGenericHeaderPtrList &headers = interp->get_headers()->get_headers();
4194  for (SgAsmGenericHeaderPtrList::const_iterator hi=headers.begin(); hi!=headers.end(); ++hi) {
4195  SgRVAList entry_rvalist = (*hi)->get_entry_rvas();
4196  for (size_t i=0; i<entry_rvalist.size(); ++i) {
4197  rose_addr_t entry_va = (*hi)->get_base_va() + entry_rvalist[i].get_rva();
4198  worklist.insert(entry_va);
4199  }
4200  if (disassembler->get_search() & Disassembler::SEARCH_FUNCSYMS)
4201  disassembler->search_function_symbols(&worklist, map, *hi);
4202  }
4203 
4204  // Run the disassembler first to populate the instruction map for the partitioner. This will allow the partitioner to do
4205  // pattern recognition to find function boundaries if desired.
4206  Disassembler::BadMap errors;
4207  Disassembler::InstructionMap insns = disassembler->disassembleBuffer(map, worklist, NULL, &errors);
4208  partitioner->add_instructions(insns);
4209 
4210  // Organize the instructions into basic blocks and functions. This will call the disassembler to get any additional
4211  // instructions that are needed (the partitioner does deeper analysis than the disassembler).
4212  if (SgAsmBlock *block = partitioner->partition(interp, disassembler, map)) {
4213  interp->set_global_block(block);
4214  block->set_parent(interp);
4215  }
4216 
4217  delete partitioner;
4218  delete disassembler;
4219 }
4220 
4221 /* FIXME: Deprecated 2010-01-01 */
4222 Partitioner::BasicBlockStarts
4223 Partitioner::detectBasicBlocks(const Disassembler::InstructionMap &insns) const
4224 {
4225  BasicBlockStarts bb_starts;
4226 
4227  /* The first instruction always starts a basic block. */
4228  if (insns.size()>0) {
4229  rose_addr_t insn_va = insns.begin()->first;
4230  bb_starts[insn_va] = BasicBlockStarts::mapped_type();
4231  }
4232 
4233  for (Disassembler::InstructionMap::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
4234  SgAsmInstruction *insn = ii->second;
4235  rose_addr_t insn_va = insn->get_address();
4236  rose_addr_t next_va = insn->get_address() + insn->get_size();
4237 
4238  /* If this instruction is one which terminates a basic block then make the next instruction (if any) the beginning of
4239  * a basic block. However, a sequence like the following should not be a basic block boundary because the CALL is
4240  * acting more like a "PUSH EIP" (we should probably just look at the CALL instruction itself rather than also looking
4241  * for the following POP, but since ROSE doesn't currently apply the relocation tables before disassembling, the CALL
4242  * with a zero offset is quite common. [RPM 2009-08-24] */
4243  if (insn->terminates_basic_block()) {
4244  Disassembler::InstructionMap::const_iterator found = insns.find(next_va);
4245  if (found!=insns.end()) {
4246  SgAsmx86Instruction *insn_x86 = isSgAsmx86Instruction(insn);
4247  SgAsmx86Instruction *insn2_x86 = isSgAsmx86Instruction(found->second);
4248  rose_addr_t branch_target_va;
4249  if (insn_x86 &&
4250  (insn_x86->get_kind()==x86_call || insn_x86->get_kind()==x86_farcall) &&
4251  insn->get_branch_target(&branch_target_va) &&
4252                     branch_target_va==next_va && insn2_x86 && insn2_x86->get_kind()==x86_pop) {
4253  /* The CALL is acting more like a "PUSH EIP" and should not end the basic block. */
4254  } else if (bb_starts.find(next_va)==bb_starts.end()) {
4255  bb_starts[next_va] = BasicBlockStarts::mapped_type();
4256  }
4257  }
4258  }
4259 
4260  /* If this instruction has multiple known successors then make each of those successors the beginning of a basic
4261  * block (provided there's an instruction at that address). However, if there's only one successor and it's the
4262  * fall-through address then ignore it. */
4263  bool complete;
4264  Disassembler::AddressSet successors = insn->get_successors(&complete);
4265  for (Disassembler::AddressSet::const_iterator si=successors.begin(); si!=successors.end(); ++si) {
4266  rose_addr_t successor_va = *si;
4267  if ((successor_va != next_va || successors.size()>1) && insns.find(successor_va)!=insns.end())
4268  bb_starts[successor_va].insert(insn_va);
4269  }
4270  }
4271  return bb_starts;
4272 }
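/* Editorial sketch (not in the original source): the two boundary rules implemented above, reduced to
 * their essentials -- a block starts after any terminating instruction, and at any successor address
 * that is not a lone fall-through. Toy instruction type; standard C++ only. */
#include <map>
#include <set>
struct ToyBBInsn { unsigned long size; bool terminates; std::set<unsigned long> succs; };
static std::set<unsigned long> toy_block_starts(const std::map<unsigned long, ToyBBInsn> &insns)
{
    std::set<unsigned long> starts;
    if (!insns.empty())
        starts.insert(insns.begin()->first);                 /* the first instruction starts a block */
    for (std::map<unsigned long, ToyBBInsn>::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
        unsigned long next = ii->first + ii->second.size;
        if (ii->second.terminates && insns.find(next)!=insns.end())
            starts.insert(next);                             /* rule 1: a block ends at a terminator */
        for (std::set<unsigned long>::const_iterator si=ii->second.succs.begin();
             si!=ii->second.succs.end(); ++si) {
            if ((*si!=next || ii->second.succs.size()>1) && insns.find(*si)!=insns.end())
                starts.insert(*si);                          /* rule 2: branch targets start blocks */
        }
    }
    return starts;
}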
4273 
4274 /* FIXME: Deprecated 2010-01-01 */
4277  BasicBlockStarts &bb_starts/*out*/) const
4278 {
4279  FunctionStarts retval;
4280  for (Functions::const_iterator fi=functions.begin(); fi!=functions.end(); ++fi)
4281  retval.insert(std::make_pair(fi->first, FunctionStart(fi->second->reason, fi->second->name)));
4282  return retval;
4283 }