ROSE  0.9.6a
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
SgAsmX86Instruction.C
Go to the documentation of this file.
1 /* SgAsmx86Instruction member definitions. Do not move them to src/ROSETTA/Grammar/BinaryInstruction.code (or any *.code file)
2  * because then they won't get indexed/formatted/etc. by C-aware tools. */
3 
4 #include "sage3basic.h"
5 #include "SymbolicSemantics.h"
6 #include "SymbolicSemantics2.h"
7 #include "PartialSymbolicSemantics.h"
8 #include "DispatcherX86.h"
9 #include "YicesSolver.h"
10 #include "Disassembler.h"
11 
12 // see base class
13 bool
16  return true;
17  return x86InstructionIsControlTransfer(this);
18 }
19 
20 // see base class
21 bool
22 SgAsmx86Instruction::is_function_call(const std::vector<SgAsmInstruction*>& insns, rose_addr_t *target, rose_addr_t *return_va)
23 {
24  static const size_t EXECUTION_LIMIT = 25; // max size of basic blocks for expensive analyses
25  if (insns.empty())
26  return false;
27  SgAsmx86Instruction *last = isSgAsmx86Instruction(insns.back());
28  if (!last)
29  return false;
30 
31  // Quick method based only on the kind of instruction
32  if (x86_call==last->get_kind() || x86_farcall==last->get_kind()) {
33  last->get_branch_target(target);
34  if (return_va)
35  *return_va = last->get_address() + last->get_size();
36  return true;
37  }
38 
39  // The following stuff works only if we have a relatively complete AST.
40  SgAsmFunction *func = SageInterface::getEnclosingNode<SgAsmFunction>(last);
41  SgAsmInterpretation *interp = SageInterface::getEnclosingNode<SgAsmInterpretation>(func);
42 
43  // Slow method: Emulate the instructions and then look at the EIP and stack. If the EIP points outside the current
44  // function and the top of the stack holds an address of an instruction within the current function, then this must be a
45  // function call. FIXME: The implementation here assumes a 32-bit machine. [Robb P. Matzke 2013-09-06]
46  if (interp && insns.size()<=EXECUTION_LIMIT) {
47  using namespace BinaryAnalysis::InstructionSemantics2;
48  using namespace BinaryAnalysis::InstructionSemantics2::SymbolicSemantics;
49  const InstructionMap &imap = interp->get_instruction_map();
51  SMTSolver *solver = NULL; // using a solver would be more accurate, but slower
52  BaseSemantics::RiscOperatorsPtr ops = RiscOperators::instance(regdict, solver);
53  DispatcherX86Ptr dispatcher = DispatcherX86::instance(ops);
54  SValuePtr orig_esp = SValue::promote(ops->readRegister(dispatcher->REG_ESP));
55  try {
56  for (size_t i=0; i<insns.size(); ++i)
57  dispatcher->processInstruction(insns[i]);
58  } catch (const BaseSemantics::Exception &e) {
59  return false;
60  }
61 
62  // If the next instruction address is concrete but does not point to a function entry point, then this is not a call.
63  SValuePtr eip = SValue::promote(ops->readRegister(dispatcher->REG_EIP));
64  if (eip->is_number()) {
65  rose_addr_t target_va = eip->get_number();
66  SgAsmFunction *target_func = SageInterface::getEnclosingNode<SgAsmFunction>(imap.get_value_or(target_va, NULL));
67  if (!target_func || target_va!=target_func->get_entry_va())
68  return false;
69  }
70 
71  // If nothing was pushed onto the stack, then this isn't a function call.
72  SValuePtr esp = SValue::promote(ops->readRegister(dispatcher->REG_ESP));
73  SValuePtr stack_delta = SValue::promote(ops->add(esp, ops->negate(orig_esp)));
74  SValuePtr stack_delta_sign = SValue::promote(ops->extract(stack_delta, 31, 32));
75  if (stack_delta_sign->is_number() && 0==stack_delta_sign->get_number())
76  return false;
77 
78  // If the top of the stack does not contain a concrete value or the top of the stack does not point to an instruction
79  // in this basic block's function, then this is not a function call.
80  SValuePtr top = SValue::promote(ops->readMemory(dispatcher->REG_SS, esp, esp->boolean_(true), 32));
81  if (top->is_number()) {
82  rose_addr_t va = top->get_number();
83  SgAsmFunction *return_func = SageInterface::getEnclosingNode<SgAsmFunction>(imap.get_value_or(va, NULL));
84  if (!return_func || return_func!=func) {
85  return false;
86  }
87  } else {
88  return false;
89  }
90 
91  // Since EIP might point to a function entry address and since the top of the stack contains a pointer to an
92  // instruction in this function, we assume that this is a function call.
93  if (target && eip->is_number())
94  *target = eip->get_number();
95  if (return_va && top->is_number())
96  *return_va = top->get_number();
97  return true;
98  }
99 
100  // Similar to the above method, but works when all we have is the basic block (e.g., this case gets hit quite a bit from
101  // the Partitioner). Returns true if, after executing the basic block, the top of the stack contains the fall-through
102  // address of the basic block. We depend on our caller to figure out if EIP is reasonably a function entry address.
103  if (!interp && insns.size()<=EXECUTION_LIMIT) {
104  using namespace BinaryAnalysis::InstructionSemantics2;
105  using namespace BinaryAnalysis::InstructionSemantics2::SymbolicSemantics;
107  SMTSolver *solver = NULL; // using a solver would be more accurate, but slower
108  BaseSemantics::RiscOperatorsPtr ops = RiscOperators::instance(regdict, solver);
109  DispatcherX86Ptr dispatcher = DispatcherX86::instance(ops);
110  try {
111  for (size_t i=0; i<insns.size(); ++i)
112  dispatcher->processInstruction(insns[i]);
113  } catch (const BaseSemantics::Exception &e) {
114  return false;
115  }
116 
117  // Look at the top of the stack
118  SValuePtr top = SValue::promote(ops->readMemory(dispatcher->REG_SS, ops->readRegister(dispatcher->REG_ESP),
119  ops->get_protoval()->boolean_(true), 32));
120  if (top->is_number() && top->get_number() == last->get_address()+last->get_size()) {
121  if (target) {
122  SValuePtr eip = SValue::promote(ops->readRegister(dispatcher->REG_EIP));
123  if (eip->is_number())
124  *target = eip->get_number();
125  }
126  if (return_va)
127  *return_va = top->get_number();
128  return true;
129  }
130  }
131 
132 
133  return false;
134 }
135 
137 bool
138 SgAsmx86Instruction::is_function_return(const std::vector<SgAsmInstruction*> &insns) {
139  if (insns.empty())
140  return false;
141  SgAsmx86Instruction *last_insn = isSgAsmx86Instruction(insns.back());
142  if (!last_insn)
143  return false;
144  if (last_insn->get_kind()==x86_ret || last_insn->get_kind()==x86_retf)
145  return true;
146  return false;
147 }
148 
150 bool
152 {
153  return x86_unknown_instruction == get_kind();
154 }
155 
160  *complete = true; /*assume true and prove otherwise*/
161 
162  switch (get_kind()) {
163  case x86_call:
164  case x86_farcall:
165  case x86_jmp:
166  case x86_farjmp: {
167  /* Unconditional branch to operand-specified address. We cannot assume that a CALL instruction returns to the
168  * fall-through address. */
169  rose_addr_t va;
170  if (get_branch_target(&va)) {
171  retval.insert(va);
172  } else {
173  *complete = false;
174  }
175  break;
176  }
177 
178  case x86_ja:
179  case x86_jae:
180  case x86_jb:
181  case x86_jbe:
182  case x86_jcxz:
183  case x86_jecxz:
184  case x86_jrcxz:
185  case x86_je:
186  case x86_jg:
187  case x86_jge:
188  case x86_jl:
189  case x86_jle:
190  case x86_jne:
191  case x86_jno:
192  case x86_jns:
193  case x86_jo:
194  case x86_jpe:
195  case x86_jpo:
196  case x86_js:
197  case x86_loop:
198  case x86_loopnz:
199  case x86_loopz: {
200  /* Conditional branches to operand-specified address */
201  rose_addr_t va;
202  if (get_branch_target(&va)) {
203  retval.insert(va);
204  } else {
205  *complete = false;
206  }
207  retval.insert(get_address() + get_size());
208  break;
209  }
210 
211  case x86_ret:
212  case x86_iret:
213  case x86_int1:
214  case x86_int3:
215  case x86_into:
216  case x86_rsm:
217  case x86_ud2:
218  case x86_retf: {
219  /* Unconditional branch to run-time specified address */
220  *complete = false;
221  break;
222  }
223 
224  case x86_hlt: {
225  /* Instructions having no successor. */
226  break;
227  }
228 
230  /* Instructions having unknown successors */
231  *complete = false;
232  }
233 
234  default: {
235  /* Instructions that always fall through to the next instruction */
236  retval.insert(get_address() + get_size());
237  break;
238  }
239  }
240  return retval;
241 }
242 
243 bool
245  // Treats far destinations as "unknown"
246  switch (get_kind()) {
247  case x86_call:
248  case x86_farcall:
249  case x86_jmp:
250  case x86_ja:
251  case x86_jae:
252  case x86_jb:
253  case x86_jbe:
254  case x86_jcxz:
255  case x86_jecxz:
256  case x86_jrcxz:
257  case x86_je:
258  case x86_jg:
259  case x86_jge:
260  case x86_jl:
261  case x86_jle:
262  case x86_jne:
263  case x86_jno:
264  case x86_jns:
265  case x86_jo:
266  case x86_jpe:
267  case x86_jpo:
268  case x86_js:
269  case x86_loop:
270  case x86_loopnz:
271  case x86_loopz: {
273  if (args.size()!=1)
274  return false;
276  if (!ival)
277  return false;
278  if (target)
279  *target = ival->get_absolute_value();
280  return true;
281  }
282  default:
283  return false; // do not modify *target
284  }
285 }
286 
289 SgAsmx86Instruction::get_successors(const std::vector<SgAsmInstruction*>& insns, bool *complete, MemoryMap *initial_memory)
290 {
291  using namespace BinaryAnalysis::InstructionSemantics;
292  static const bool debug = false;
293 
294  if (debug) {
295  std::cerr <<"SgAsmx86Instruction::get_successors(" <<StringUtility::addrToString(insns.front()->get_address())
296  <<" for " <<insns.size() <<" instruction" <<(1==insns.size()?"":"s") <<"):" <<std::endl;
297  }
298 
299  Disassembler::AddressSet successors = SgAsmInstruction::get_successors(insns, complete);
300 
301  /* If we couldn't determine all the successors, or a cursory analysis couldn't narrow it down to a single successor then
302  * we'll do a more thorough analysis now. In the case where the cursory analysis returned a complete set containing two
303  * successors, a thorough analysis might be able to narrow it down to a single successor. We should not make special
304  * assumptions about CALL and FARCALL instructions -- their only successor is the specified address operand. */
305  if (!*complete || successors.size()>1) {
306 
307 #if 0
308  /* Use the most robust semantic analysis available. Warning: this can be very slow, especially when an SMT solver is
309  * involved! */
310 # if defined(ROSE_YICES) || defined(ROSE_HAVE_LIBYICES)
311  YicesSolver yices;
312  if (yices.available_linkage() & YicesSolver::LM_LIBRARY) {
313  yices.set_linkage(YicesSolver::LM_LIBRARY);
314  } else {
315  yices.set_linkage(YicesSolver::LM_EXECUTABLE);
316  }
317  SMTSolver *solver = &yices;
318 # else
319  SMTSolver *solver = NULL;
320 # endif
321  if (debug && solver)
322  solver->set_debug(stderr);
323  typedef SymbolicSemantics::Policy<> Policy;
324  typedef SymbolicSemantics::ValueType<32> RegisterType;
325  typedef X86InstructionSemantics<Policy, SymbolicSemantics::ValueType> Semantics;
326  Policy policy(solver);
327 #else
328  typedef PartialSymbolicSemantics::Policy<> Policy;
329  typedef PartialSymbolicSemantics::ValueType<32> RegisterType;
330  typedef X86InstructionSemantics<Policy, PartialSymbolicSemantics::ValueType> Semantics;
331  Policy policy;
332  policy.set_map(initial_memory);
333 #endif
334  try {
335  Semantics semantics(policy);
336  for (size_t i=0; i<insns.size(); i++) {
337  SgAsmx86Instruction* insn = isSgAsmx86Instruction(insns[i]);
338  semantics.processInstruction(insn);
339  if (debug) {
340  std::cerr << " state after " <<unparseInstructionWithAddress(insn) <<std::endl
341  <<policy.get_state();
342  }
343  }
344  const RegisterType &newip = policy.get_ip();
345  if (newip.is_known()) {
346  successors.clear();
347  successors.insert(newip.known_value());
348  *complete = true; /*this is the complete set of successors*/
349  }
350  } catch(const Semantics::Exception& e) {
351  /* Abandon entire basic block if we hit an instruction that's not implemented. */
352  if (debug)
353  std::cerr <<e <<"\n";
354  } catch(const Policy::Exception& e) {
355  /* Abandon entire basic block if the semantics policy cannot handle the instruction. */
356  if (debug)
357  std::cerr <<e <<"\n";
358  }
359  }
360 
361  if (debug) {
362  std::cerr <<" successors:";
363  for (Disassembler::AddressSet::const_iterator si=successors.begin(); si!=successors.end(); ++si)
364  std::cerr <<" " <<StringUtility::addrToString(*si);
365  if (!*complete) std::cerr <<"...";
366  std::cerr <<std::endl;
367  }
368 
369  return successors;
370 }
371 
502 bool
504 {
505  std::vector<SgAsmInstruction*> sequence;
506  sequence.push_back(this);
507  return has_effect(sequence, false);
508 }
509 
531 bool
532 SgAsmx86Instruction::has_effect(const std::vector<SgAsmInstruction*>& insns, bool allow_branch/*false*/,
533  bool relax_stack_semantics/*false*/)
534 {
535  using namespace BinaryAnalysis::InstructionSemantics;
536 
537  if (insns.empty()) return false;
538 
539  typedef PartialSymbolicSemantics::Policy<> Policy;
540  typedef X86InstructionSemantics<Policy, PartialSymbolicSemantics::ValueType> Semantics;
541  Policy policy;
542  Semantics semantics(policy);
543  if (relax_stack_semantics) policy.set_discard_popped_memory(true);
544  try {
545  for (std::vector<SgAsmInstruction*>::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
547  if (!insn) return true;
548  semantics.processInstruction(insn);
549  if (!policy.get_ip().is_known()) return true;
550  }
551  } catch (const Semantics::Exception&) {
552  return true;
553  } catch (const Policy::Exception&) {
554  return true;
555  }
556 
557  /* If the final instruction pointer is not the fall-through address of the final instruction then return true. In other
558  * words, a sequence ending with a JMP (for instance) has an effect, but an internal JMP has no effect. This is to
559  * support instruction sequences from non-contiguous basic blocks. */
560  ROSE_ASSERT(policy.get_ip().is_known());
561  if (!allow_branch && policy.get_ip().known_value()!=insns.back()->get_address() + insns.back()->get_size())
562  return true;
563 
564  /* Instructions have an effect if the state changed. We want the comparison to be independent of the instruction pointer,
565  * so we'll set the IP of both the initial and final states to the same (unknown) value. */
566  policy.get_orig_state().registers.ip = policy.get_state().registers.ip = PartialSymbolicSemantics::ValueType<32>();
567  return !policy.equal_states(policy.get_orig_state(), policy.get_state());
568 }
569 
581 std::vector< std::pair< size_t, size_t > >
582 SgAsmx86Instruction::find_noop_subsequences(const std::vector<SgAsmInstruction*>& insns, bool allow_branch/*false*/,
583  bool relax_stack_semantics/*false*/)
584 {
585  using namespace BinaryAnalysis::InstructionSemantics;
586 
587  static const bool verbose = false;
588 
589  if (verbose) std::cerr <<"find_noop_subsequences:\n";
590  std::vector< std::pair <size_t/*starting insn index*/, size_t/*num. insns*/> > retval;
591 
592  typedef PartialSymbolicSemantics::Policy<> Policy;
593  typedef X86InstructionSemantics<Policy, PartialSymbolicSemantics::ValueType> Semantics;
594  Policy policy;
595  if (relax_stack_semantics) policy.set_discard_popped_memory(true);
596  Semantics semantics(policy);
597 
598  /* When comparing states, we don't want to compare the instruction pointers. Therefore, we'll change the IP value of
599  * each state to be the same. */
600  const PartialSymbolicSemantics::ValueType<32> common_ip;
601 
602  /* Save the state before and after each instruction. states[i] is the state before insn[i] and states[i+1] is the state
603  * after insn[i]. */
604  std::vector<PartialSymbolicSemantics::State<PartialSymbolicSemantics::ValueType> > state;
605  state.push_back(policy.get_state());
606  state.back().registers.ip = common_ip;
607  try {
608  for (std::vector<SgAsmInstruction*>::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
610  if (verbose)
611  std::cerr <<" insn #" <<(state.size()-1)
612  <<" " <<(insn ? unparseInstructionWithAddress(insn) : "<none>") <<"\n";
613  if (!insn) return retval;
614  semantics.processInstruction(insn);
615  state.push_back(policy.get_state());
616  if (verbose) std::cerr <<" state:\n" <<policy.get_state();
617  }
618  } catch (const Semantics::Exception&) {
619  /* Perhaps we can find at least a few no-op subsequences... */
620  } catch (const Policy::Exception&) {
621  /* Perhaps we can find at least a few no-op subsequences... */
622  }
623 
624  /* If the last instruction resulted in indeterminant instruction pointer then discard it from the list of states because
625  * it has an effect (it's probably a conditional jump). It's up to the caller whether a final instruction that
626  * unconditionally branches has an effect. */
627  if (!policy.get_ip().is_known()) {
628  state.pop_back();
629  } else if (!allow_branch &&
630  policy.get_ip().known_value()!=insns.back()->get_address() + insns.back()->get_size()) {
631  state.pop_back();
632  }
633 
634  /* Change the IP register so its the same for all states so it doesn't contribute to state differences. */
635  const size_t nstates = state.size();
636  for (size_t i=0; i<nstates; i++)
637  state[i].registers.ip = common_ip;
638 
639  /* Find pairs of equivalent states. */
640  if (verbose) std::cerr <<" number of states: " <<nstates <<"\n";
641  for (size_t i=0; i<nstates-1; i++) {
642  for (size_t j=i+1; j<nstates; j++) {
643  if (policy.equal_states(state[i], state[j])) {
644  if (verbose) std::cerr <<" at instruction #"<<i <<": no-op of length " <<(j-i) <<"\n";
645  retval.push_back(std::make_pair(i, j-i));
646  }
647  }
648  }
649 
650  return retval;
651 }