// pat/ROSE_HTML_Reference/SgAsmX86Instruction_8C_source.html

/* SgAsmx86Instruction member definitions.  Do not move them to src/ROSETTA/Grammar/BinaryInstruction.code (or any *.code file)

 * because then they won't get indexed/formatted/etc. by C-aware tools. */


#include "sage3basic.h"

#include "SymbolicSemantics.h"

#include "SymbolicSemantics2.h"

#include "PartialSymbolicSemantics.h"

#include "DispatcherX86.h"

#include "YicesSolver.h"

#include "Disassembler.h"


// See the base class for the contract.  An instruction whose kind is x86_unknown_instruction always terminates its basic
// block; any other instruction terminates the block exactly when x86InstructionIsControlTransfer() says so.
bool
SgAsmx86Instruction::terminates_basic_block() {
    const bool kind_is_unknown = (x86_unknown_instruction == get_kind());
    return kind_is_unknown || x86InstructionIsControlTransfer(this);
}


// See the base class for the contract.
//
// Decides whether the instruction sequence `insns` ends with a function call.  Up to three strategies are tried:
//   1. Quick: the last instruction is a CALL or FARCALL.
//   2. Slow, with AST: when the last instruction has an enclosing SgAsmInterpretation, the instructions are emulated
//      symbolically and both EIP and the top of the stack are examined.
//   3. Slow, without AST: the instructions are emulated using the Pentium 4 register dictionary and only the top of the
//      stack is examined.
// Strategies 2 and 3 are attempted only when `insns` holds at most EXECUTION_LIMIT instructions.
//
//   insns     - instructions to examine; false is returned if it is empty or its last element is not an x86 instruction.
//   target    - if non-null, receives the call target when that target is available as a concrete value; otherwise it is
//               left unmodified.
//   return_va - if non-null, receives the return address whenever this function returns true.
bool
SgAsmx86Instruction::is_function_call(const std::vector<SgAsmInstruction*>& insns, rose_addr_t *target, rose_addr_t *return_va)
{
    static const size_t EXECUTION_LIMIT = 25; // max size of basic blocks for expensive analyses

    if (insns.empty())
        return false;
    SgAsmx86Instruction *last = isSgAsmx86Instruction(insns.back());
    if (NULL == last)
        return false;

    // Quick method based only on the kind of instruction
    switch (last->get_kind()) {
        case x86_call:
        case x86_farcall:
            last->get_branch_target(target);            // writes *target only when the operand is a known address
            if (return_va)
                *return_va = last->get_address() + last->get_size();
            return true;
        default:
            break;
    }

    // The following stuff works only if we have a relatively complete AST.
    SgAsmFunction *func = SageInterface::getEnclosingNode<SgAsmFunction>(last);
    SgAsmInterpretation *interp = SageInterface::getEnclosingNode<SgAsmInterpretation>(func);

    // Both emulation-based methods below are considered too expensive for large blocks.
    if (insns.size() > EXECUTION_LIMIT)
        return false;

    if (interp) {
        // Slow method: Emulate the instructions and then look at the EIP and stack.  If the EIP points outside the
        // current function and the top of the stack holds an address of an instruction within the current function,
        // then this must be a function call.  FIXME: The implementation here assumes a 32-bit machine.
        // [Robb P. Matzke 2013-09-06]
        using namespace BinaryAnalysis::InstructionSemantics2;
        using namespace BinaryAnalysis::InstructionSemantics2::SymbolicSemantics;
        const InstructionMap &insn_by_va = interp->get_instruction_map();
        const RegisterDictionary *registers = RegisterDictionary::dictionary_for_isa(interp);
        SMTSolver *solver = NULL; // using a solver would be more accurate, but slower
        BaseSemantics::RiscOperatorsPtr operators = RiscOperators::instance(registers, solver);
        DispatcherX86Ptr cpu = DispatcherX86::instance(operators);
        SValuePtr esp_before = SValue::promote(operators->readRegister(cpu->REG_ESP));
        try {
            for (std::vector<SgAsmInstruction*>::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii)
                cpu->processInstruction(*ii);
        } catch (const BaseSemantics::Exception&) {
            return false;
        }

        // A concrete next-instruction address that is not a function entry point rules out a call.
        SValuePtr eip = SValue::promote(operators->readRegister(cpu->REG_EIP));
        if (eip->is_number()) {
            rose_addr_t callee_va = eip->get_number();
            SgAsmFunction *callee = SageInterface::getEnclosingNode<SgAsmFunction>(insn_by_va.get_value_or(callee_va, NULL));
            if (NULL == callee || callee->get_entry_va() != callee_va)
                return false;
        }

        // If nothing was pushed onto the stack, then this isn't a function call.  Bit 31 of (esp - esp_before) is the
        // sign of the 32-bit stack delta; a known zero means the stack pointer did not decrease.
        SValuePtr esp_after = SValue::promote(operators->readRegister(cpu->REG_ESP));
        SValuePtr delta = SValue::promote(operators->add(esp_after, operators->negate(esp_before)));
        SValuePtr delta_sign = SValue::promote(operators->extract(delta, 31, 32));
        if (delta_sign->is_number() && 0 == delta_sign->get_number())
            return false;

        // The top of the stack must hold a concrete value that is the address of an instruction belonging to this
        // basic block's function; otherwise this is not a function call.
        SValuePtr top = SValue::promote(operators->readMemory(cpu->REG_SS, esp_after, esp_after->boolean_(true), 32));
        if (!top->is_number())
            return false;
        rose_addr_t pushed_va = top->get_number();
        SgAsmFunction *return_func = SageInterface::getEnclosingNode<SgAsmFunction>(insn_by_va.get_value_or(pushed_va, NULL));
        if (NULL == return_func || return_func != func)
            return false;

        // Since EIP might point to a function entry address and since the top of the stack contains a pointer to an
        // instruction in this function, we assume that this is a function call.
        if (target && eip->is_number())
            *target = eip->get_number();
        if (return_va)
            *return_va = top->get_number();             // `top` is known to be concrete at this point
        return true;
    } else {
        // Similar to the above method, but works when all we have is the basic block (e.g., this case gets hit quite a
        // bit from the Partitioner).  Returns true if, after executing the basic block, the top of the stack contains
        // the fall-through address of the basic block. We depend on our caller to figure out if EIP is reasonably a
        // function entry address.
        using namespace BinaryAnalysis::InstructionSemantics2;
        using namespace BinaryAnalysis::InstructionSemantics2::SymbolicSemantics;
        const RegisterDictionary *registers = RegisterDictionary::dictionary_pentium4();
        SMTSolver *solver = NULL; // using a solver would be more accurate, but slower
        BaseSemantics::RiscOperatorsPtr operators = RiscOperators::instance(registers, solver);
        DispatcherX86Ptr cpu = DispatcherX86::instance(operators);
        try {
            for (std::vector<SgAsmInstruction*>::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii)
                cpu->processInstruction(*ii);
        } catch (const BaseSemantics::Exception&) {
            return false;
        }

        // Look at the top of the stack
        SValuePtr top = SValue::promote(operators->readMemory(cpu->REG_SS, operators->readRegister(cpu->REG_ESP),
                                                              operators->get_protoval()->boolean_(true), 32));
        if (top->is_number() && top->get_number() == last->get_address()+last->get_size()) {
            if (target) {
                SValuePtr eip = SValue::promote(operators->readRegister(cpu->REG_EIP));
                if (eip->is_number())
                    *target = eip->get_number();
            }
            if (return_va)
                *return_va = top->get_number();
            return true;
        }
    }

    return false;
}


// Returns true when the last instruction of `insns` is an x86 RET or RETF.  An empty sequence, or one whose last element
// is not an SgAsmx86Instruction, is never reported as a function return.
bool
SgAsmx86Instruction::is_function_return(const std::vector<SgAsmInstruction*> &insns) {
    if (insns.empty())
        return false;

    SgAsmx86Instruction *tail = isSgAsmx86Instruction(insns.back());
    if (NULL == tail)
        return false;

    switch (tail->get_kind()) {
        case x86_ret:
        case x86_retf:
            return true;
        default:
            return false;
    }
}


// Reports whether this instruction's kind is x86_unknown_instruction.
bool
SgAsmx86Instruction::is_unknown() const
{
    const bool kind_is_unknown = (get_kind() == x86_unknown_instruction);
    return kind_is_unknown;
}


// Computes the addresses that may be executed immediately after this single instruction, based only on its kind and
// operands.
//
//   complete - must be non-null.  Set to true when the returned set is known to contain every successor and to false
//              when at least one successor could not be determined.
//
// Returns the (possibly empty) set of successor addresses that could be determined.
Disassembler::AddressSet
SgAsmx86Instruction::get_successors(bool *complete) {
    Disassembler::AddressSet successors;
    *complete = true; /*assume true and prove otherwise*/

    switch (get_kind()) {
        /* Unconditional branch to operand-specified address. We cannot assume that a CALL instruction returns to the
         * fall-through address. */
        case x86_call:
        case x86_farcall:
        case x86_jmp:
        case x86_farjmp: {
            rose_addr_t dest;
            if (!get_branch_target(&dest)) {
                *complete = false;
            } else {
                successors.insert(dest);
            }
            break;
        }

        /* Conditional branches to operand-specified address: the branch target (when known) plus the fall-through. */
        case x86_ja:     case x86_jae:    case x86_jb:     case x86_jbe:
        case x86_jcxz:   case x86_jecxz:  case x86_jrcxz:
        case x86_je:     case x86_jg:     case x86_jge:    case x86_jl:     case x86_jle:
        case x86_jne:    case x86_jno:    case x86_jns:    case x86_jo:
        case x86_jpe:    case x86_jpo:    case x86_js:
        case x86_loop:   case x86_loopnz: case x86_loopz: {
            rose_addr_t dest;
            if (!get_branch_target(&dest)) {
                *complete = false;
            } else {
                successors.insert(dest);
            }
            successors.insert(get_address() + get_size());
            break;
        }

        /* Unconditional branch to run-time specified address */
        case x86_ret:
        case x86_iret:
        case x86_int1:
        case x86_int3:
        case x86_into:
        case x86_rsm:
        case x86_ud2:
        case x86_retf:
            *complete = false;
            break;

        /* Instructions having no successor. */
        case x86_hlt:
            break;

        /* Instructions having unknown successors.  The fall-through address is still reported, but the set is marked
         * incomplete.  NOTE(review): confirm that reporting the fall-through address for an unknown instruction is
         * intended rather than the result of a missing `break`. */
        case x86_unknown_instruction:
            *complete = false;
            successors.insert(get_address() + get_size());
            break;

        /* Instructions that always fall through to the next instruction */
        default:
            successors.insert(get_address() + get_size());
            break;
    }
    return successors;
}


// Obtains the destination address of a call, jump, conditional jump, or loop instruction when that destination is given
// directly by the instruction's single integer-valued operand.
//
//   target - if non-null, receives the destination address on success; it is never written when false is returned.
//
// Returns true if the destination is known, false otherwise (wrong kind of instruction, operand count other than one,
// or an operand that is not an SgAsmIntegerValueExpression).
bool
SgAsmx86Instruction::get_branch_target(rose_addr_t *target) {
    // Treats far destinations as "unknown".  x86_farjmp is not listed and therefore always yields false.
    // NOTE(review): x86_farcall *is* listed, so it is only "unknown" when its operands fail the checks below -- confirm.
    switch (get_kind()) {
        case x86_call:   case x86_farcall: case x86_jmp:
        case x86_ja:     case x86_jae:     case x86_jb:     case x86_jbe:
        case x86_jcxz:   case x86_jecxz:   case x86_jrcxz:
        case x86_je:     case x86_jg:      case x86_jge:    case x86_jl:     case x86_jle:
        case x86_jne:    case x86_jno:     case x86_jns:    case x86_jo:
        case x86_jpe:    case x86_jpo:     case x86_js:
        case x86_loop:   case x86_loopnz:  case x86_loopz:
            break;                                      // branch-like: inspect the operand below
        default:
            return false; // do not modify *target
    }

    const SgAsmExpressionPtrList &operands = get_operandList()->get_operands();
    if (1 != operands.size())
        return false;

    SgAsmIntegerValueExpression *immediate = isSgAsmIntegerValueExpression(operands[0]);
    if (NULL == immediate)
        return false;

    if (target)
        *target = immediate->get_absolute_value();
    return true;
}


// Computes the successors of a whole basic block.
//
// The base-class implementation supplies a cursory answer first.  If that answer is incomplete or has more than one
// successor, the block is emulated with the partial-symbolic semantics; when the resulting instruction pointer is a
// known value it replaces the cursory answer and the result is marked complete.
//
//   insns          - instructions of the basic block, in execution order.
//   complete       - must be non-null; set to true when the returned set is the complete set of successors.
//   initial_memory - memory map handed to the semantic policy via set_map().
//
// If any element of `insns` is not an SgAsmx86Instruction the emulation is abandoned and the cursory successors are
// returned unchanged (the same way an unimplemented instruction is handled).  The sibling functions has_effect() and
// find_noop_subsequences() perform the equivalent null check before calling processInstruction().
Disassembler::AddressSet
SgAsmx86Instruction::get_successors(const std::vector<SgAsmInstruction*>& insns, bool *complete, MemoryMap *initial_memory)
{
    using namespace BinaryAnalysis::InstructionSemantics;
    static const bool debug = false;

    if (debug) {
        std::cerr <<"SgAsmx86Instruction::get_successors(" <<StringUtility::addrToString(insns.front()->get_address())
                  <<" for " <<insns.size() <<" instruction" <<(1==insns.size()?"":"s") <<"):" <<std::endl;
    }

    Disassembler::AddressSet successors = SgAsmInstruction::get_successors(insns, complete);

    /* If we couldn't determine all the successors, or a cursory analysis couldn't narrow it down to a single successor then
     * we'll do a more thorough analysis now. In the case where the cursory analysis returned a complete set containing two
     * successors, a thorough analysis might be able to narrow it down to a single successor. We should not make special
     * assumptions about CALL and FARCALL instructions -- their only successor is the specified address operand. */
    if (!*complete || successors.size()>1) {

#if 0
        /* Use the most robust semantic analysis available.  Warning: this can be very slow, especially when an SMT solver is
         * involved! */
# if defined(ROSE_YICES) || defined(ROSE_HAVE_LIBYICES)
        YicesSolver yices;
        if (yices.available_linkage() & YicesSolver::LM_LIBRARY) {
            yices.set_linkage(YicesSolver::LM_LIBRARY);
        } else {
            yices.set_linkage(YicesSolver::LM_EXECUTABLE);
        }
        SMTSolver *solver = &yices;
# else
        SMTSolver *solver = NULL;
# endif
        if (debug && solver)
            solver->set_debug(stderr);
        typedef SymbolicSemantics::Policy<> Policy;
        typedef SymbolicSemantics::ValueType<32> RegisterType;
        typedef X86InstructionSemantics<Policy, SymbolicSemantics::ValueType> Semantics;
        Policy policy(solver);
#else
        typedef PartialSymbolicSemantics::Policy<> Policy;
        typedef PartialSymbolicSemantics::ValueType<32> RegisterType;
        typedef X86InstructionSemantics<Policy, PartialSymbolicSemantics::ValueType> Semantics;
        Policy policy;
        policy.set_map(initial_memory);
#endif
        try {
            Semantics semantics(policy);
            bool emulated_all = true;                   // cleared if a non-x86 instruction stops the emulation early
            for (size_t i=0; i<insns.size(); i++) {
                SgAsmx86Instruction* insn = isSgAsmx86Instruction(insns[i]);
                if (!insn) {
                    /* Not an x86 instruction: it cannot be emulated here and must not be passed on as a null pointer.
                     * The machine state is incomplete, so keep the cursory result. */
                    emulated_all = false;
                    break;
                }
                semantics.processInstruction(insn);
                if (debug) {
                    std::cerr << "  state after " <<unparseInstructionWithAddress(insn) <<std::endl
                              <<policy.get_state();
                }
            }
            if (emulated_all) {
                const RegisterType &newip = policy.get_ip();
                if (newip.is_known()) {
                    successors.clear();
                    successors.insert(newip.known_value());
                    *complete = true; /*this is the complete set of successors*/
                }
            }
        } catch(const Semantics::Exception& e) {
            /* Abandon entire basic block if we hit an instruction that's not implemented. */
            if (debug)
                std::cerr <<e <<"\n";
        } catch(const Policy::Exception& e) {
            /* Abandon entire basic block if the semantics policy cannot handle the instruction. */
            if (debug)
                std::cerr <<e <<"\n";
        }
    }

    if (debug) {
        std::cerr <<"  successors:";
        for (Disassembler::AddressSet::const_iterator si=successors.begin(); si!=successors.end(); ++si)
            std::cerr <<" " <<StringUtility::addrToString(*si);
        if (!*complete) std::cerr <<"...";
        std::cerr <<std::endl;
    }

    return successors;
}


bool

SgAsmx86Instruction::has_effect()

{

    std::vector<SgAsmInstruction*> sequence;

    sequence.push_back(this);

    return has_effect(sequence, false);

}


// Determines whether executing `insns` changes the machine state, using the partial-symbolic semantics.
//
//   insns                 - instruction sequence to emulate; an empty sequence has no effect.
//   allow_branch          - when false, a final instruction pointer other than the fall-through address of the last
//                           instruction counts as an effect.
//   relax_stack_semantics - when true, the policy is told to discard popped memory.
//
// Returns true (conservatively) when a non-x86 instruction is encountered, when the instruction pointer becomes unknown
// after any instruction, or when the semantics or policy throws.  Otherwise returns whether the final state differs
// from the original state, ignoring the instruction pointer.
bool
SgAsmx86Instruction::has_effect(const std::vector<SgAsmInstruction*>& insns, bool allow_branch/*false*/,
                                bool relax_stack_semantics/*false*/)
{
    using namespace BinaryAnalysis::InstructionSemantics;

    if (insns.empty())
        return false;

    typedef PartialSymbolicSemantics::Policy<> Policy;
    typedef X86InstructionSemantics<Policy, PartialSymbolicSemantics::ValueType> Semantics;
    Policy policy;
    Semantics semantics(policy);
    if (relax_stack_semantics)
        policy.set_discard_popped_memory(true);

    try {
        for (size_t idx=0; idx<insns.size(); ++idx) {
            SgAsmx86Instruction *x86insn = isSgAsmx86Instruction(insns[idx]);
            if (NULL == x86insn)
                return true;
            semantics.processInstruction(x86insn);
            if (!policy.get_ip().is_known())
                return true;
        }
    } catch (const Semantics::Exception&) {
        return true;
    } catch (const Policy::Exception&) {
        return true;
    }

    /* A sequence ending with a JMP (for instance) has an effect, but an internal JMP has no effect: only the final
     * instruction pointer is compared against the fall-through address of the final instruction.  This is to support
     * instruction sequences from non-contiguous basic blocks. */
    ROSE_ASSERT(policy.get_ip().is_known());
    if (!allow_branch) {
        SgAsmInstruction *final_insn = insns.back();
        if (policy.get_ip().known_value() != final_insn->get_address() + final_insn->get_size())
            return true;
    }

    /* Instructions have an effect if the state changed.  The comparison must be independent of the instruction pointer,
     * so the IP of both the initial and final states is overwritten with the same (unknown) value first. */
    const PartialSymbolicSemantics::ValueType<32> shared_ip;
    policy.get_state().registers.ip = shared_ip;
    policy.get_orig_state().registers.ip = shared_ip;
    const bool unchanged = policy.equal_states(policy.get_orig_state(), policy.get_state());
    return !unchanged;
}


// Finds subsequences of `insns` that leave the machine state unchanged (ignoring the instruction pointer).
//
// The instructions are emulated with the partial-symbolic semantics and the state before/after each instruction is
// saved.  Every pair of equal saved states (i, j) with i < j is reported as a no-op subsequence.
//
//   insns                 - instruction sequence to examine.
//   allow_branch          - when false, the state after the last instruction is discarded if the final instruction
//                           pointer is not the fall-through address of the last instruction.
//   relax_stack_semantics - when true, the policy is told to discard popped memory.
//
// Returns pairs of (index of first instruction, number of instructions).  The result is empty when `insns` is empty or
// when a non-x86 instruction is encountered.
std::vector< std::pair< size_t, size_t > >
SgAsmx86Instruction::find_noop_subsequences(const std::vector<SgAsmInstruction*>& insns, bool allow_branch/*false*/,
                                            bool relax_stack_semantics/*false*/)
{
    using namespace BinaryAnalysis::InstructionSemantics;

    static const bool verbose = false;

    if (verbose) std::cerr <<"find_noop_subsequences:\n";
    std::vector< std::pair <size_t/*starting insn index*/, size_t/*num. insns*/> > retval;

    /* Nothing to analyze.  Returning here also keeps insns.back() below from being evaluated on an empty vector. */
    if (insns.empty())
        return retval;

    typedef PartialSymbolicSemantics::Policy<> Policy;
    typedef X86InstructionSemantics<Policy, PartialSymbolicSemantics::ValueType> Semantics;
    Policy policy;
    if (relax_stack_semantics) policy.set_discard_popped_memory(true);
    Semantics semantics(policy);

    /* When comparing states, we don't want to compare the instruction pointers. Therefore, we'll change the IP value of
     * each state to be the same. */
    const PartialSymbolicSemantics::ValueType<32> common_ip;

    /* Save the state before and after each instruction.  state[i] is the state before insns[i] and state[i+1] is the
     * state after insns[i]. */
    std::vector<PartialSymbolicSemantics::State<PartialSymbolicSemantics::ValueType> > state;
    state.push_back(policy.get_state());
    state.back().registers.ip = common_ip;
    try {
        for (std::vector<SgAsmInstruction*>::const_iterator ii=insns.begin(); ii!=insns.end(); ++ii) {
            SgAsmx86Instruction *insn = isSgAsmx86Instruction(*ii);
            if (verbose)
                std::cerr <<"  insn #" <<(state.size()-1)
                          <<" " <<(insn ? unparseInstructionWithAddress(insn) : "<none>") <<"\n";
            if (!insn) return retval;
            semantics.processInstruction(insn);
            state.push_back(policy.get_state());
            if (verbose) std::cerr <<"  state:\n" <<policy.get_state();
        }
    } catch (const Semantics::Exception&) {
        /* Perhaps we can find at least a few no-op subsequences... */
    } catch (const Policy::Exception&) {
        /* Perhaps we can find at least a few no-op subsequences... */
    }

    /* If the last instruction resulted in an indeterminate instruction pointer then discard it from the list of states
     * because it has an effect (it's probably a conditional jump).  It's up to the caller whether a final instruction
     * that unconditionally branches has an effect. */
    if (!policy.get_ip().is_known()) {
        state.pop_back();
    } else if (!allow_branch &&
               policy.get_ip().known_value()!=insns.back()->get_address() + insns.back()->get_size()) {
        state.pop_back();
    }

    /* Change the IP register so it's the same for all states so it doesn't contribute to state differences. */
    const size_t nstates = state.size();
    for (size_t i=0; i<nstates; i++)
        state[i].registers.ip = common_ip;

    /* Find pairs of equivalent states.  The bound is written as i+1<nstates rather than i<nstates-1 because nstates can
     * be zero here (e.g., the first instruction threw and the only saved state was popped above); the unsigned
     * subtraction would wrap around and the loop would run almost forever. */
    if (verbose) std::cerr <<"  number of states: " <<nstates <<"\n";
    for (size_t i=0; i+1<nstates; i++) {
        for (size_t j=i+1; j<nstates; j++) {
            if (policy.equal_states(state[i], state[j])) {
                if (verbose) std::cerr <<"  at instruction #"<<i <<": no-op of length " <<(j-i) <<"\n";
                retval.push_back(std::make_pair(i, j-i));
            }
        }
    }

    return retval;
}