/*
  Copyright (c) 2010-2021, Intel Corporation
  All rights reserved.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are
  met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.


   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** @file opt.cpp
    @brief Implementations of various ispc optimization passes that operate
    on the LLVM IR.
*/
|
|
|
|
#include "opt.h"
|
|
#include "ctx.h"
|
|
#include "llvmutil.h"
|
|
#include "module.h"
|
|
#include "sym.h"
|
|
#include "util.h"
|
|
|
|
#include <map>
|
|
#include <regex>
|
|
#include <set>
|
|
#include <stdio.h>
|
|
|
|
#include <llvm/ADT/SmallSet.h>
|
|
#include <llvm/ADT/Triple.h>
|
|
#include <llvm/Analysis/BasicAliasAnalysis.h>
|
|
#include <llvm/Analysis/ConstantFolding.h>
|
|
#include <llvm/Analysis/Passes.h>
|
|
#include <llvm/Analysis/TargetLibraryInfo.h>
|
|
#include <llvm/Analysis/TargetTransformInfo.h>
|
|
#include <llvm/Analysis/TypeBasedAliasAnalysis.h>
|
|
#include <llvm/BinaryFormat/Dwarf.h>
|
|
#include <llvm/IR/BasicBlock.h>
|
|
#include <llvm/IR/Constants.h>
|
|
#include <llvm/IR/DataLayout.h>
|
|
#include <llvm/IR/DebugInfo.h>
|
|
#include <llvm/IR/Function.h>
|
|
#include <llvm/IR/IRPrintingPasses.h>
|
|
#include <llvm/IR/Instructions.h>
|
|
#include <llvm/IR/IntrinsicInst.h>
|
|
#include <llvm/IR/Intrinsics.h>
|
|
#include <llvm/IR/IntrinsicsX86.h>
|
|
#include <llvm/IR/LegacyPassManager.h>
|
|
#include <llvm/IR/Module.h>
|
|
#include <llvm/IR/PatternMatch.h>
|
|
#include <llvm/IR/Verifier.h>
|
|
#include <llvm/InitializePasses.h>
|
|
#include <llvm/Pass.h>
|
|
#include <llvm/PassRegistry.h>
|
|
#include <llvm/Support/raw_ostream.h>
|
|
#include <llvm/Target/TargetMachine.h>
|
|
#include <llvm/Target/TargetOptions.h>
|
|
#include <llvm/Transforms/IPO.h>
|
|
#include <llvm/Transforms/IPO/FunctionAttrs.h>
|
|
#include <llvm/Transforms/InstCombine/InstCombine.h>
|
|
#include <llvm/Transforms/Instrumentation.h>
|
|
#include <llvm/Transforms/Scalar.h>
|
|
#include <llvm/Transforms/Scalar/GVN.h>
|
|
#include <llvm/Transforms/Scalar/InstSimplifyPass.h>
|
|
#include <llvm/Transforms/Utils.h>
|
|
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
|
|
|
|
#ifdef ISPC_HOST_IS_LINUX
|
|
#include <alloca.h>
|
|
#elif defined(ISPC_HOST_IS_WINDOWS)
|
|
#include <malloc.h>
|
|
#ifndef __MINGW32__
|
|
#define alloca _alloca
|
|
#endif
|
|
#endif // ISPC_HOST_IS_WINDOWS
|
|
|
|
#ifndef PRId64
|
|
#define PRId64 "lld"
|
|
#endif
|
|
#ifndef PRIu64
|
|
#define PRIu64 "llu"
|
|
#endif
|
|
|
|
#ifndef ISPC_NO_DUMPS
|
|
#include <llvm/Support/FileSystem.h>
|
|
#include <llvm/Support/Regex.h>
|
|
#endif
|
|
#ifdef ISPC_GENX_ENABLED
|
|
#include "gen/GlobalsLocalization.h"
|
|
#include <LLVMSPIRVLib/LLVMSPIRVLib.h>
|
|
#include <llvm/GenXIntrinsics/GenXIntrOpts.h>
|
|
#include <llvm/GenXIntrinsics/GenXIntrinsics.h>
|
|
#include <llvm/GenXIntrinsics/GenXSPIRVWriterAdaptor.h>
|
|
// Used for GenX gather coalescing
|
|
#include <llvm/Transforms/Utils/Local.h>
|
|
|
|
// Constants, in number of bytes.
|
|
enum { BYTE = 1, WORD = 2, DWORD = 4, QWORD = 8, OWORD = 16, GRF = 32 };
|
|
#endif
|
|
|
|
using namespace ispc;
|
|
|
|
static llvm::Pass *CreateIntrinsicsOptPass();
|
|
static llvm::Pass *CreateInstructionSimplifyPass();
|
|
static llvm::Pass *CreatePeepholePass();
|
|
|
|
static llvm::Pass *CreateImproveMemoryOpsPass();
|
|
static llvm::Pass *CreateGatherCoalescePass();
|
|
static llvm::Pass *CreateReplacePseudoMemoryOpsPass();
|
|
|
|
static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
|
|
static llvm::Pass *CreateMakeInternalFuncsStaticPass();
|
|
|
|
#ifndef ISPC_NO_DUMPS
|
|
static llvm::Pass *CreateDebugPass(char *output);
|
|
static llvm::Pass *CreateDebugPassFile(int number, llvm::StringRef name);
|
|
#endif
|
|
|
|
static llvm::Pass *CreateReplaceStdlibShiftPass();
|
|
|
|
#ifdef ISPC_GENX_ENABLED
|
|
static llvm::Pass *CreateGenXGatherCoalescingPass();
|
|
static llvm::Pass *CreateReplaceLLVMIntrinsics();
|
|
static llvm::Pass *CreateFixDivisionInstructions();
|
|
static llvm::Pass *CreateFixAddressSpace();
|
|
static llvm::Pass *CreateDemotePHIs();
|
|
static llvm::Pass *CreateCheckUnsupportedInsts();
|
|
static llvm::Pass *CreateMangleOpenCLBuiltins();
|
|
#endif
|
|
|
|
#ifndef ISPC_NO_DUMPS
|
|
#define DEBUG_START_PASS(NAME) \
|
|
if (g->debugPrint && \
|
|
(getenv("FUNC") == NULL || (getenv("FUNC") != NULL && !strncmp(bb.getParent()->getName().str().c_str(), \
|
|
getenv("FUNC"), strlen(getenv("FUNC")))))) { \
|
|
fprintf(stderr, "Start of " NAME "\n"); \
|
|
fprintf(stderr, "---------------\n"); \
|
|
bb.dump(); \
|
|
fprintf(stderr, "---------------\n\n"); \
|
|
} else /* eat semicolon */
|
|
|
|
#define DEBUG_END_PASS(NAME) \
|
|
if (g->debugPrint && \
|
|
(getenv("FUNC") == NULL || (getenv("FUNC") != NULL && !strncmp(bb.getParent()->getName().str().c_str(), \
|
|
getenv("FUNC"), strlen(getenv("FUNC")))))) { \
|
|
fprintf(stderr, "End of " NAME " %s\n", modifiedAny ? "** CHANGES **" : ""); \
|
|
fprintf(stderr, "---------------\n"); \
|
|
bb.dump(); \
|
|
fprintf(stderr, "---------------\n\n"); \
|
|
} else /* eat semicolon */
|
|
#else
|
|
#define DEBUG_START_PASS(NAME)
|
|
#define DEBUG_END_PASS(NAME)
|
|
#endif
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
/** This utility routine copies the metadata (if any) attached to the
'from' instruction in the IR to the 'to' instruction.

For flexibility, this function takes an llvm::Value rather than an
llvm::Instruction for the 'to' parameter; at some places in the code
below, we start out storing a plain value and only later replace it with
an instruction. If an llvm::Value that isn't an instruction is passed to
this routine, it just returns without doing anything; if it is in fact an
llvm::Instruction, then the metadata is copied to it.
*/
|
|
static void lCopyMetadata(llvm::Value *vto, const llvm::Instruction *from) {
|
|
llvm::Instruction *to = llvm::dyn_cast<llvm::Instruction>(vto);
|
|
if (!to)
|
|
return;
|
|
|
|
llvm::SmallVector<std::pair<unsigned int, llvm::MDNode *>, 8> metadata;
|
|
|
|
from->getAllMetadata(metadata);
|
|
for (unsigned int i = 0; i < metadata.size(); ++i)
|
|
to->setMetadata(metadata[i].first, metadata[i].second);
|
|
}
|
|
|
|
/** We have a protocol with the front-end LLVM IR code generation process
|
|
that allows us to encode the source file position that corresponds with
|
|
instructions. (For example, this allows us to issue performance
|
|
warnings related to things like scatter and gather after optimization
|
|
has been performed, so that we aren't warning about scatters and
|
|
gathers that have been improved to stores and loads by optimization
|
|
passes.) Note that this is slightly redundant with the source file
position encoding generated for debugging symbols; however, debugging
information is not always generated, while this position data always is.
|
|
|
|
This function finds the SourcePos that the metadata in the instruction
|
|
(if present) corresponds to. See the implementation of
|
|
FunctionEmitContext::addGSMetadata(), which encodes the source position during
|
|
code generation.
|
|
|
|
@param inst Instruction to try to find the source position of
|
|
@param pos Output variable in which to store the position
|
|
@returns True if source file position metadata was present and *pos
|
|
has been set. False otherwise.
|
|
*/
|
|
static bool lGetSourcePosFromMetadata(const llvm::Instruction *inst, SourcePos *pos) {
|
|
llvm::MDNode *filename = inst->getMetadata("filename");
|
|
llvm::MDNode *first_line = inst->getMetadata("first_line");
|
|
llvm::MDNode *first_column = inst->getMetadata("first_column");
|
|
llvm::MDNode *last_line = inst->getMetadata("last_line");
|
|
llvm::MDNode *last_column = inst->getMetadata("last_column");
|
|
|
|
if (!filename || !first_line || !first_column || !last_line || !last_column)
|
|
return false;
|
|
|
|
// All of these asserts are things that FunctionEmitContext::addGSMetadata() is
|
|
// expected to have done in its operation
|
|
llvm::MDString *str = llvm::dyn_cast<llvm::MDString>(filename->getOperand(0));
|
|
Assert(str);
|
|
llvm::ConstantInt *first_lnum =
|
|
|
|
llvm::mdconst::extract<llvm::ConstantInt>(first_line->getOperand(0));
|
|
Assert(first_lnum);
|
|
|
|
llvm::ConstantInt *first_colnum =
|
|
|
|
llvm::mdconst::extract<llvm::ConstantInt>(first_column->getOperand(0));
|
|
Assert(first_colnum);
|
|
|
|
llvm::ConstantInt *last_lnum =
|
|
|
|
llvm::mdconst::extract<llvm::ConstantInt>(last_line->getOperand(0));
|
|
Assert(last_lnum);
|
|
|
|
llvm::ConstantInt *last_colnum = llvm::mdconst::extract<llvm::ConstantInt>(last_column->getOperand(0));
|
|
Assert(last_colnum);
|
|
|
|
*pos = SourcePos(str->getString().data(), (int)first_lnum->getZExtValue(), (int)first_colnum->getZExtValue(),
|
|
(int)last_lnum->getZExtValue(), (int)last_colnum->getZExtValue());
|
|
return true;
|
|
}
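// Illustrative sketch (the exact IR shape here is an assumption, not something
// this file emits): an instruction carrying the position metadata consumed
// above might look roughly like
//     %g = call <8 x i32> @__pseudo_gather64_i32(...),
//              !filename !10, !first_line !11, !first_column !12,
//              !last_line !13, !last_column !14
// with !10 = !{!"foo.ispc"}, !11 = !{i32 42}, and so on; this routine unpacks
// those five nodes back into a SourcePos.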
|
|
|
|
static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, const llvm::Twine &name,
|
|
llvm::Instruction *insertBefore = NULL) {
|
|
llvm::Value *args[2] = {arg0, arg1};
|
|
llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
|
|
return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
|
|
}
|
|
|
|
static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
|
|
const llvm::Twine &name, llvm::Instruction *insertBefore = NULL) {
|
|
llvm::Value *args[3] = {arg0, arg1, arg2};
|
|
llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
|
|
return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
|
|
}
|
|
|
|
static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
|
|
llvm::Value *arg3, const llvm::Twine &name,
|
|
llvm::Instruction *insertBefore = NULL) {
|
|
llvm::Value *args[4] = {arg0, arg1, arg2, arg3};
|
|
llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[4]);
|
|
return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
|
|
}
|
|
|
|
static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
|
|
llvm::Value *arg3, llvm::Value *arg4, const llvm::Twine &name,
|
|
llvm::Instruction *insertBefore = NULL) {
|
|
llvm::Value *args[5] = {arg0, arg1, arg2, arg3, arg4};
|
|
llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[5]);
|
|
return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
|
|
}
|
|
|
|
static llvm::Instruction *lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2,
|
|
llvm::Value *arg3, llvm::Value *arg4, llvm::Value *arg5, const llvm::Twine &name,
|
|
llvm::Instruction *insertBefore = NULL) {
|
|
llvm::Value *args[6] = {arg0, arg1, arg2, arg3, arg4, arg5};
|
|
llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[6]);
|
|
return llvm::CallInst::Create(func, newArgArray, name, insertBefore);
|
|
}
|
|
|
|
static llvm::Instruction *lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name,
|
|
llvm::Instruction *insertBefore) {
|
|
llvm::Value *index[1] = {offset};
|
|
llvm::ArrayRef<llvm::Value *> arrayRef(&index[0], &index[1]);
|
|
|
|
return llvm::GetElementPtrInst::Create(PTYPE(ptr), ptr, arrayRef, name, insertBefore);
|
|
}
|
|
|
|
/** Given a vector of constant values (int, float, or bool) representing an
|
|
execution mask, convert it to a bitvector where the 0th bit corresponds
|
|
to the first vector value and so forth.
|
|
*/
|
|
static uint64_t lConstElementsToMask(const llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> &elements) {
|
|
Assert(elements.size() <= 64);
|
|
|
|
uint64_t mask = 0;
|
|
uint64_t undefSetMask = 0;
|
|
llvm::APInt intMaskValue;
|
|
for (unsigned int i = 0; i < elements.size(); ++i) {
|
|
// SSE has the "interesting" approach of encoding blending
|
|
// masks as <n x float>.
|
|
if (llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i])) {
|
|
llvm::APFloat apf = cf->getValueAPF();
|
|
intMaskValue = apf.bitcastToAPInt();
|
|
} else if (llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i])) {
|
|
// Otherwise get it as an int
|
|
intMaskValue = ci->getValue();
|
|
} else {
|
|
// We create a separate 'undef mask' with all undef bits set.
|
|
// This mask will have no bits set if there are no 'undef' elements.
|
|
llvm::UndefValue *uv = llvm::dyn_cast<llvm::UndefValue>(elements[i]);
|
|
Assert(uv != NULL); // vs return -1 if NULL?
|
|
undefSetMask |= (1ull << i);
|
|
continue;
|
|
}
|
|
// Is the high-bit set? If so, OR in the appropriate bit in
|
|
// the result mask
|
|
if (intMaskValue.countLeadingOnes() > 0)
|
|
mask |= (1ull << i);
|
|
}
|
|
|
|
// If no bits are set in mask, we do not need to consider undefs: the
// result is always 'all_off'.
// If any bits are set in mask, treat 'undef' bits as '1'. This ensures
// that cases with only '1's and 'undef's will be considered 'all_on'.
|
|
if (mask != 0)
|
|
mask |= undefSetMask;
|
|
|
|
return mask;
|
|
}
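// Worked example (follows directly from the loop above): for the 4-wide
// constant mask <i32 -1, i32 0, i32 undef, i32 -1>, lanes 0 and 3 have their
// high bit set, so mask = 0b1001 and undefSetMask = 0b0100. Since mask is
// non-zero, the undef lane is treated as on, giving 0b1101 = 13; if all of
// the defined lanes had been zero, the result would simply be 0.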
|
|
|
|
/** Given an llvm::Value representing a vector mask, see if the value is a
constant. If so, return true and set *mask to the integer mask found by
taking the high bit of each mask element in turn and concatenating them
into a single integer. In other words, given the 4-wide mask
< 0xffffffff, 0, 0, 0xffffffff >, we have 0b1001 = 9.
*/
|
|
static bool lGetMask(llvm::Value *factor, uint64_t *mask) {
|
|
llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
|
|
if (cdv != NULL) {
|
|
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
|
|
for (int i = 0; i < (int)cdv->getNumElements(); ++i)
|
|
elements.push_back(cdv->getElementAsConstant(i));
|
|
*mask = lConstElementsToMask(elements);
|
|
return true;
|
|
}
|
|
|
|
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
|
|
if (cv != NULL) {
|
|
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
|
|
for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
|
|
llvm::Constant *c = llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
|
|
if (c == NULL)
|
|
return false;
|
|
if (llvm::isa<llvm::ConstantExpr>(cv->getOperand(i)))
|
|
return false; // We can not handle constant expressions here
|
|
elements.push_back(c);
|
|
}
|
|
*mask = lConstElementsToMask(elements);
|
|
return true;
|
|
} else if (llvm::isa<llvm::ConstantAggregateZero>(factor)) {
|
|
*mask = 0;
|
|
return true;
|
|
} else {
|
|
#if 0
|
|
llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
|
|
if (ce != NULL) {
|
|
llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
|
|
const llvm::TargetData *td = targetMachine->getTargetData();
|
|
llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
|
|
c->dump();
|
|
factor = c;
|
|
}
|
|
// else we should be able to handle it above...
|
|
Assert(!llvm::isa<llvm::Constant>(factor));
|
|
#endif
|
|
return false;
|
|
}
|
|
}
|
|
|
|
enum class MaskStatus { all_on, all_off, mixed, unknown };
|
|
|
|
/** Determines if the given mask value is all on, all off, mixed, or
|
|
unknown at compile time.
|
|
*/
|
|
static MaskStatus lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
|
|
uint64_t bits;
|
|
if (lGetMask(mask, &bits) == false)
|
|
return MaskStatus::unknown;
|
|
|
|
if (bits == 0)
|
|
return MaskStatus::all_off;
|
|
|
|
if (vecWidth == -1)
|
|
vecWidth = g->target->getVectorWidth();
|
|
Assert(vecWidth <= 64);
|
|
|
|
for (int i = 0; i < vecWidth; ++i) {
|
|
if ((bits & (1ull << i)) == 0)
|
|
return MaskStatus::mixed;
|
|
}
|
|
return MaskStatus::all_on;
|
|
}
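// For example, on a 4-wide target a constant mask whose bits are 0b1111 is
// reported as all_on, 0b0000 as all_off, 0b1101 as mixed, and any mask whose
// value can't be determined at compile time as unknown.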
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// This is a wrapper around llvm::legacy::PassManager. It duplicates the
// PassManager run() function and extends the add() function with some checks
// and debug passes. The wrapper can control:
// - whether to switch off the optimization with a given number;
// - whether to dump the LLVM IR after the optimization with a given number;
// - whether to generate LLVM IR suitable for debugging with gdb after the optimization with a given number.
|
|
class DebugPassManager {
|
|
public:
|
|
DebugPassManager() : number(0) {}
|
|
void add(llvm::Pass *P, int stage);
|
|
bool run(llvm::Module &M) { return PM.run(M); }
|
|
llvm::legacy::PassManager &getPM() { return PM; }
|
|
|
|
private:
|
|
llvm::legacy::PassManager PM;
|
|
int number;
|
|
};
|
|
|
|
void DebugPassManager::add(llvm::Pass *P, int stage = -1) {
|
|
// Determine the number of this optimization stage
|
|
if (stage == -1) {
|
|
number++;
|
|
} else {
|
|
number = stage;
|
|
}
|
|
if (g->off_stages.find(number) == g->off_stages.end()) {
|
|
// Add the optimization, since it has not been switched off
|
|
PM.add(P);
|
|
#ifndef ISPC_NO_DUMPS
|
|
if (g->debug_stages.find(number) != g->debug_stages.end()) {
|
|
// adding dump of LLVM IR after optimization
|
|
if (g->dumpFile) {
|
|
PM.add(CreateDebugPassFile(number, P->getPassName()));
|
|
} else {
|
|
char buf[100];
|
|
snprintf(buf, sizeof(buf), "\n\n*****LLVM IR after phase %d: %s*****\n\n", number,
|
|
P->getPassName().data());
|
|
PM.add(CreateDebugPass(buf));
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
}
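// Typical usage, as a sketch (the exact option spellings are defined by the
// command-line parsing code, not here): g->off_stages and g->debug_stages are
// normally filled in from the corresponding command-line options, so that
// switching off stage 229 skips CreateReplaceStdlibShiftPass() below, while
// requesting a debug dump of stage 229 prints the IR right after that pass runs.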
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
void ispc::Optimize(llvm::Module *module, int optLevel) {
|
|
#ifndef ISPC_NO_DUMPS
|
|
if (g->debugPrint) {
|
|
printf("*** Code going into optimization ***\n");
|
|
module->dump();
|
|
}
|
|
#endif
|
|
DebugPassManager optPM;
|
|
|
|
if (g->enableLLVMIntrinsics) {
|
|
// Required for matrix intrinsics. This needs to happen before VerifierPass.
|
|
// TODO : Limit pass to only when llvm.matrix.* intrinsics are used.
|
|
optPM.add(llvm::createLowerMatrixIntrinsicsPass()); // llvm.matrix
|
|
}
|
|
optPM.add(llvm::createVerifierPass(), 0);
|
|
|
|
optPM.add(new llvm::TargetLibraryInfoWrapperPass(llvm::Triple(module->getTargetTriple())));
|
|
if (!g->target->isGenXTarget()) {
|
|
llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
|
|
optPM.getPM().add(createTargetTransformInfoWrapperPass(targetMachine->getTargetIRAnalysis()));
|
|
}
|
|
optPM.add(llvm::createIndVarSimplifyPass());
|
|
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
llvm::SimplifyCFGOptions simplifyCFGopt;
|
|
simplifyCFGopt.HoistCommonInsts = true;
|
|
#endif
|
|
if (optLevel == 0) {
|
|
// This is more or less the minimum set of optimizations that we
// need to do to generate code that will actually run. (We can't
// run absolutely no optimizations, since the front-end needs us to
// take the various __pseudo_* functions it has emitted and turn
// them into something that can actually execute.)
|
|
#ifdef ISPC_GENX_ENABLED
|
|
if (g->target->isGenXTarget()) {
|
|
// Global DCE is required for ISPCSimdCFLoweringPass
|
|
optPM.add(llvm::createGlobalDCEPass());
|
|
// FIXME: temporary solution
|
|
optPM.add(llvm::createBreakCriticalEdgesPass());
|
|
optPM.add(CreateDemotePHIs());
|
|
optPM.add(llvm::createISPCSimdCFLoweringPass());
|
|
// FIXME: temporary solution
|
|
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
|
}
|
|
#endif
|
|
optPM.add(CreateImproveMemoryOpsPass(), 100);
|
|
|
|
if (g->opt.disableHandlePseudoMemoryOps == false)
|
|
optPM.add(CreateReplacePseudoMemoryOpsPass());
|
|
|
|
optPM.add(CreateIntrinsicsOptPass(), 102);
|
|
optPM.add(CreateIsCompileTimeConstantPass(true));
|
|
#ifdef ISPC_GENX_ENABLED
|
|
if (g->target->isGenXTarget()) {
|
|
// InstructionCombining pass is required for FixDivisionInstructions
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
optPM.add(CreateFixDivisionInstructions());
|
|
}
|
|
#endif
|
|
optPM.add(llvm::createFunctionInliningPass());
|
|
optPM.add(CreateMakeInternalFuncsStaticPass());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
optPM.add(llvm::createCFGSimplificationPass(simplifyCFGopt));
|
|
#else
|
|
optPM.add(llvm::createCFGSimplificationPass());
|
|
#endif
|
|
#ifdef ISPC_GENX_ENABLED
|
|
if (g->target->isGenXTarget()) {
|
|
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
|
optPM.add(llvm::createGlobalsLocalizationPass());
|
|
// Remove dead globals after localization
|
|
optPM.add(llvm::createGlobalDCEPass());
|
|
// This pass is needed for print() to work correctly
|
|
optPM.add(llvm::createSROAPass());
|
|
optPM.add(CreateReplaceLLVMIntrinsics());
|
|
optPM.add(CreateCheckUnsupportedInsts());
|
|
optPM.add(CreateFixAddressSpace());
|
|
optPM.add(CreateMangleOpenCLBuiltins());
|
|
// This pass is required to prepare LLVM IR for open source SPIR-V translator
|
|
optPM.add(
|
|
llvm::createGenXSPIRVWriterAdaptorPass(true /*RewriteTypes*/, false /*RewriteSingleElementVectors*/));
|
|
}
|
|
#endif
|
|
optPM.add(llvm::createGlobalDCEPass());
|
|
} else {
|
|
llvm::PassRegistry *registry = llvm::PassRegistry::getPassRegistry();
|
|
llvm::initializeCore(*registry);
|
|
llvm::initializeScalarOpts(*registry);
|
|
llvm::initializeIPO(*registry);
|
|
llvm::initializeAnalysis(*registry);
|
|
llvm::initializeTransformUtils(*registry);
|
|
llvm::initializeInstCombine(*registry);
|
|
llvm::initializeInstrumentation(*registry);
|
|
llvm::initializeTarget(*registry);
|
|
|
|
optPM.add(llvm::createGlobalDCEPass(), 184);
|
|
|
|
#ifdef ISPC_GENX_ENABLED
|
|
if (g->target->isGenXTarget()) {
|
|
// FIXME: temporary solution
|
|
optPM.add(llvm::createBreakCriticalEdgesPass());
|
|
optPM.add(CreateDemotePHIs());
|
|
optPM.add(llvm::createISPCSimdCFLoweringPass());
|
|
// FIXME: temporary solution
|
|
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
|
}
|
|
#endif
|
|
// Set up to use LLVM's default AliasAnalysis.
// Ideally, we would want to call:
//     llvm::PassManagerBuilder pm_Builder;
//     pm_Builder.OptLevel = optLevel;
//     pm_Builder.addInitialAliasAnalysisPasses(optPM);
// but addInitialAliasAnalysisPasses() is a private function,
// so we explicitly enable the passes here.
// This needs to be kept in sync with future LLVM changes.
// An alternative is to call populateFunctionPassManager().
|
|
optPM.add(llvm::createTypeBasedAAWrapperPass(), 190);
|
|
optPM.add(llvm::createBasicAAWrapperPass());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
optPM.add(llvm::createCFGSimplificationPass(simplifyCFGopt));
|
|
#else
|
|
optPM.add(llvm::createCFGSimplificationPass());
|
|
#endif
|
|
|
|
optPM.add(llvm::createSROAPass());
|
|
|
|
optPM.add(llvm::createEarlyCSEPass());
|
|
optPM.add(llvm::createLowerExpectIntrinsicPass());
|
|
|
|
// Early optimizations to try to reduce the total amount of code to
|
|
// work with if we can
|
|
optPM.add(llvm::createReassociatePass(), 200);
|
|
optPM.add(llvm::createInstSimplifyLegacyPass());
|
|
optPM.add(llvm::createDeadCodeEliminationPass());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
optPM.add(llvm::createCFGSimplificationPass(simplifyCFGopt));
|
|
#else
|
|
optPM.add(llvm::createCFGSimplificationPass());
|
|
#endif
|
|
|
|
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
|
optPM.add(llvm::createAggressiveDCEPass());
|
|
|
|
if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
|
|
optPM.add(llvm::createInstructionCombiningPass(), 210);
|
|
optPM.add(CreateImproveMemoryOpsPass());
|
|
}
|
|
if (!g->opt.disableMaskAllOnOptimizations) {
|
|
optPM.add(CreateIntrinsicsOptPass(), 215);
|
|
optPM.add(CreateInstructionSimplifyPass());
|
|
}
|
|
optPM.add(llvm::createDeadCodeEliminationPass(), 220);
|
|
|
|
// On to more serious optimizations
|
|
optPM.add(llvm::createSROAPass());
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
optPM.add(llvm::createCFGSimplificationPass(simplifyCFGopt));
|
|
#else
|
|
optPM.add(llvm::createCFGSimplificationPass());
|
|
#endif
|
|
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
|
optPM.add(llvm::createGlobalOptimizerPass());
|
|
optPM.add(llvm::createReassociatePass());
|
|
// IPConstProp will not be supported by LLVM moving forward.
|
|
// Switching to IPSCCP which is its recommended functional equivalent.
|
|
// TODO : Make IPSCCP the default after ISPC 1.14 release.
|
|
#if ISPC_LLVM_VERSION < ISPC_LLVM_12_0
|
|
optPM.add(llvm::createIPConstantPropagationPass());
|
|
#else
|
|
optPM.add(llvm::createIPSCCPPass());
|
|
#endif
|
|
|
|
optPM.add(CreateReplaceStdlibShiftPass(), 229);
|
|
|
|
optPM.add(llvm::createDeadArgEliminationPass(), 230);
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
optPM.add(llvm::createCFGSimplificationPass(simplifyCFGopt));
|
|
#else
|
|
optPM.add(llvm::createCFGSimplificationPass());
|
|
#endif
|
|
optPM.add(llvm::createPruneEHPass());
|
|
#ifdef ISPC_GENX_ENABLED
|
|
if (g->target->isGenXTarget()) {
|
|
optPM.add(CreateFixDivisionInstructions());
|
|
}
|
|
#endif
|
|
optPM.add(llvm::createPostOrderFunctionAttrsLegacyPass());
|
|
optPM.add(llvm::createReversePostOrderFunctionAttrsPass());
|
|
|
|
// The next inlining pass will remove functions saved by __keep_funcs_live
|
|
optPM.add(llvm::createFunctionInliningPass());
|
|
optPM.add(llvm::createInstSimplifyLegacyPass());
|
|
optPM.add(llvm::createDeadCodeEliminationPass());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
optPM.add(llvm::createCFGSimplificationPass(simplifyCFGopt));
|
|
#else
|
|
optPM.add(llvm::createCFGSimplificationPass());
|
|
#endif
|
|
|
|
optPM.add(llvm::createArgumentPromotionPass());
|
|
|
|
optPM.add(llvm::createAggressiveDCEPass());
|
|
optPM.add(llvm::createInstructionCombiningPass(), 241);
|
|
optPM.add(llvm::createJumpThreadingPass());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
optPM.add(llvm::createCFGSimplificationPass(simplifyCFGopt));
|
|
#else
|
|
optPM.add(llvm::createCFGSimplificationPass());
|
|
#endif
|
|
|
|
optPM.add(llvm::createSROAPass());
|
|
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
#ifdef ISPC_GENX_ENABLED
|
|
if (g->target->isGenXTarget()) {
|
|
// Inline
|
|
optPM.add(llvm::createCorrelatedValuePropagationPass());
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
optPM.add(llvm::createGlobalDCEPass());
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
optPM.add(llvm::createEarlyCSEPass());
|
|
optPM.add(llvm::createDeadCodeEliminationPass());
|
|
optPM.add(llvm::createGlobalsLocalizationPass());
|
|
// remove dead globals after localization
|
|
optPM.add(llvm::createGlobalDCEPass());
|
|
}
|
|
#endif
|
|
optPM.add(llvm::createTailCallEliminationPass());
|
|
|
|
if (!g->opt.disableMaskAllOnOptimizations) {
|
|
optPM.add(CreateIntrinsicsOptPass(), 250);
|
|
optPM.add(CreateInstructionSimplifyPass());
|
|
}
|
|
|
|
if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
|
|
optPM.add(llvm::createInstructionCombiningPass(), 255);
|
|
#ifdef ISPC_GENX_ENABLED
|
|
if (g->target->isGenXTarget() && !g->opt.disableGenXGatherCoalescing)
|
|
optPM.add(CreateGenXGatherCoalescingPass());
|
|
#endif
|
|
optPM.add(CreateImproveMemoryOpsPass());
|
|
|
|
if (g->opt.disableCoalescing == false) {
|
|
// It is important to run this here to make it easier to
// find matching gathers we can coalesce.
|
|
optPM.add(llvm::createEarlyCSEPass(), 260);
|
|
optPM.add(CreateGatherCoalescePass());
|
|
}
|
|
}
|
|
|
|
optPM.add(llvm::createFunctionInliningPass(), 265);
|
|
optPM.add(llvm::createInstSimplifyLegacyPass());
|
|
optPM.add(CreateIntrinsicsOptPass());
|
|
optPM.add(CreateInstructionSimplifyPass());
|
|
|
|
if (g->opt.disableGatherScatterOptimizations == false && g->target->getVectorWidth() > 1) {
|
|
optPM.add(llvm::createInstructionCombiningPass(), 270);
|
|
optPM.add(CreateImproveMemoryOpsPass());
|
|
}
|
|
|
|
optPM.add(llvm::createIPSCCPPass(), 275);
|
|
optPM.add(llvm::createDeadArgEliminationPass());
|
|
optPM.add(llvm::createAggressiveDCEPass());
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
optPM.add(llvm::createCFGSimplificationPass(simplifyCFGopt));
|
|
#else
|
|
optPM.add(llvm::createCFGSimplificationPass());
|
|
#endif
|
|
|
|
if (g->opt.disableHandlePseudoMemoryOps == false) {
|
|
optPM.add(CreateReplacePseudoMemoryOpsPass(), 280);
|
|
}
|
|
optPM.add(CreateIntrinsicsOptPass(), 281);
|
|
optPM.add(CreateInstructionSimplifyPass());
|
|
|
|
optPM.add(llvm::createFunctionInliningPass());
|
|
optPM.add(llvm::createArgumentPromotionPass());
|
|
|
|
optPM.add(llvm::createSROAPass());
|
|
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
optPM.add(CreateInstructionSimplifyPass());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
optPM.add(llvm::createCFGSimplificationPass(simplifyCFGopt));
|
|
#else
|
|
optPM.add(llvm::createCFGSimplificationPass());
|
|
#endif
|
|
optPM.add(llvm::createReassociatePass());
|
|
optPM.add(llvm::createLoopRotatePass());
|
|
optPM.add(llvm::createLICMPass());
|
|
optPM.add(llvm::createLoopUnswitchPass(false));
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
optPM.add(CreateInstructionSimplifyPass());
|
|
optPM.add(llvm::createIndVarSimplifyPass());
|
|
// Currently CM does not support memset/memcpy
|
|
// so this pass is temporarily disabled for GEN.
|
|
if (!g->target->isGenXTarget()) {
|
|
optPM.add(llvm::createLoopIdiomPass());
|
|
}
|
|
optPM.add(llvm::createLoopDeletionPass());
|
|
if (g->opt.unrollLoops) {
|
|
optPM.add(llvm::createLoopUnrollPass(), 300);
|
|
}
|
|
optPM.add(llvm::createGVNPass(), 301);
|
|
|
|
optPM.add(CreateIsCompileTimeConstantPass(true));
|
|
optPM.add(CreateIntrinsicsOptPass());
|
|
optPM.add(CreateInstructionSimplifyPass());
|
|
// Currently CM does not support memset/memcpy
|
|
// so this pass is temporarily disabled for GEN.
|
|
if (!g->target->isGenXTarget()) {
|
|
optPM.add(llvm::createMemCpyOptPass());
|
|
}
|
|
optPM.add(llvm::createSCCPPass());
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
optPM.add(CreateInstructionSimplifyPass());
|
|
optPM.add(llvm::createJumpThreadingPass());
|
|
optPM.add(llvm::createCorrelatedValuePropagationPass());
|
|
optPM.add(llvm::createDeadStoreEliminationPass());
|
|
optPM.add(llvm::createAggressiveDCEPass());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_12_0
|
|
optPM.add(llvm::createCFGSimplificationPass(simplifyCFGopt));
|
|
#else
|
|
optPM.add(llvm::createCFGSimplificationPass());
|
|
#endif
|
|
optPM.add(llvm::createInstructionCombiningPass());
|
|
optPM.add(CreateInstructionSimplifyPass());
|
|
#ifdef ISPC_GENX_ENABLED
|
|
if (g->target->isGenXTarget()) {
|
|
optPM.add(CreateReplaceLLVMIntrinsics());
|
|
}
|
|
#endif
|
|
optPM.add(CreatePeepholePass());
|
|
optPM.add(llvm::createFunctionInliningPass());
|
|
optPM.add(llvm::createAggressiveDCEPass());
|
|
optPM.add(llvm::createStripDeadPrototypesPass());
|
|
optPM.add(CreateMakeInternalFuncsStaticPass());
|
|
optPM.add(llvm::createGlobalDCEPass());
|
|
optPM.add(llvm::createConstantMergePass());
|
|
#ifdef ISPC_GENX_ENABLED
|
|
if (g->target->isGenXTarget()) {
|
|
optPM.add(CreateCheckUnsupportedInsts());
|
|
optPM.add(CreateFixAddressSpace());
|
|
optPM.add(CreateMangleOpenCLBuiltins());
|
|
// This pass is required to prepare LLVM IR for open source SPIR-V translator
|
|
optPM.add(
|
|
llvm::createGenXSPIRVWriterAdaptorPass(true /*RewriteTypes*/, false /*RewriteSingleElementVectors*/));
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Finish up by making sure we didn't mess anything up in the IR along
|
|
// the way.
|
|
optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER);
|
|
optPM.run(*module);
|
|
|
|
#ifndef ISPC_NO_DUMPS
|
|
if (g->debugPrint) {
|
|
printf("\n*****\nFINAL OUTPUT\n*****\n");
|
|
module->dump();
|
|
}
|
|
#endif
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// IntrinsicsOpt
|
|
|
|
/** This is a relatively simple optimization pass that does a few small
optimizations that LLVM's x86 optimizer doesn't currently handle.
(Specifically, MOVMSK of a constant can be replaced with the
corresponding constant value, and BLENDVPS and AVX masked loads/stores
with either an 'all on' or 'all off' mask can be replaced with simpler
operations.)

@todo The better thing to do would be to submit a patch to LLVM to get
these; they're presumably pretty simple patterns to match.
*/
|
|
class IntrinsicsOpt : public llvm::FunctionPass {
|
|
public:
|
|
IntrinsicsOpt() : FunctionPass(ID){};
|
|
|
|
llvm::StringRef getPassName() const { return "Intrinsics Cleanup Optimization"; }
|
|
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
|
|
bool runOnFunction(llvm::Function &F);
|
|
|
|
static char ID;
|
|
|
|
private:
|
|
struct MaskInstruction {
|
|
MaskInstruction(llvm::Function *f) { function = f; }
|
|
llvm::Function *function;
|
|
};
|
|
std::vector<MaskInstruction> maskInstructions;
|
|
|
|
/** Structure that records everything we need to know about a blend
|
|
instruction for this optimization pass.
|
|
*/
|
|
struct BlendInstruction {
|
|
BlendInstruction(llvm::Function *f, uint64_t ao, int o0, int o1, int of)
|
|
: function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) {}
|
|
/** Function pointer for the blend instruction */
|
|
llvm::Function *function;
|
|
/** Mask value for an "all on" mask for this instruction */
|
|
uint64_t allOnMask;
|
|
/** The operand number in the llvm::CallInst corresponding to the
first operand to blend with. */
|
|
int op0;
|
|
/** The operand number in the CallInst corresponding to the second
|
|
operand to blend with. */
|
|
int op1;
|
|
/** The operand in the call inst where the blending factor is
|
|
found. */
|
|
int opFactor;
|
|
};
|
|
std::vector<BlendInstruction> blendInstructions;
|
|
|
|
bool matchesMaskInstruction(llvm::Function *function);
|
|
BlendInstruction *matchingBlendInstruction(llvm::Function *function);
|
|
};
|
|
|
|
char IntrinsicsOpt::ID = 0;
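// A sketch of the rewrites this pass performs (runOnBasicBlock() below is the
// authoritative list): a __movmsk/MOVMSK call on a compile-time-constant vector
// becomes the corresponding scalar constant; a BLENDVPS-style call with an
// all-on mask becomes its second operand (all-off: its first); and an AVX
// masked load/store with an all-on mask becomes a regular load/store
// (all-off: an undef value for the load, and removal of the store).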
|
|
|
|
/** Given an llvm::Value, return true if we can determine that it's an
|
|
undefined value. This only makes a weak attempt at chasing this down,
|
|
only detecting flat-out undef values, and bitcasts of undef values.
|
|
|
|
@todo Is it worth working harder to find more of these? It starts to
|
|
get tricky, since having an undef operand doesn't necessarily mean that
|
|
the result will be undefined. (And for that matter, is there an LLVM
|
|
call that will do this for us?)
|
|
*/
|
|
static bool lIsUndef(llvm::Value *value) {
|
|
if (llvm::isa<llvm::UndefValue>(value))
|
|
return true;
|
|
|
|
llvm::BitCastInst *bci = llvm::dyn_cast<llvm::BitCastInst>(value);
|
|
if (bci)
|
|
return lIsUndef(bci->getOperand(0));
|
|
|
|
return false;
|
|
}
|
|
|
|
bool IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("IntrinsicsOpt");
|
|
|
|
// We can't initialize mask/blend function vector during pass initialization,
|
|
// as they may be optimized out by the time the pass is invoked.
|
|
|
|
// All of the mask instructions we may encounter. Note that even if
|
|
// compiling for AVX, we may still encounter the regular 4-wide SSE
|
|
// MOVMSK instruction.
|
|
if (llvm::Function *ssei8Movmsk =
|
|
m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse2_pmovmskb_128))) {
|
|
maskInstructions.push_back(ssei8Movmsk);
|
|
}
|
|
if (llvm::Function *sseFloatMovmsk =
|
|
m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse_movmsk_ps))) {
|
|
maskInstructions.push_back(sseFloatMovmsk);
|
|
}
|
|
if (llvm::Function *__movmsk = m->module->getFunction("__movmsk")) {
|
|
maskInstructions.push_back(__movmsk);
|
|
}
|
|
if (llvm::Function *avxFloatMovmsk =
|
|
m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_movmsk_ps_256))) {
|
|
maskInstructions.push_back(avxFloatMovmsk);
|
|
}
|
|
|
|
// And all of the blend instructions
|
|
blendInstructions.push_back(BlendInstruction(
|
|
m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_sse41_blendvps)), 0xf, 0, 1, 2));
|
|
blendInstructions.push_back(BlendInstruction(
|
|
m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_blendv_ps_256)), 0xff, 0, 1, 2));
|
|
|
|
llvm::Function *avxMaskedLoad32 =
|
|
m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_ps_256));
|
|
llvm::Function *avxMaskedLoad64 =
|
|
m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskload_pd_256));
|
|
llvm::Function *avxMaskedStore32 =
|
|
m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_ps_256));
|
|
llvm::Function *avxMaskedStore64 =
|
|
m->module->getFunction(llvm::Intrinsic::getName(llvm::Intrinsic::x86_avx_maskstore_pd_256));
|
|
|
|
bool modifiedAny = false;
|
|
restart:
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
|
|
if (callInst == NULL || callInst->getCalledFunction() == NULL)
|
|
continue;
|
|
|
|
BlendInstruction *blend = matchingBlendInstruction(callInst->getCalledFunction());
|
|
if (blend != NULL) {
|
|
llvm::Value *v[2] = {callInst->getArgOperand(blend->op0), callInst->getArgOperand(blend->op1)};
|
|
llvm::Value *factor = callInst->getArgOperand(blend->opFactor);
|
|
|
|
// If the values are the same, then no need to blend..
|
|
if (v[0] == v[1]) {
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[0]);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
|
|
// If one of the two is undefined, we're allowed to replace
|
|
// with the value of the other. (In other words, the only
|
|
// valid case is that the blend factor ends up having a value
|
|
// that only selects from the defined one of the two operands,
|
|
// otherwise the result is undefined and any value is fine,
|
|
// ergo the defined one is an acceptable result.)
|
|
if (lIsUndef(v[0])) {
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[1]);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
if (lIsUndef(v[1])) {
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, v[0]);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
|
|
MaskStatus maskStatus = lGetMaskStatus(factor);
|
|
llvm::Value *value = NULL;
|
|
if (maskStatus == MaskStatus::all_off) {
|
|
// Mask all off -> replace with the first blend value
|
|
value = v[0];
|
|
} else if (maskStatus == MaskStatus::all_on) {
|
|
// Mask all on -> replace with the second blend value
|
|
value = v[1];
|
|
}
|
|
|
|
if (value != NULL) {
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
} else if (matchesMaskInstruction(callInst->getCalledFunction())) {
|
|
llvm::Value *factor = callInst->getArgOperand(0);
|
|
uint64_t mask;
|
|
if (lGetMask(factor, &mask) == true) {
|
|
// If the vector-valued mask has a known value, replace it
// with the corresponding integer mask computed from its
// elements' high bits.
|
|
llvm::Value *value = (callInst->getType() == LLVMTypes::Int32Type) ? LLVMInt32(mask) : LLVMInt64(mask);
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
} else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
|
|
callInst->getCalledFunction() == avxMaskedLoad64) {
|
|
llvm::Value *factor = callInst->getArgOperand(1);
|
|
MaskStatus maskStatus = lGetMaskStatus(factor);
|
|
if (maskStatus == MaskStatus::all_off) {
|
|
// nothing being loaded, replace with undef value
|
|
llvm::Type *returnType = callInst->getType();
|
|
Assert(llvm::isa<llvm::VectorType>(returnType));
|
|
llvm::Value *undefValue = llvm::UndefValue::get(returnType);
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, undefValue);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
} else if (maskStatus == MaskStatus::all_on) {
|
|
// all lanes active; replace with a regular load
|
|
llvm::Type *returnType = callInst->getType();
|
|
Assert(llvm::isa<llvm::VectorType>(returnType));
|
|
// cast the i8 * to the appropriate type
|
|
llvm::Value *castPtr =
|
|
new llvm::BitCastInst(callInst->getArgOperand(0), llvm::PointerType::get(returnType, 0),
|
|
llvm::Twine(callInst->getArgOperand(0)->getName()) + "_cast", callInst);
|
|
lCopyMetadata(castPtr, callInst);
|
|
int align;
|
|
if (g->opt.forceAlignedMemory)
|
|
align = g->target->getNativeVectorAlignment();
|
|
else
|
|
align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
|
|
#if ISPC_LLVM_VERSION < ISPC_LLVM_11_0
|
|
llvm::Instruction *loadInst =
|
|
new llvm::LoadInst(castPtr, llvm::Twine(callInst->getArgOperand(0)->getName()) + "_load",
|
|
false /* not volatile */, llvm::MaybeAlign(align), (llvm::Instruction *)NULL);
|
|
#else
|
|
llvm::Instruction *loadInst = new llvm::LoadInst(
|
|
llvm::dyn_cast<llvm::PointerType>(castPtr->getType())->getPointerElementType(), castPtr,
|
|
llvm::Twine(callInst->getArgOperand(0)->getName()) + "_load", false /* not volatile */,
|
|
llvm::MaybeAlign(align).valueOrOne(), (llvm::Instruction *)NULL);
|
|
#endif
|
|
lCopyMetadata(loadInst, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, loadInst);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
} else if (callInst->getCalledFunction() == avxMaskedStore32 ||
|
|
callInst->getCalledFunction() == avxMaskedStore64) {
|
|
// NOTE: mask is the 2nd parameter, not the 3rd one!!
|
|
llvm::Value *factor = callInst->getArgOperand(1);
|
|
MaskStatus maskStatus = lGetMaskStatus(factor);
|
|
if (maskStatus == MaskStatus::all_off) {
|
|
// nothing actually being stored, just remove the inst
|
|
callInst->eraseFromParent();
|
|
modifiedAny = true;
|
|
goto restart;
|
|
} else if (maskStatus == MaskStatus::all_on) {
|
|
// all lanes storing, so replace with a regular store
|
|
llvm::Value *rvalue = callInst->getArgOperand(2);
|
|
llvm::Type *storeType = rvalue->getType();
|
|
llvm::Value *castPtr =
|
|
new llvm::BitCastInst(callInst->getArgOperand(0), llvm::PointerType::get(storeType, 0),
|
|
llvm::Twine(callInst->getArgOperand(0)->getName()) + "_ptrcast", callInst);
|
|
lCopyMetadata(castPtr, callInst);
|
|
|
|
int align;
|
|
if (g->opt.forceAlignedMemory)
|
|
align = g->target->getNativeVectorAlignment();
|
|
else
|
|
align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
|
|
llvm::StoreInst *storeInst = new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL,
|
|
llvm::MaybeAlign(align).valueOrOne());
|
|
lCopyMetadata(storeInst, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, storeInst);
|
|
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
}
|
|
|
|
DEBUG_END_PASS("IntrinsicsOpt");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool IntrinsicsOpt::runOnFunction(llvm::Function &F) {
|
|
|
|
llvm::TimeTraceScope FuncScope("IntrinsicsOpt::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool IntrinsicsOpt::matchesMaskInstruction(llvm::Function *function) {
|
|
for (unsigned int i = 0; i < maskInstructions.size(); ++i) {
|
|
if (maskInstructions[i].function != NULL && function == maskInstructions[i].function) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
IntrinsicsOpt::BlendInstruction *IntrinsicsOpt::matchingBlendInstruction(llvm::Function *function) {
|
|
for (unsigned int i = 0; i < blendInstructions.size(); ++i) {
|
|
if (blendInstructions[i].function != NULL && function == blendInstructions[i].function) {
|
|
return &blendInstructions[i];
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static llvm::Pass *CreateIntrinsicsOptPass() { return new IntrinsicsOpt; }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
|
|
/** This simple optimization pass looks for a vector select instruction
|
|
with an all-on or all-off constant mask, simplifying it to the
|
|
appropriate operand if so.
|
|
|
|
@todo The better thing to do would be to submit a patch to LLVM to get
|
|
these; they're presumably pretty simple patterns to match.
|
|
*/
|
|
class InstructionSimplifyPass : public llvm::FunctionPass {
|
|
public:
|
|
InstructionSimplifyPass() : FunctionPass(ID) {}
|
|
|
|
llvm::StringRef getPassName() const { return "Vector Select Optimization"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
bool runOnFunction(llvm::Function &F);
|
|
|
|
static char ID;
|
|
|
|
private:
|
|
static bool simplifySelect(llvm::SelectInst *selectInst, llvm::BasicBlock::iterator iter);
|
|
static llvm::Value *simplifyBoolVec(llvm::Value *value);
|
|
static bool simplifyCall(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter);
|
|
};
|
|
|
|
char InstructionSimplifyPass::ID = 0;
|
|
|
|
llvm::Value *InstructionSimplifyPass::simplifyBoolVec(llvm::Value *value) {
|
|
llvm::TruncInst *trunc = llvm::dyn_cast<llvm::TruncInst>(value);
|
|
if (trunc != NULL) {
|
|
// Convert trunc({sext,zext}(i1 vector)) -> (i1 vector)
|
|
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(value);
|
|
if (sext && sext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
|
|
return sext->getOperand(0);
|
|
|
|
llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(value);
|
|
if (zext && zext->getOperand(0)->getType() == LLVMTypes::Int1VectorType)
|
|
return zext->getOperand(0);
|
|
}
|
|
/*
|
|
// This optimization has discernable benefit on the perf
|
|
// suite on latest LLVM versions.
|
|
// On 3.4+ (maybe even older), it can result in illegal
|
|
// operations, so it's being disabled.
|
|
llvm::ICmpInst *icmp = llvm::dyn_cast<llvm::ICmpInst>(value);
|
|
if (icmp != NULL) {
|
|
// icmp(ne, {sext,zext}(foo), zeroinitializer) -> foo
|
|
if (icmp->getSignedPredicate() == llvm::CmpInst::ICMP_NE) {
|
|
llvm::Value *op1 = icmp->getOperand(1);
|
|
if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
|
|
llvm::Value *op0 = icmp->getOperand(0);
|
|
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(op0);
|
|
if (sext)
|
|
return sext->getOperand(0);
|
|
llvm::ZExtInst *zext = llvm::dyn_cast<llvm::ZExtInst>(op0);
|
|
if (zext)
|
|
return zext->getOperand(0);
|
|
}
|
|
}
|
|
|
|
}
|
|
*/
|
|
return NULL;
|
|
}
|
|
|
|
bool InstructionSimplifyPass::simplifySelect(llvm::SelectInst *selectInst, llvm::BasicBlock::iterator iter) {
|
|
if (selectInst->getType()->isVectorTy() == false)
|
|
return false;
|
|
Assert(selectInst->getOperand(1) != NULL);
|
|
Assert(selectInst->getOperand(2) != NULL);
|
|
llvm::Value *factor = selectInst->getOperand(0);
|
|
|
|
// Simplify all-on or all-off mask values
|
|
MaskStatus maskStatus = lGetMaskStatus(factor);
|
|
llvm::Value *value = NULL;
|
|
if (maskStatus == MaskStatus::all_on)
|
|
// Mask all on -> replace with the first select value
|
|
value = selectInst->getOperand(1);
|
|
else if (maskStatus == MaskStatus::all_off)
|
|
// Mask all off -> replace with the second select value
|
|
value = selectInst->getOperand(2);
|
|
if (value != NULL) {
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, value);
|
|
return true;
|
|
}
|
|
|
|
// Sometimes earlier LLVM optimization passes generate unnecessarily
|
|
// complex expressions for the selection vector, which in turn confuses
|
|
// the code generators and leads to sub-optimal code (particularly for
|
|
// 8 and 16-bit masks). We'll try to simplify them out here so that
|
|
// the code generator patterns match.
|
|
if ((factor = simplifyBoolVec(factor)) != NULL) {
|
|
llvm::Instruction *newSelect = llvm::SelectInst::Create(factor, selectInst->getOperand(1),
|
|
selectInst->getOperand(2), selectInst->getName());
|
|
llvm::ReplaceInstWithInst(selectInst, newSelect);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool InstructionSimplifyPass::simplifyCall(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter) {
|
|
llvm::Function *calledFunc = callInst->getCalledFunction();
|
|
|
|
// Turn a __movmsk call with a compile-time constant vector into the
|
|
// equivalent scalar value.
|
|
if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk"))
|
|
return false;
|
|
|
|
uint64_t mask;
|
|
if (lGetMask(callInst->getArgOperand(0), &mask) == true) {
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, LLVMInt64(mask));
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool InstructionSimplifyPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("InstructionSimplify");
|
|
|
|
bool modifiedAny = false;
|
|
|
|
restart:
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
llvm::SelectInst *selectInst = llvm::dyn_cast<llvm::SelectInst>(&*iter);
|
|
if (selectInst && simplifySelect(selectInst, iter)) {
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
|
|
if (callInst && simplifyCall(callInst, iter)) {
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
|
|
DEBUG_END_PASS("InstructionSimplify");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool InstructionSimplifyPass::runOnFunction(llvm::Function &F) {
|
|
|
|
llvm::TimeTraceScope FuncScope("InstructionSimplifyPass::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateInstructionSimplifyPass() { return new InstructionSimplifyPass; }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// ImproveMemoryOpsPass
|
|
|
|
/** When the front-end emits gathers and scatters, it generates an array of
|
|
vector-width pointers to represent the set of addresses to read from or
|
|
write to. This optimization detects cases when the base pointer is a
|
|
uniform pointer or when the indexing is into an array that can be
|
|
converted into scatters/gathers from a single base pointer and an array
|
|
of offsets.
|
|
|
|
See for example the comments discussing the __pseudo_gather functions
|
|
in builtins.cpp for more information about this.
|
|
*/
|
|
class ImproveMemoryOpsPass : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
ImproveMemoryOpsPass() : FunctionPass(ID) {}
|
|
|
|
llvm::StringRef getPassName() const { return "Improve Memory Ops"; }
|
|
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
|
|
bool runOnFunction(llvm::Function &F);
|
|
};
|
|
|
|
char ImproveMemoryOpsPass::ID = 0;
|
|
|
|
/** Check to make sure that this value is actually a pointer in the end.
|
|
We need to make sure that given an expression like vec(offset) +
|
|
ptr2int(ptr), lGetBasePointer() doesn't return vec(offset) for the base
|
|
pointer such that we then treat ptr2int(ptr) as an offset. This ends
|
|
up being important so that we don't generate LLVM GEP instructions like
|
|
"gep inttoptr 8, i64 %ptr", which in turn can lead to incorrect code
|
|
since LLVM's pointer aliasing analysis assumes that operands after the
|
|
first one to a GEP aren't pointers.
|
|
*/
|
|
static llvm::Value *lCheckForActualPointer(llvm::Value *v) {
|
|
if (v == NULL) {
|
|
return NULL;
|
|
} else if (llvm::isa<llvm::PointerType>(v->getType())) {
|
|
return v;
|
|
} else if (llvm::isa<llvm::PtrToIntInst>(v)) {
|
|
return v;
|
|
}
|
|
// This one is tricky, as it's a heuristic tuned for LLVM 3.7+, which may
// optimize a load of a double* followed by ptrtoint into a straight load of i64.
// This heuristic should be good enough to catch all the cases we should
// detect and nothing else.
|
|
else if (llvm::isa<llvm::LoadInst>(v)) {
|
|
return v;
|
|
}
|
|
|
|
else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
|
|
llvm::Value *t = lCheckForActualPointer(ci->getOperand(0));
|
|
if (t == NULL) {
|
|
return NULL;
|
|
} else {
|
|
return v;
|
|
}
|
|
} else {
|
|
llvm::ConstantExpr *uce = llvm::dyn_cast<llvm::ConstantExpr>(v);
|
|
if (uce != NULL && uce->getOpcode() == llvm::Instruction::PtrToInt)
|
|
return v;
|
|
return NULL;
|
|
}
|
|
}
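// For instance, given "%sum = add <8 x i64> %offs, ptrtoint(%ptr)", this helper
// is what keeps lGetBasePointer() below from handing back %offs as the "pointer"
// and %ptr as the "offset": only values that trace back to an actual pointer
// (a pointer-typed value, a ptrtoint, a load, or a cast of one of those) are
// accepted.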
|
|
|
|
/** Given a llvm::Value representing a varying pointer, this function
|
|
checks to see if all of the elements of the vector have the same value
|
|
(i.e. there's a common base pointer). If a broadcast has already been detected,
it checks that the first element of the vector is not undef. If one of the conditions
is true, it returns the common pointer value; otherwise it returns NULL.
|
|
*/
|
|
static llvm::Value *lGetBasePointer(llvm::Value *v, llvm::Instruction *insertBefore, bool broadcastDetected) {
|
|
if (llvm::isa<llvm::InsertElementInst>(v) || llvm::isa<llvm::ShuffleVectorInst>(v)) {
|
|
// If we have already detected broadcast we want to look for
|
|
// the vector with the first not-undef element
|
|
llvm::Value *element = LLVMFlattenInsertChain(v, g->target->getVectorWidth(), true, false, broadcastDetected);
|
|
// TODO: it's probably ok to allow undefined elements and return
|
|
// the base pointer if all of the other elements have the same
|
|
// value.
|
|
if (element != NULL) {
|
|
// all elements are the same and not NULLs
|
|
return lCheckForActualPointer(element);
|
|
} else {
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
// This case comes up with global/static arrays
|
|
if (llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(v)) {
|
|
return lCheckForActualPointer(cv->getSplatValue());
|
|
} else if (llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(v)) {
|
|
return lCheckForActualPointer(cdv->getSplatValue());
|
|
}
|
|
// It is a little bit tricky to handle operations on pointers that have been
// cast to an int of a different bit size, but sometimes it is useful, so we
// handle this case here.
|
|
else if (llvm::CastInst *ci = llvm::dyn_cast<llvm::CastInst>(v)) {
|
|
llvm::Value *t = lGetBasePointer(ci->getOperand(0), insertBefore, broadcastDetected);
|
|
if (t == NULL) {
|
|
return NULL;
|
|
} else {
|
|
return llvm::CastInst::Create(ci->getOpcode(), t, ci->getType()->getScalarType(),
|
|
llvm::Twine(t->getName()) + "_cast", insertBefore);
|
|
}
|
|
}
|
|
|
|
return NULL;
|
|
}
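// For example, a varying pointer built as a broadcast (an insertelement of a
// uniform pointer into lane 0 followed by a zero-mask shufflevector) resolves
// here to that single uniform pointer, which lets the callers below turn a
// general gather/scatter into a common base pointer plus a vector of offsets.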
|
|
|
|
/** Given the two operands to a constant add expression, see if we have the
form "base pointer + offset", where op0 is the base pointer and op1 is
the offset; if so, return the base and the offset. */
|
|
static llvm::Constant *lGetConstantAddExprBaseOffset(llvm::Constant *op0, llvm::Constant *op1, llvm::Constant **delta) {
|
|
llvm::ConstantExpr *op = llvm::dyn_cast<llvm::ConstantExpr>(op0);
|
|
if (op == NULL || op->getOpcode() != llvm::Instruction::PtrToInt)
|
|
// the first operand isn't a pointer
|
|
return NULL;
|
|
|
|
llvm::ConstantInt *opDelta = llvm::dyn_cast<llvm::ConstantInt>(op1);
|
|
if (opDelta == NULL)
|
|
// the second operand isn't an integer operand
|
|
return NULL;
|
|
|
|
*delta = opDelta;
|
|
return op0;
|
|
}
|
|
|
|
static llvm::Value *lExtractFromInserts(llvm::Value *v, unsigned int index) {
|
|
llvm::InsertValueInst *iv = llvm::dyn_cast<llvm::InsertValueInst>(v);
|
|
if (iv == NULL)
|
|
return NULL;
|
|
|
|
Assert(iv->hasIndices() && iv->getNumIndices() == 1);
|
|
if (iv->getIndices()[0] == index)
|
|
return iv->getInsertedValueOperand();
|
|
else
|
|
return lExtractFromInserts(iv->getAggregateOperand(), index);
|
|
}
|
|
|
|
/** Given a varying pointer in ptrs, this function checks to see if it can
    be determined to be indexing from a common uniform base pointer.  If
    so, the function returns the base pointer llvm::Value and initializes
    *offsets with an int vector of the per-lane offsets.
*/
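// Worked example (illustrative, not from the original source): for an ispc
// access like a[idx], where "a" is a uniform float array and "idx" is a varying
// int32, the varying pointer vector is conceptually
//   ptrs[lane] = (int64)&a[0] + 4 * idx[lane]
// and this routine is expected to return &a[0] as the base pointer, with
// *offsets holding the per-lane byte offsets 4 * idx[lane].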
static llvm::Value *lGetBasePtrAndOffsets(llvm::Value *ptrs, llvm::Value **offsets, llvm::Instruction *insertBefore) {
|
|
#ifndef ISPC_NO_DUMPS
|
|
if (g->debugPrint) {
|
|
fprintf(stderr, "lGetBasePtrAndOffsets\n");
|
|
LLVMDumpValue(ptrs);
|
|
}
|
|
#endif
|
|
|
|
bool broadcastDetected = false;
|
|
// Looking for %gep_offset = shufflevector <8 x i64> %0, <8 x i64> undef, <8 x i32> zeroinitializer
|
|
llvm::ShuffleVectorInst *shuffle = llvm::dyn_cast<llvm::ShuffleVectorInst>(ptrs);
|
|
if (shuffle != NULL) {
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
llvm::Value *indices = shuffle->getShuffleMaskForBitcode();
|
|
#else
|
|
llvm::Value *indices = shuffle->getOperand(2);
|
|
#endif
|
|
llvm::Value *vec = shuffle->getOperand(1);
|
|
|
|
if (lIsUndef(vec) && llvm::isa<llvm::ConstantAggregateZero>(indices)) {
|
|
broadcastDetected = true;
|
|
}
|
|
}
|
|
llvm::Value *base = lGetBasePointer(ptrs, insertBefore, broadcastDetected);
|
|
if (base != NULL) {
|
|
// We have a straight up varying pointer with no indexing that's
|
|
// actually all the same value.
|
|
if (g->target->is32Bit())
|
|
*offsets = LLVMInt32Vector(0);
|
|
else
|
|
*offsets = LLVMInt64Vector((int64_t)0);
|
|
|
|
if (broadcastDetected) {
|
|
llvm::Value *op = shuffle->getOperand(0);
|
|
llvm::BinaryOperator *bop_var = llvm::dyn_cast<llvm::BinaryOperator>(op);
|
|
if (bop_var != NULL && ((bop_var->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(bop_var))) {
|
|
// We expect here ConstantVector as
|
|
// <i64 4, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef, i64 undef>
|
|
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(bop_var->getOperand(1));
|
|
llvm::Value *shuffle_offset = NULL;
|
|
if (cv != NULL) {
|
|
llvm::Value *zeroMask =
|
|
#if ISPC_LLVM_VERSION < ISPC_LLVM_11_0
|
|
llvm::ConstantVector::getSplat(cv->getType()->getVectorNumElements(),
|
|
#elif ISPC_LLVM_VERSION < ISPC_LLVM_12_0
|
|
llvm::ConstantVector::getSplat(
|
|
{llvm::dyn_cast<llvm::VectorType>(cv->getType())->getNumElements(), false},
|
|
#else // LLVM 12.0+
|
|
llvm::ConstantVector::getSplat(
|
|
llvm::ElementCount::get(
|
|
llvm::dyn_cast<llvm::FixedVectorType>(cv->getType())->getNumElements(), false),
|
|
#endif
|
|
llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
|
|
// Create offset
|
|
shuffle_offset = new llvm::ShuffleVectorInst(cv, llvm::UndefValue::get(cv->getType()), zeroMask,
|
|
"shuffle", bop_var);
|
|
} else {
|
|
// Alternatively, the first operand of the binary operator may itself be
// another binary operator that computes another part of the offset:
// %another_bop = bop <16 x i32> %vec, <i32 7, i32 undef, i32 undef, ...>
// %offsets = add <16 x i32> %another_bop, %base
bop_var = llvm::dyn_cast<llvm::BinaryOperator>(bop_var->getOperand(0));
|
|
if (bop_var != NULL) {
|
|
llvm::Type *bop_var_type = bop_var->getType();
|
|
llvm::Value *zeroMask = llvm::ConstantVector::getSplat(
|
|
#if ISPC_LLVM_VERSION < ISPC_LLVM_11_0
|
|
bop_var_type->getVectorNumElements(),
|
|
#elif ISPC_LLVM_VERSION < ISPC_LLVM_12_0
|
|
{llvm::dyn_cast<llvm::VectorType>(bop_var_type)->getNumElements(), false},
|
|
#else // LLVM 12.0+
|
|
llvm::ElementCount::get(
|
|
llvm::dyn_cast<llvm::FixedVectorType>(bop_var_type)->getNumElements(), false),
|
|
#endif
|
|
llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
|
|
shuffle_offset = new llvm::ShuffleVectorInst(bop_var, llvm::UndefValue::get(bop_var_type),
|
|
zeroMask, "shuffle", bop_var);
|
|
}
|
|
}
|
|
if (shuffle_offset != NULL) {
|
|
*offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, shuffle_offset,
|
|
"new_offsets", insertBefore);
|
|
return base;
|
|
} else {
|
|
// Base + offset pattern was not recognized
|
|
return NULL;
|
|
}
|
|
}
|
|
}
|
|
return base;
|
|
}
|
|
|
|
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(ptrs);
|
|
if (bop != NULL && ((bop->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(bop))) {
|
|
// If we have a common pointer plus something, then we're also
|
|
// good.
|
|
if ((base = lGetBasePtrAndOffsets(bop->getOperand(0), offsets, insertBefore)) != NULL) {
|
|
*offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, bop->getOperand(1), "new_offsets",
|
|
insertBefore);
|
|
return base;
|
|
} else if ((base = lGetBasePtrAndOffsets(bop->getOperand(1), offsets, insertBefore)) != NULL) {
|
|
*offsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, *offsets, bop->getOperand(0), "new_offsets",
|
|
insertBefore);
|
|
return base;
|
|
}
|
|
}
|
|
llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(ptrs);
|
|
if (cv != NULL) {
|
|
// Indexing into global arrays can lead to this form, with
|
|
// ConstantVectors..
|
|
llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
|
|
for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
|
|
llvm::Constant *c = llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
|
|
if (c == NULL)
|
|
return NULL;
|
|
elements.push_back(c);
|
|
}
|
|
|
|
llvm::Constant *delta[ISPC_MAX_NVEC];
|
|
for (unsigned int i = 0; i < elements.size(); ++i) {
|
|
// For each element, try to decompose it into either a straight
|
|
// up base pointer, or a base pointer plus an integer value.
|
|
llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(elements[i]);
|
|
if (ce == NULL)
|
|
return NULL;
|
|
|
|
delta[i] = NULL;
|
|
llvm::Value *elementBase = NULL; // base pointer for this element
|
|
if (ce->getOpcode() == llvm::Instruction::PtrToInt) {
|
|
// If the element is just a ptr to int instruction, treat
|
|
// it as having an offset of zero
|
|
elementBase = ce;
|
|
delta[i] = g->target->is32Bit() ? LLVMInt32(0) : LLVMInt64(0);
|
|
} else if ((ce->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(ce)) {
|
|
// Try both orderings of the operands to see if we can get
|
|
// a pointer+offset out of them.
|
|
elementBase = lGetConstantAddExprBaseOffset(ce->getOperand(0), ce->getOperand(1), &delta[i]);
|
|
if (elementBase == NULL)
|
|
elementBase = lGetConstantAddExprBaseOffset(ce->getOperand(1), ce->getOperand(0), &delta[i]);
|
|
}
|
|
|
|
// We weren't able to find a base pointer in the above. (We
|
|
// don't expect this to happen; if it does, it may be necessary
|
|
// to handle more cases in the decomposition above.)
|
|
if (elementBase == NULL)
|
|
return NULL;
|
|
|
|
Assert(delta[i] != NULL);
|
|
if (base == NULL)
|
|
// The first time we've found a base pointer
|
|
base = elementBase;
|
|
else if (base != elementBase)
|
|
// Different program instances have different base
|
|
// pointers, so no luck.
|
|
return NULL;
|
|
}
|
|
|
|
Assert(base != NULL);
|
|
llvm::ArrayRef<llvm::Constant *> deltas(&delta[0], &delta[elements.size()]);
|
|
*offsets = llvm::ConstantVector::get(deltas);
|
|
return base;
|
|
}
|
|
|
|
llvm::ExtractValueInst *ev = llvm::dyn_cast<llvm::ExtractValueInst>(ptrs);
|
|
if (ev != NULL) {
|
|
Assert(ev->getNumIndices() == 1);
|
|
int index = ev->getIndices()[0];
|
|
ptrs = lExtractFromInserts(ev->getAggregateOperand(), index);
|
|
if (ptrs != NULL)
|
|
return lGetBasePtrAndOffsets(ptrs, offsets, insertBefore);
|
|
}
|
|
|
|
return NULL;
|
|
}
/** Given a vector expression in vec, separate it into a compile-time
    constant component and a variable component, returning the two parts in
    *constOffset and *variableOffset.  (It should be the case that the sum
    of these two is exactly equal to the original vector.)

    This routine only handles some (important) patterns; in some cases it
    will fail and return components that are actually compile-time
    constants in *variableOffset.

    Finally, if there aren't any constant (or, respectively, variable)
    components, the corresponding return value may be set to NULL.
*/
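// Illustrative example (not from the original source): for
//   vec = <i32 16, i32 16, i32 16, i32 16> + %varying_term
// the routine would return *constOffset = <16, 16, 16, 16> and
// *variableOffset = %varying_term; either output may be NULL when the
// corresponding component is absent.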
static void lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset, llvm::Value **variableOffset,
|
|
llvm::Instruction *insertBefore) {
|
|
if (llvm::isa<llvm::ConstantVector>(vec) || llvm::isa<llvm::ConstantDataVector>(vec) ||
|
|
llvm::isa<llvm::ConstantAggregateZero>(vec)) {
|
|
*constOffset = vec;
|
|
*variableOffset = NULL;
|
|
return;
|
|
}
|
|
|
|
llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(vec);
|
|
if (cast != NULL) {
|
|
// Check the cast target.
|
|
llvm::Value *co, *vo;
|
|
lExtractConstantOffset(cast->getOperand(0), &co, &vo, insertBefore);
|
|
|
|
// make new cast instructions for the two parts
|
|
if (co == NULL)
|
|
*constOffset = NULL;
|
|
else
|
|
*constOffset = llvm::CastInst::Create(cast->getOpcode(), co, cast->getType(),
|
|
llvm::Twine(co->getName()) + "_cast", insertBefore);
|
|
if (vo == NULL)
|
|
*variableOffset = NULL;
|
|
else
|
|
*variableOffset = llvm::CastInst::Create(cast->getOpcode(), vo, cast->getType(),
|
|
llvm::Twine(vo->getName()) + "_cast", insertBefore);
|
|
return;
|
|
}
|
|
|
|
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(vec);
|
|
if (bop != NULL) {
|
|
llvm::Value *op0 = bop->getOperand(0);
|
|
llvm::Value *op1 = bop->getOperand(1);
|
|
llvm::Value *c0, *v0, *c1, *v1;
|
|
|
|
if ((bop->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(bop)) {
|
|
lExtractConstantOffset(op0, &c0, &v0, insertBefore);
|
|
lExtractConstantOffset(op1, &c1, &v1, insertBefore);
|
|
|
|
if (c0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c0))
|
|
*constOffset = c1;
|
|
else if (c1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(c1))
|
|
*constOffset = c0;
|
|
else
|
|
*constOffset = llvm::BinaryOperator::Create(
|
|
llvm::Instruction::Add, c0, c1, ((llvm::Twine("add_") + c0->getName()) + "_") + c1->getName(),
|
|
insertBefore);
|
|
|
|
if (v0 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v0))
|
|
*variableOffset = v1;
|
|
else if (v1 == NULL || llvm::isa<llvm::ConstantAggregateZero>(v1))
|
|
*variableOffset = v0;
|
|
else
|
|
*variableOffset = llvm::BinaryOperator::Create(
|
|
llvm::Instruction::Add, v0, v1, ((llvm::Twine("add_") + v0->getName()) + "_") + v1->getName(),
|
|
insertBefore);
|
|
return;
|
|
} else if (bop->getOpcode() == llvm::Instruction::Shl) {
|
|
lExtractConstantOffset(op0, &c0, &v0, insertBefore);
|
|
lExtractConstantOffset(op1, &c1, &v1, insertBefore);
|
|
|
|
// Given the product of constant and variable terms, we have:
|
|
// (c0 + v0) * (2^(c1 + v1)) = c0 * 2^c1 * 2^v1 + v0 * 2^c1 * 2^v1
|
|
// We can optimize only if v1 == NULL.
|
|
if ((v1 != NULL) || (c0 == NULL) || (c1 == NULL)) {
|
|
*constOffset = NULL;
|
|
*variableOffset = vec;
|
|
} else if (v0 == NULL) {
|
|
*constOffset = vec;
|
|
*variableOffset = NULL;
|
|
} else {
|
|
*constOffset = llvm::BinaryOperator::Create(
|
|
llvm::Instruction::Shl, c0, c1, ((llvm::Twine("shl_") + c0->getName()) + "_") + c1->getName(),
|
|
insertBefore);
|
|
*variableOffset = llvm::BinaryOperator::Create(
|
|
llvm::Instruction::Shl, v0, c1, ((llvm::Twine("shl_") + v0->getName()) + "_") + c1->getName(),
|
|
insertBefore);
|
|
}
|
|
return;
|
|
} else if (bop->getOpcode() == llvm::Instruction::Mul) {
|
|
lExtractConstantOffset(op0, &c0, &v0, insertBefore);
|
|
lExtractConstantOffset(op1, &c1, &v1, insertBefore);
|
|
|
|
// Given the product of constant and variable terms, we have:
|
|
// (c0 + v0) * (c1 + v1) == (c0 c1) + (v0 c1 + c0 v1 + v0 v1)
|
|
// Note that the first term is a constant and the last three are
|
|
// variable.
|
|
if (c0 != NULL && c1 != NULL)
|
|
*constOffset = llvm::BinaryOperator::Create(
|
|
llvm::Instruction::Mul, c0, c1, ((llvm::Twine("mul_") + c0->getName()) + "_") + c1->getName(),
|
|
insertBefore);
|
|
else
|
|
*constOffset = NULL;
|
|
|
|
llvm::Value *va = NULL, *vb = NULL, *vc = NULL;
|
|
if (v0 != NULL && c1 != NULL)
|
|
va = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, c1,
|
|
((llvm::Twine("mul_") + v0->getName()) + "_") + c1->getName(),
|
|
insertBefore);
|
|
if (c0 != NULL && v1 != NULL)
|
|
vb = llvm::BinaryOperator::Create(llvm::Instruction::Mul, c0, v1,
|
|
((llvm::Twine("mul_") + c0->getName()) + "_") + v1->getName(),
|
|
insertBefore);
|
|
if (v0 != NULL && v1 != NULL)
|
|
vc = llvm::BinaryOperator::Create(llvm::Instruction::Mul, v0, v1,
|
|
((llvm::Twine("mul_") + v0->getName()) + "_") + v1->getName(),
|
|
insertBefore);
|
|
|
|
llvm::Value *vab = NULL;
|
|
if (va != NULL && vb != NULL)
|
|
vab = llvm::BinaryOperator::Create(llvm::Instruction::Add, va, vb,
|
|
((llvm::Twine("add_") + va->getName()) + "_") + vb->getName(),
|
|
insertBefore);
|
|
else if (va != NULL)
|
|
vab = va;
|
|
else
|
|
vab = vb;
|
|
|
|
if (vab != NULL && vc != NULL)
|
|
*variableOffset = llvm::BinaryOperator::Create(
|
|
llvm::Instruction::Add, vab, vc, ((llvm::Twine("add_") + vab->getName()) + "_") + vc->getName(),
|
|
insertBefore);
|
|
else if (vab != NULL)
|
|
*variableOffset = vab;
|
|
else
|
|
*variableOffset = vc;
|
|
|
|
return;
|
|
}
|
|
}
|
|
|
|
// Nothing matched, just return what we have as a variable component
|
|
*constOffset = NULL;
|
|
*variableOffset = vec;
|
|
}
/* Returns true if the given value is a constant vector of integers with
   the same value in all of the elements.  (Returns the splatted value in
   *splat, if so.) */
static bool lIsIntegerSplat(llvm::Value *v, int *splat) {
    llvm::ConstantDataVector *cvec = llvm::dyn_cast<llvm::ConstantDataVector>(v);
    if (cvec == NULL)
        return false;

    llvm::Constant *splatConst = cvec->getSplatValue();
    if (splatConst == NULL)
        return false;

    llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(splatConst);
    if (ci == NULL)
        return false;

    int64_t splatVal = ci->getSExtValue();
    *splat = (int)splatVal;
    return true;
}

static llvm::Value *lExtract248Scale(llvm::Value *splatOperand, int splatValue, llvm::Value *otherOperand,
                                     llvm::Value **result) {
    if (splatValue == 2 || splatValue == 4 || splatValue == 8) {
        *result = otherOperand;
        return LLVMInt32(splatValue);
    }
    // Even if we don't have a common scale by exactly 2, 4, or 8, we'll
    // see if we can pull out that much of the scale anyway; this may in
    // turn allow other optimizations later.
    for (int scale = 8; scale >= 2; scale /= 2) {
        llvm::Instruction *insertBefore = llvm::dyn_cast<llvm::Instruction>(*result);
        Assert(insertBefore != NULL);

        if ((splatValue % scale) == 0) {
            // *result = otherOperand * splatOperand / scale;
            llvm::Value *splatScaleVec = (splatOperand->getType() == LLVMTypes::Int32VectorType)
                                             ? LLVMInt32Vector(scale)
                                             : LLVMInt64Vector(scale);
            llvm::Value *splatDiv =
                llvm::BinaryOperator::Create(llvm::Instruction::SDiv, splatOperand, splatScaleVec, "div", insertBefore);
            *result = llvm::BinaryOperator::Create(llvm::Instruction::Mul, splatDiv, otherOperand, "mul", insertBefore);
            return LLVMInt32(scale);
        }
    }
    return LLVMInt32(1);
}

/** Given a vector of integer offsets to a base pointer being used for a
    gather or a scatter, see if its root operation is a multiply of some
    vector by a splat of 2, 4, or 8.  If not, an i32 scale of 1 is returned.

    If it is, return the scale as an i32 value of 2, 4, or 8 and modify
    *vec so that it points to the operand that is being multiplied by
    2/4/8.

    We go through all this trouble so that we can pass the i32 scale factor
    to the {gather,scatter}_base_offsets function as a separate scale
    factor for the offsets.  This in turn is used in a way so that the LLVM
    x86 code generator matches it to apply x86's free scale by 2x, 4x, or
    8x to one of the two registers being added together in an addressing
    calculation.
*/
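// Illustrative example (not from the original source): for offsets computed as
//   %offsets = mul <WIDTH x i32> %index, <i32 4, i32 4, ...>
// this returns the i32 constant 4 and rewrites *vec to point at %index, so the
// 4x factor can later be folded into the x86 scaled-index addressing mode.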
static llvm::Value *lExtractOffsetVector248Scale(llvm::Value **vec) {
|
|
llvm::CastInst *cast = llvm::dyn_cast<llvm::CastInst>(*vec);
|
|
if (cast != NULL) {
|
|
llvm::Value *castOp = cast->getOperand(0);
|
|
// Check the cast target.
|
|
llvm::Value *scale = lExtractOffsetVector248Scale(&castOp);
|
|
if (scale == NULL)
|
|
return NULL;
|
|
|
|
// make a new cast instruction so that we end up with the right
|
|
// type
|
|
*vec = llvm::CastInst::Create(cast->getOpcode(), castOp, cast->getType(), "offset_cast", cast);
|
|
return scale;
|
|
}
|
|
|
|
// If we don't have a binary operator, then just give up
|
|
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
|
|
if (bop == NULL)
|
|
return LLVMInt32(1);
|
|
|
|
llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
|
|
if ((bop->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(bop)) {
|
|
if (llvm::isa<llvm::ConstantAggregateZero>(op0)) {
|
|
*vec = op1;
|
|
return lExtractOffsetVector248Scale(vec);
|
|
} else if (llvm::isa<llvm::ConstantAggregateZero>(op1)) {
|
|
*vec = op0;
|
|
return lExtractOffsetVector248Scale(vec);
|
|
} else {
|
|
llvm::Value *s0 = lExtractOffsetVector248Scale(&op0);
|
|
llvm::Value *s1 = lExtractOffsetVector248Scale(&op1);
|
|
if (s0 == s1) {
|
|
*vec = llvm::BinaryOperator::Create(llvm::Instruction::Add, op0, op1, "new_add", bop);
|
|
return s0;
|
|
} else
|
|
return LLVMInt32(1);
|
|
}
|
|
} else if (bop->getOpcode() == llvm::Instruction::Mul) {
|
|
// Check each operand for being one of the scale factors we care about.
|
|
int splat;
|
|
if (lIsIntegerSplat(op0, &splat))
|
|
return lExtract248Scale(op0, splat, op1, vec);
|
|
else if (lIsIntegerSplat(op1, &splat))
|
|
return lExtract248Scale(op1, splat, op0, vec);
|
|
else
|
|
return LLVMInt32(1);
|
|
} else
|
|
return LLVMInt32(1);
|
|
}
#if 0
|
|
static llvm::Value *
|
|
lExtractUniforms(llvm::Value **vec, llvm::Instruction *insertBefore) {
|
|
fprintf(stderr, " lextract: ");
|
|
(*vec)->dump();
|
|
fprintf(stderr, "\n");
|
|
|
|
if (llvm::isa<llvm::ConstantVector>(*vec) ||
|
|
llvm::isa<llvm::ConstantDataVector>(*vec) ||
|
|
llvm::isa<llvm::ConstantAggregateZero>(*vec))
|
|
return NULL;
|
|
|
|
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(*vec);
|
|
if (sext != NULL) {
|
|
llvm::Value *sextOp = sext->getOperand(0);
|
|
// Check the sext target.
|
|
llvm::Value *unif = lExtractUniforms(&sextOp, insertBefore);
|
|
if (unif == NULL)
|
|
return NULL;
|
|
|
|
// make a new sext instruction so that we end up with the right
|
|
// type
|
|
*vec = new llvm::SExtInst(sextOp, sext->getType(), "offset_sext", sext);
|
|
return unif;
|
|
}
|
|
|
|
if (LLVMVectorValuesAllEqual(*vec)) {
|
|
// FIXME: we may want to redo all of the expression here, in scalar
|
|
// form (if at all possible), for code quality...
|
|
llvm::Value *unif =
|
|
llvm::ExtractElementInst::Create(*vec, LLVMInt32(0),
|
|
"first_uniform", insertBefore);
|
|
*vec = NULL;
|
|
return unif;
|
|
}
|
|
|
|
llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(*vec);
|
|
if (bop == NULL)
|
|
return NULL;
|
|
|
|
llvm::Value *op0 = bop->getOperand(0), *op1 = bop->getOperand(1);
|
|
if (bop->getOpcode() == llvm::Instruction::Add) {
|
|
llvm::Value *s0 = lExtractUniforms(&op0, insertBefore);
|
|
llvm::Value *s1 = lExtractUniforms(&op1, insertBefore);
|
|
if (s0 == NULL && s1 == NULL)
|
|
return NULL;
|
|
|
|
if (op0 == NULL)
|
|
*vec = op1;
|
|
else if (op1 == NULL)
|
|
*vec = op0;
|
|
else
|
|
*vec = llvm::BinaryOperator::Create(llvm::Instruction::Add,
|
|
op0, op1, "new_add", insertBefore);
|
|
|
|
if (s0 == NULL)
|
|
return s1;
|
|
else if (s1 == NULL)
|
|
return s0;
|
|
else
|
|
return llvm::BinaryOperator::Create(llvm::Instruction::Add, s0, s1,
|
|
"add_unif", insertBefore);
|
|
}
|
|
#if 0
|
|
else if (bop->getOpcode() == llvm::Instruction::Mul) {
|
|
// Check each operand for being one of the scale factors we care about.
|
|
int splat;
|
|
if (lIs248Splat(op0, &splat)) {
|
|
*vec = op1;
|
|
return LLVMInt32(splat);
|
|
}
|
|
else if (lIs248Splat(op1, &splat)) {
|
|
*vec = op0;
|
|
return LLVMInt32(splat);
|
|
}
|
|
else
|
|
return LLVMInt32(1);
|
|
}
|
|
#endif
|
|
else
|
|
return NULL;
|
|
}
|
|
|
|
|
|
static void
|
|
lExtractUniformsFromOffset(llvm::Value **basePtr, llvm::Value **offsetVector,
|
|
llvm::Value *offsetScale,
|
|
llvm::Instruction *insertBefore) {
|
|
#if 1
|
|
(*basePtr)->dump();
|
|
printf("\n");
|
|
(*offsetVector)->dump();
|
|
printf("\n");
|
|
offsetScale->dump();
|
|
printf("-----\n");
|
|
#endif
|
|
|
|
llvm::Value *uniformDelta = lExtractUniforms(offsetVector, insertBefore);
|
|
if (uniformDelta == NULL)
|
|
return;
|
|
|
|
*basePtr = lGEPInst(*basePtr, arrayRef, "new_base", insertBefore);
|
|
|
|
// this should only happen if we have only uniforms, but that in turn
|
|
// shouldn't be a gather/scatter!
|
|
Assert(*offsetVector != NULL);
|
|
}
|
|
#endif
static bool lVectorIs32BitInts(llvm::Value *v) {
    int nElts;
    int64_t elts[ISPC_MAX_NVEC];
    if (!LLVMExtractVectorInts(v, elts, &nElts))
        return false;

    for (int i = 0; i < nElts; ++i)
        if ((int32_t)elts[i] != elts[i])
            return false;

    return true;
}

/** Check to see if the two offset vectors can safely be represented with
    32-bit values.  If so, return true and update the pointed-to
    llvm::Value *'s to be the 32-bit equivalents. */
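// Illustrative example (not from the original source): if the 64-bit variable
// offsets were produced as
//   %off64 = sext <WIDTH x i32> %off32 to <WIDTH x i64>
// the sext is peeled off and %off32 is used directly; constant offsets whose
// lanes all fit in 32 bits are truncated to a 32-bit vector instead.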
static bool lOffsets32BitSafe(llvm::Value **variableOffsetPtr, llvm::Value **constOffsetPtr,
|
|
llvm::Instruction *insertBefore) {
|
|
llvm::Value *variableOffset = *variableOffsetPtr;
|
|
llvm::Value *constOffset = *constOffsetPtr;
|
|
|
|
if (variableOffset->getType() != LLVMTypes::Int32VectorType) {
|
|
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(variableOffset);
|
|
if (sext != NULL && sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType)
|
|
// sext of a 32-bit vector -> the 32-bit vector is good
|
|
variableOffset = sext->getOperand(0);
|
|
else if (lVectorIs32BitInts(variableOffset))
|
|
// The only constant vector we should have here is a vector of
// all zeros (i.e. a ConstantAggregateZero), but just in case,
// do the more general check with lVectorIs32BitInts().
variableOffset = new llvm::TruncInst(variableOffset, LLVMTypes::Int32VectorType,
|
|
llvm::Twine(variableOffset->getName()) + "_trunc", insertBefore);
|
|
else
|
|
return false;
|
|
}
|
|
|
|
if (constOffset->getType() != LLVMTypes::Int32VectorType) {
|
|
if (lVectorIs32BitInts(constOffset)) {
|
|
// Truncate them so we have a 32-bit vector type for them.
|
|
constOffset = new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
|
|
llvm::Twine(constOffset->getName()) + "_trunc", insertBefore);
|
|
} else {
|
|
// FIXME: otherwise we just assume that all constant offsets
|
|
// can actually always fit into 32-bits... (This could be
|
|
// wrong, but it should be only in pretty esoteric cases). We
|
|
// make this assumption for now since we sometimes generate
|
|
// constants that need constant folding before we really have a
|
|
// constant vector out of them, and
|
|
// llvm::ConstantFoldInstruction() doesn't seem to be doing
|
|
// enough for us in some cases if we call it from here.
|
|
constOffset = new llvm::TruncInst(constOffset, LLVMTypes::Int32VectorType,
|
|
llvm::Twine(constOffset->getName()) + "_trunc", insertBefore);
|
|
}
|
|
}
|
|
|
|
*variableOffsetPtr = variableOffset;
|
|
*constOffsetPtr = constOffset;
|
|
return true;
|
|
}
|
|
|
|
/** Check to see if the offset value is composed of a string of Adds,
    SExts, and Constant Vectors that are 32-bit safe.  Recursively
    explores the operands of Add instructions (as they might themselves
    be adds that eventually terminate in constant vectors or a SExt).
*/

static bool lIs32BitSafeHelper(llvm::Value *v) {
    // handle Adds, SExts, Constant Vectors
    if (llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(v)) {
        if ((bop->getOpcode() == llvm::Instruction::Add) || IsOrEquivalentToAdd(bop)) {
            return lIs32BitSafeHelper(bop->getOperand(0)) && lIs32BitSafeHelper(bop->getOperand(1));
        }
        return false;
    } else if (llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(v)) {
        return sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType;
    } else
        return lVectorIs32BitInts(v);
}

/** Check to see if the single offset vector can safely be represented with
    32-bit values.  If so, return true and update the pointed-to
    llvm::Value * to be the 32-bit equivalent. */
static bool lOffsets32BitSafe(llvm::Value **offsetPtr, llvm::Instruction *insertBefore) {
|
|
llvm::Value *offset = *offsetPtr;
|
|
|
|
if (offset->getType() == LLVMTypes::Int32VectorType)
|
|
return true;
|
|
|
|
llvm::SExtInst *sext = llvm::dyn_cast<llvm::SExtInst>(offset);
|
|
if (sext != NULL && sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) {
|
|
// sext of a 32-bit vector -> the 32-bit vector is good
|
|
*offsetPtr = sext->getOperand(0);
|
|
return true;
|
|
} else if (lIs32BitSafeHelper(offset)) {
|
|
// The only constant vector we should have here is a vector of
// all zeros (i.e. a ConstantAggregateZero), but just in case,
// do the more general check with lVectorIs32BitInts().

// Alternatively, offset could be a sequence of adds terminating
// in safe constant vectors or a SExt.
*offsetPtr = new llvm::TruncInst(offset, LLVMTypes::Int32VectorType, llvm::Twine(offset->getName()) + "_trunc",
|
|
insertBefore);
|
|
return true;
|
|
} else
|
|
return false;
|
|
}
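/* Sketch of the transformation performed below (illustrative; the exact
   argument types depend on the target and mask width): a fully general gather
   such as
     %v = call @__pseudo_gather32_i32(<WIDTH x i32> %ptrs, <WIDTH x MASK> %mask)
   is rewritten, when a common base pointer can be recovered, into
     %v = call @__pseudo_gather_base_offsets32_i32(i8* %base, i32 %scale,
                                                   <WIDTH x i32> %offsets,
                                                   <WIDTH x MASK> %mask)
   so that later passes and the target can exploit the base+offset form. */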
static bool lGSToGSBaseOffsets(llvm::CallInst *callInst) {
|
|
struct GSInfo {
|
|
GSInfo(const char *pgFuncName, const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
|
|
: isGather(ig), isPrefetch(ip) {
|
|
func = m->module->getFunction(pgFuncName);
|
|
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
|
|
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
|
|
}
|
|
llvm::Function *func;
|
|
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
|
|
const bool isGather;
|
|
const bool isPrefetch;
|
|
};
|
|
|
|
GSInfo gsFuncs[] = {
|
|
GSInfo(
|
|
"__pseudo_gather32_i8",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
|
|
true, false),
|
|
GSInfo("__pseudo_gather32_i16",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
|
|
: "__pseudo_gather_factored_base_offsets32_i16",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
|
|
: "__pseudo_gather_factored_base_offsets32_i16",
|
|
true, false),
|
|
GSInfo("__pseudo_gather32_i32",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
|
|
: "__pseudo_gather_factored_base_offsets32_i32",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
|
|
: "__pseudo_gather_factored_base_offsets32_i32",
|
|
true, false),
|
|
GSInfo("__pseudo_gather32_float",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
|
|
: "__pseudo_gather_factored_base_offsets32_float",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
|
|
: "__pseudo_gather_factored_base_offsets32_float",
|
|
true, false),
|
|
GSInfo("__pseudo_gather32_i64",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
|
|
: "__pseudo_gather_factored_base_offsets32_i64",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
|
|
: "__pseudo_gather_factored_base_offsets32_i64",
|
|
true, false),
|
|
GSInfo("__pseudo_gather32_double",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
|
|
: "__pseudo_gather_factored_base_offsets32_double",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
|
|
: "__pseudo_gather_factored_base_offsets32_double",
|
|
true, false),
|
|
|
|
GSInfo("__pseudo_scatter32_i8",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
|
|
: "__pseudo_scatter_factored_base_offsets32_i8",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
|
|
: "__pseudo_scatter_factored_base_offsets32_i8",
|
|
false, false),
|
|
GSInfo("__pseudo_scatter32_i16",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
|
|
: "__pseudo_scatter_factored_base_offsets32_i16",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
|
|
: "__pseudo_scatter_factored_base_offsets32_i16",
|
|
false, false),
|
|
GSInfo("__pseudo_scatter32_i32",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
|
|
: "__pseudo_scatter_factored_base_offsets32_i32",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
|
|
: "__pseudo_scatter_factored_base_offsets32_i32",
|
|
false, false),
|
|
GSInfo("__pseudo_scatter32_float",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
|
|
: "__pseudo_scatter_factored_base_offsets32_float",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
|
|
: "__pseudo_scatter_factored_base_offsets32_float",
|
|
false, false),
|
|
GSInfo("__pseudo_scatter32_i64",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
|
|
: "__pseudo_scatter_factored_base_offsets32_i64",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
|
|
: "__pseudo_scatter_factored_base_offsets32_i64",
|
|
false, false),
|
|
GSInfo("__pseudo_scatter32_double",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
|
|
: "__pseudo_scatter_factored_base_offsets32_double",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
|
|
: "__pseudo_scatter_factored_base_offsets32_double",
|
|
false, false),
|
|
|
|
GSInfo(
|
|
"__pseudo_gather64_i8",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8" : "__pseudo_gather_factored_base_offsets64_i8",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
|
|
true, false),
|
|
GSInfo("__pseudo_gather64_i16",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16"
|
|
: "__pseudo_gather_factored_base_offsets64_i16",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
|
|
: "__pseudo_gather_factored_base_offsets32_i16",
|
|
true, false),
|
|
GSInfo("__pseudo_gather64_i32",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32"
|
|
: "__pseudo_gather_factored_base_offsets64_i32",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
|
|
: "__pseudo_gather_factored_base_offsets32_i32",
|
|
true, false),
|
|
GSInfo("__pseudo_gather64_float",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_float"
|
|
: "__pseudo_gather_factored_base_offsets64_float",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
|
|
: "__pseudo_gather_factored_base_offsets32_float",
|
|
true, false),
|
|
GSInfo("__pseudo_gather64_i64",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64"
|
|
: "__pseudo_gather_factored_base_offsets64_i64",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
|
|
: "__pseudo_gather_factored_base_offsets32_i64",
|
|
true, false),
|
|
GSInfo("__pseudo_gather64_double",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets64_double"
|
|
: "__pseudo_gather_factored_base_offsets64_double",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
|
|
: "__pseudo_gather_factored_base_offsets32_double",
|
|
true, false),
|
|
|
|
GSInfo("__pseudo_scatter64_i8",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8"
|
|
: "__pseudo_scatter_factored_base_offsets64_i8",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
|
|
: "__pseudo_scatter_factored_base_offsets32_i8",
|
|
false, false),
|
|
GSInfo("__pseudo_scatter64_i16",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16"
|
|
: "__pseudo_scatter_factored_base_offsets64_i16",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
|
|
: "__pseudo_scatter_factored_base_offsets32_i16",
|
|
false, false),
|
|
GSInfo("__pseudo_scatter64_i32",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32"
|
|
: "__pseudo_scatter_factored_base_offsets64_i32",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
|
|
: "__pseudo_scatter_factored_base_offsets32_i32",
|
|
false, false),
|
|
GSInfo("__pseudo_scatter64_float",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float"
|
|
: "__pseudo_scatter_factored_base_offsets64_float",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
|
|
: "__pseudo_scatter_factored_base_offsets32_float",
|
|
false, false),
|
|
GSInfo("__pseudo_scatter64_i64",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64"
|
|
: "__pseudo_scatter_factored_base_offsets64_i64",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
|
|
: "__pseudo_scatter_factored_base_offsets32_i64",
|
|
false, false),
|
|
GSInfo("__pseudo_scatter64_double",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double"
|
|
: "__pseudo_scatter_factored_base_offsets64_double",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
|
|
: "__pseudo_scatter_factored_base_offsets32_double",
|
|
false, false),
|
|
GSInfo("__pseudo_prefetch_read_varying_1",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
|
|
false, true),
|
|
|
|
GSInfo("__pseudo_prefetch_read_varying_2",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
|
|
false, true),
|
|
|
|
GSInfo("__pseudo_prefetch_read_varying_3",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
|
|
false, true),
|
|
|
|
GSInfo("__pseudo_prefetch_read_varying_nt",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
|
|
false, true),
|
|
};
|
|
|
|
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
|
|
for (int i = 0; i < numGSFuncs; ++i)
|
|
Assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL && gsFuncs[i].baseOffsets32Func != NULL);
|
|
|
|
GSInfo *info = NULL;
|
|
for (int i = 0; i < numGSFuncs; ++i)
|
|
if (gsFuncs[i].func != NULL && callInst->getCalledFunction() == gsFuncs[i].func) {
|
|
info = &gsFuncs[i];
|
|
break;
|
|
}
|
|
if (info == NULL)
|
|
return false;
|
|
|
|
// Try to transform the array of pointers to a single base pointer
|
|
// and an array of int32 offsets. (All the hard work is done by
|
|
// lGetBasePtrAndOffsets).
|
|
llvm::Value *ptrs = callInst->getArgOperand(0);
|
|
llvm::Value *offsetVector = NULL;
|
|
llvm::Value *basePtr = lGetBasePtrAndOffsets(ptrs, &offsetVector, callInst);
|
|
|
|
if (basePtr == NULL || offsetVector == NULL ||
|
|
(info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch() == false)) {
|
|
// It's actually a fully general gather/scatter with a varying
// set of base pointers, so leave it as is and continue onward
// to the next instruction...
return false;
|
|
}
|
|
// Cast the base pointer to a void *, since that's what the
|
|
// __pseudo_*_base_offsets_* functions want.
|
|
basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType, llvm::Twine(basePtr->getName()) + "_2void",
|
|
callInst);
|
|
lCopyMetadata(basePtr, callInst);
|
|
llvm::Function *gatherScatterFunc = info->baseOffsetsFunc;
|
|
|
|
if ((info->isGather == true && g->target->hasGather()) ||
|
|
(info->isGather == false && info->isPrefetch == false && g->target->hasScatter()) ||
|
|
(info->isGather == false && info->isPrefetch == true && g->target->hasVecPrefetch())) {
|
|
|
|
// See if the offsets are scaled by 2, 4, or 8. If so,
|
|
// extract that scale factor and rewrite the offsets to remove
|
|
// it.
|
|
llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector);
|
|
|
|
// If we're doing 32-bit addressing on a 64-bit target, here we
|
|
// will see if we can call one of the 32-bit variants of the pseudo
|
|
// gather/scatter functions.
|
|
if (g->opt.force32BitAddressing && lOffsets32BitSafe(&offsetVector, callInst)) {
|
|
gatherScatterFunc = info->baseOffsets32Func;
|
|
}
|
|
|
|
if (info->isGather || info->isPrefetch) {
|
|
llvm::Value *mask = callInst->getArgOperand(1);
|
|
|
|
// Generate a new function call to the next pseudo gather
|
|
// base+offsets instruction. Note that we're passing a NULL
|
|
// llvm::Instruction to llvm::CallInst::Create; this means that
|
|
// the instruction isn't inserted into a basic block and that
|
|
// way we can then call ReplaceInstWithInst().
|
|
llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, mask,
|
|
callInst->getName().str().c_str(), NULL);
|
|
lCopyMetadata(newCall, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, newCall);
|
|
} else {
|
|
llvm::Value *storeValue = callInst->getArgOperand(1);
|
|
llvm::Value *mask = callInst->getArgOperand(2);
|
|
|
|
// Generate a new function call to the next pseudo scatter
|
|
// base+offsets instruction. See above for why passing NULL
|
|
// for the Instruction * is intended.
|
|
llvm::Instruction *newCall =
|
|
lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, storeValue, mask, "", NULL);
|
|
lCopyMetadata(newCall, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, newCall);
|
|
}
|
|
} else {
|
|
// Try to decompose the offset vector into a compile time constant
|
|
// component and a varying component. The constant component is
|
|
// passed as a separate parameter to the gather/scatter functions,
|
|
// which in turn allows their implementations to end up emitting
|
|
// x86 instructions with constant offsets encoded in them.
|
|
llvm::Value *constOffset = NULL;
|
|
llvm::Value *variableOffset = NULL;
|
|
lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, callInst);
|
|
if (constOffset == NULL)
|
|
constOffset = LLVMIntAsType(0, offsetVector->getType());
|
|
if (variableOffset == NULL)
|
|
variableOffset = LLVMIntAsType(0, offsetVector->getType());
|
|
|
|
// See if the varying component is scaled by 2, 4, or 8. If so,
|
|
// extract that scale factor and rewrite variableOffset to remove
|
|
// it. (This also is pulled out so that we can match the scales by
|
|
// 2/4/8 offered by x86 addressing operators.)
|
|
llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset);
|
|
|
|
// If we're doing 32-bit addressing on a 64-bit target, here we
|
|
// will see if we can call one of the 32-bit variants of the pseudo
|
|
// gather/scatter functions.
|
|
if (g->opt.force32BitAddressing && lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) {
|
|
gatherScatterFunc = info->baseOffsets32Func;
|
|
}
|
|
|
|
if (info->isGather || info->isPrefetch) {
|
|
llvm::Value *mask = callInst->getArgOperand(1);
|
|
|
|
// Generate a new function call to the next pseudo gather
|
|
// base+offsets instruction. Note that we're passing a NULL
|
|
// llvm::Instruction to llvm::CallInst::Create; this means that
|
|
// the instruction isn't inserted into a basic block and that
|
|
// way we can then call ReplaceInstWithInst().
|
|
llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, constOffset,
|
|
mask, callInst->getName().str().c_str(), NULL);
|
|
lCopyMetadata(newCall, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, newCall);
|
|
} else {
|
|
llvm::Value *storeValue = callInst->getArgOperand(1);
|
|
llvm::Value *mask = callInst->getArgOperand(2);
|
|
|
|
// Generate a new function call to the next pseudo scatter
|
|
// base+offsets instruction. See above for why passing NULL
|
|
// for the Instruction * is intended.
|
|
llvm::Instruction *newCall = lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, constOffset,
|
|
storeValue, mask, "", NULL);
|
|
lCopyMetadata(newCall, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, newCall);
|
|
}
|
|
}
|
|
return true;
|
|
}
/** Try to improve the decomposition between compile-time constant and
    compile-time unknown offsets in calls to the __pseudo_*_base_offsets*
    functions.  After other optimizations have run, we will sometimes be
    able to pull more terms out of the unknown part and add them into the
    compile-time-known part.
*/
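// Illustrative example (not from the original source): if the variable-offset
// operand has by now been simplified to something like
//   %offs = add <WIDTH x i32> %varying, <i32 16, i32 16, ...>
// this pass moves the all-constant <16, ...> term (multiplied by the offset
// scale) into the constant-offset operand, leaving only %varying behind.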
static bool lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) {
|
|
struct GSBOInfo {
|
|
GSBOInfo(const char *pgboFuncName, const char *pgbo32FuncName, bool ig, bool ip)
|
|
: isGather(ig), isPrefetch(ip) {
|
|
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
|
|
baseOffsets32Func = m->module->getFunction(pgbo32FuncName);
|
|
}
|
|
llvm::Function *baseOffsetsFunc, *baseOffsets32Func;
|
|
const bool isGather;
|
|
const bool isPrefetch;
|
|
};
|
|
|
|
GSBOInfo gsFuncs[] = {
|
|
GSBOInfo(
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8" : "__pseudo_gather_factored_base_offsets32_i8",
|
|
true, false),
|
|
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
|
|
: "__pseudo_gather_factored_base_offsets32_i16",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
|
|
: "__pseudo_gather_factored_base_offsets32_i16",
|
|
true, false),
|
|
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
|
|
: "__pseudo_gather_factored_base_offsets32_i32",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
|
|
: "__pseudo_gather_factored_base_offsets32_i32",
|
|
true, false),
|
|
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
|
|
: "__pseudo_gather_factored_base_offsets32_float",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
|
|
: "__pseudo_gather_factored_base_offsets32_float",
|
|
true, false),
|
|
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
|
|
: "__pseudo_gather_factored_base_offsets32_i64",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
|
|
: "__pseudo_gather_factored_base_offsets32_i64",
|
|
true, false),
|
|
GSBOInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
|
|
: "__pseudo_gather_factored_base_offsets32_double",
|
|
g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
|
|
: "__pseudo_gather_factored_base_offsets32_double",
|
|
true, false),
|
|
|
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
|
|
: "__pseudo_scatter_factored_base_offsets32_i8",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
|
|
: "__pseudo_scatter_factored_base_offsets32_i8",
|
|
false, false),
|
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
|
|
: "__pseudo_scatter_factored_base_offsets32_i16",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
|
|
: "__pseudo_scatter_factored_base_offsets32_i16",
|
|
false, false),
|
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
|
|
: "__pseudo_scatter_factored_base_offsets32_i32",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
|
|
: "__pseudo_scatter_factored_base_offsets32_i32",
|
|
false, false),
|
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
|
|
: "__pseudo_scatter_factored_base_offsets32_float",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
|
|
: "__pseudo_scatter_factored_base_offsets32_float",
|
|
false, false),
|
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
|
|
: "__pseudo_scatter_factored_base_offsets32_i64",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
|
|
: "__pseudo_scatter_factored_base_offsets32_i64",
|
|
false, false),
|
|
GSBOInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
|
|
: "__pseudo_scatter_factored_base_offsets32_double",
|
|
g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
|
|
: "__pseudo_scatter_factored_base_offsets32_double",
|
|
false, false),
|
|
|
|
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_1_native" : "__prefetch_read_varying_1",
|
|
false, true),
|
|
|
|
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_2_native" : "__prefetch_read_varying_2",
|
|
false, true),
|
|
|
|
GSBOInfo(g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_3_native" : "__prefetch_read_varying_3",
|
|
false, true),
|
|
|
|
GSBOInfo(
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
|
|
g->target->hasVecPrefetch() ? "__pseudo_prefetch_read_varying_nt_native" : "__prefetch_read_varying_nt",
|
|
false, true),
|
|
};
|
|
|
|
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
|
|
for (int i = 0; i < numGSFuncs; ++i)
|
|
Assert(gsFuncs[i].baseOffsetsFunc != NULL && gsFuncs[i].baseOffsets32Func != NULL);
|
|
|
|
llvm::Function *calledFunc = callInst->getCalledFunction();
|
|
Assert(calledFunc != NULL);
|
|
|
|
// Is one of the gather/scatter functions that decompose into
// base+offsets being called?
GSBOInfo *info = NULL;
|
|
for (int i = 0; i < numGSFuncs; ++i)
|
|
if (calledFunc == gsFuncs[i].baseOffsetsFunc || calledFunc == gsFuncs[i].baseOffsets32Func) {
|
|
info = &gsFuncs[i];
|
|
break;
|
|
}
|
|
if (info == NULL)
|
|
return false;
|
|
|
|
// Grab the old variable offset
|
|
llvm::Value *origVariableOffset = callInst->getArgOperand(1);
|
|
|
|
// If it's zero, we're done. Don't go and think that we're clever by
|
|
// adding these zeros to the constant offsets.
|
|
if (llvm::isa<llvm::ConstantAggregateZero>(origVariableOffset))
|
|
return false;
|
|
|
|
// Try to decompose the old variable offset
|
|
llvm::Value *constOffset = NULL;
|
|
llvm::Value *variableOffset = NULL;
|
|
lExtractConstantOffset(origVariableOffset, &constOffset, &variableOffset, callInst);
|
|
|
|
// No luck
|
|
if (constOffset == NULL)
|
|
return false;
|
|
|
|
// Total luck: everything could be moved to the constant offset
|
|
if (variableOffset == NULL)
|
|
variableOffset = LLVMIntAsType(0, origVariableOffset->getType());
|
|
|
|
// We need to scale the value we add to the constant offset by the
|
|
// 2/4/8 scale for the variable offset, if present.
|
|
llvm::ConstantInt *varScale = llvm::dyn_cast<llvm::ConstantInt>(callInst->getArgOperand(2));
|
|
Assert(varScale != NULL);
|
|
|
|
llvm::Value *scaleSmear;
|
|
if (origVariableOffset->getType() == LLVMTypes::Int64VectorType)
|
|
scaleSmear = LLVMInt64Vector((int64_t)varScale->getZExtValue());
|
|
else
|
|
scaleSmear = LLVMInt32Vector((int32_t)varScale->getZExtValue());
|
|
|
|
constOffset =
|
|
llvm::BinaryOperator::Create(llvm::Instruction::Mul, constOffset, scaleSmear, constOffset->getName(), callInst);
|
|
|
|
// And add the additional offset to the original constant offset
|
|
constOffset = llvm::BinaryOperator::Create(llvm::Instruction::Add, constOffset, callInst->getArgOperand(3),
|
|
callInst->getArgOperand(3)->getName(), callInst);
|
|
|
|
// Finally, update the values of the operands to the gather/scatter
|
|
// function.
|
|
callInst->setArgOperand(1, variableOffset);
|
|
callInst->setArgOperand(3, constOffset);
|
|
|
|
return true;
|
|
}
static llvm::Value *lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets, llvm::Instruction *insertBefore,
                                          int typeScale = 1) {
    llvm::Value *firstOffset = LLVMExtractFirstVectorElement(offsets);
    Assert(firstOffset != NULL);
    llvm::Value *typeScaleValue =
        firstOffset->getType() == LLVMTypes::Int32Type ? LLVMInt32(typeScale) : LLVMInt64(typeScale);
    if (g->target->isGenXTarget() && typeScale > 1) {
        firstOffset = llvm::BinaryOperator::Create(llvm::Instruction::SDiv, firstOffset, typeScaleValue,
                                                   "scaled_offset", insertBefore);
    }

    return lGEPInst(base, firstOffset, "ptr", insertBefore);
}

static llvm::Constant *lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) {
    llvm::ConstantInt *offsetScaleInt = llvm::dyn_cast<llvm::ConstantInt>(offsetScale);
    Assert(offsetScaleInt != NULL);
    uint64_t scaleValue = offsetScaleInt->getZExtValue();

    std::vector<llvm::Constant *> scales;
    for (int i = 0; i < g->target->getVectorWidth(); ++i) {
        if (vecType == LLVMTypes::Int64VectorType)
            scales.push_back(LLVMInt64(scaleValue));
        else {
            Assert(vecType == LLVMTypes::Int32VectorType);
            scales.push_back(LLVMInt32((int32_t)scaleValue));
        }
    }
    return llvm::ConstantVector::get(scales);
}

/** After earlier optimization passes have run, we are sometimes able to
    determine that gathers/scatters are actually accessing memory in a more
    regular fashion and then change the operation to something simpler and
    more efficient.  For example, if all of the lanes in a gather are
    reading from the same location, we can instead do a scalar load and
    broadcast.  This pass examines gathers and scatters and tries to
    simplify them if at all possible.

    @todo Currently, this only looks for all program instances going to the
    same location and all going to a linear sequence of locations in
    memory.  There are a number of other cases that might make sense to
    look for, including things that could be handled with a vector load +
    shuffle or things that could be handled with hybrids of e.g. 2 4-wide
    vector loads with AVX, etc.
*/
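// Illustrative sketches of the cases handled below (not from the original
// source; exact widths, mask types, and function signatures depend on the
// target):
//   * all lanes access the same address -> a scalar load (or store) plus a
//     broadcast/select across the lanes;
//   * lanes access a linear, unit-stride sequence -> a masked vector
//     load/store such as __masked_load_float / __pseudo_masked_store_float.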
static bool lGSToLoadStore(llvm::CallInst *callInst) {
|
|
struct GatherImpInfo {
|
|
GatherImpInfo(const char *pName, const char *lmName, const char *bmName, llvm::Type *st, int a)
|
|
: align(a), isFactored(!g->target->hasGather()) {
|
|
pseudoFunc = m->module->getFunction(pName);
|
|
loadMaskedFunc = m->module->getFunction(lmName);
|
|
blendMaskedFunc = m->module->getFunction(bmName);
|
|
Assert(pseudoFunc != NULL && loadMaskedFunc != NULL);
|
|
scalarType = st;
|
|
}
|
|
|
|
llvm::Function *pseudoFunc;
|
|
llvm::Function *loadMaskedFunc;
|
|
llvm::Function *blendMaskedFunc;
|
|
llvm::Type *scalarType;
|
|
const int align;
|
|
const bool isFactored;
|
|
};
|
|
|
|
GatherImpInfo gInfo[] = {
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i8"
|
|
: "__pseudo_gather_factored_base_offsets32_i8",
|
|
"__masked_load_i8", "__masked_load_blend_i8", LLVMTypes::Int8Type, 1),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i16"
|
|
: "__pseudo_gather_factored_base_offsets32_i16",
|
|
"__masked_load_i16", "__masked_load_blend_i16", LLVMTypes::Int16Type, 2),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i32"
|
|
: "__pseudo_gather_factored_base_offsets32_i32",
|
|
"__masked_load_i32", "__masked_load_blend_i32", LLVMTypes::Int32Type, 4),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_float"
|
|
: "__pseudo_gather_factored_base_offsets32_float",
|
|
"__masked_load_float", "__masked_load_blend_float", LLVMTypes::FloatType, 4),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_i64"
|
|
: "__pseudo_gather_factored_base_offsets32_i64",
|
|
"__masked_load_i64", "__masked_load_blend_i64", LLVMTypes::Int64Type, 8),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets32_double"
|
|
: "__pseudo_gather_factored_base_offsets32_double",
|
|
"__masked_load_double", "__masked_load_blend_double", LLVMTypes::DoubleType, 8),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i8"
|
|
: "__pseudo_gather_factored_base_offsets64_i8",
|
|
"__masked_load_i8", "__masked_load_blend_i8", LLVMTypes::Int8Type, 1),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i16"
|
|
: "__pseudo_gather_factored_base_offsets64_i16",
|
|
"__masked_load_i16", "__masked_load_blend_i16", LLVMTypes::Int16Type, 2),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i32"
|
|
: "__pseudo_gather_factored_base_offsets64_i32",
|
|
"__masked_load_i32", "__masked_load_blend_i32", LLVMTypes::Int32Type, 4),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_float"
|
|
: "__pseudo_gather_factored_base_offsets64_float",
|
|
"__masked_load_float", "__masked_load_blend_float", LLVMTypes::FloatType, 4),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_i64"
|
|
: "__pseudo_gather_factored_base_offsets64_i64",
|
|
"__masked_load_i64", "__masked_load_blend_i64", LLVMTypes::Int64Type, 8),
|
|
GatherImpInfo(g->target->hasGather() ? "__pseudo_gather_base_offsets64_double"
|
|
: "__pseudo_gather_factored_base_offsets64_double",
|
|
"__masked_load_double", "__masked_load_blend_double", LLVMTypes::DoubleType, 8),
|
|
};
|
|
|
|
struct ScatterImpInfo {
|
|
ScatterImpInfo(const char *pName, const char *msName, llvm::Type *vpt, int a)
|
|
: align(a), isFactored(!g->target->hasScatter()) {
|
|
pseudoFunc = m->module->getFunction(pName);
|
|
maskedStoreFunc = m->module->getFunction(msName);
|
|
vecPtrType = vpt;
|
|
Assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
|
|
}
|
|
llvm::Function *pseudoFunc;
|
|
llvm::Function *maskedStoreFunc;
|
|
llvm::Type *vecPtrType;
|
|
const int align;
|
|
const bool isFactored;
|
|
};
|
|
|
|
ScatterImpInfo sInfo[] = {
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i8"
|
|
: "__pseudo_scatter_factored_base_offsets32_i8",
|
|
"__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i16"
|
|
: "__pseudo_scatter_factored_base_offsets32_i16",
|
|
"__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i32"
|
|
: "__pseudo_scatter_factored_base_offsets32_i32",
|
|
"__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_float"
|
|
: "__pseudo_scatter_factored_base_offsets32_float",
|
|
"__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_i64"
|
|
: "__pseudo_scatter_factored_base_offsets32_i64",
|
|
"__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets32_double"
|
|
: "__pseudo_scatter_factored_base_offsets32_double",
|
|
"__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i8"
|
|
: "__pseudo_scatter_factored_base_offsets64_i8",
|
|
"__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i16"
|
|
: "__pseudo_scatter_factored_base_offsets64_i16",
|
|
"__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i32"
|
|
: "__pseudo_scatter_factored_base_offsets64_i32",
|
|
"__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_float"
|
|
: "__pseudo_scatter_factored_base_offsets64_float",
|
|
"__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_i64"
|
|
: "__pseudo_scatter_factored_base_offsets64_i64",
|
|
"__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8),
|
|
ScatterImpInfo(g->target->hasScatter() ? "__pseudo_scatter_base_offsets64_double"
|
|
: "__pseudo_scatter_factored_base_offsets64_double",
|
|
"__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8),
|
|
};
|
|
|
|
llvm::Function *calledFunc = callInst->getCalledFunction();
|
|
|
|
GatherImpInfo *gatherInfo = NULL;
|
|
ScatterImpInfo *scatterInfo = NULL;
|
|
for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
|
|
if (gInfo[i].pseudoFunc != NULL && calledFunc == gInfo[i].pseudoFunc) {
|
|
gatherInfo = &gInfo[i];
|
|
break;
|
|
}
|
|
}
|
|
for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
|
|
if (sInfo[i].pseudoFunc != NULL && calledFunc == sInfo[i].pseudoFunc) {
|
|
scatterInfo = &sInfo[i];
|
|
break;
|
|
}
|
|
}
|
|
if (gatherInfo == NULL && scatterInfo == NULL)
|
|
return false;
|
|
|
|
SourcePos pos;
|
|
lGetSourcePosFromMetadata(callInst, &pos);
|
|
|
|
llvm::Value *base = callInst->getArgOperand(0);
|
|
llvm::Value *fullOffsets = NULL;
|
|
llvm::Value *storeValue = NULL;
|
|
llvm::Value *mask = NULL;
|
|
if ((gatherInfo != NULL && gatherInfo->isFactored) || (scatterInfo != NULL && scatterInfo->isFactored)) {
|
|
llvm::Value *varyingOffsets = callInst->getArgOperand(1);
|
|
llvm::Value *offsetScale = callInst->getArgOperand(2);
|
|
llvm::Value *constOffsets = callInst->getArgOperand(3);
|
|
if (scatterInfo)
|
|
storeValue = callInst->getArgOperand(4);
|
|
mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5);
|
|
|
|
// Compute the full offset vector: offsetScale * varyingOffsets + constOffsets
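// As an illustration with hypothetical values (not taken from any real
// input): with offsetScale == 4, varyingOffsets == <0, 1, 2, 3>, and
// constOffsets == <0, 0, 0, 0>, the instructions created below produce
// fullOffsets == <0, 4, 8, 12>.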
llvm::Constant *offsetScaleVec = lGetOffsetScaleVec(offsetScale, varyingOffsets->getType());
|
|
|
|
llvm::Value *scaledVarying = llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec,
|
|
varyingOffsets, "scaled_varying", callInst);
|
|
fullOffsets = llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, constOffsets,
|
|
"varying+const_offsets", callInst);
|
|
} else {
|
|
if (scatterInfo)
|
|
storeValue = callInst->getArgOperand(3);
|
|
mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4);
|
|
|
|
llvm::Value *offsetScale = callInst->getArgOperand(1);
|
|
llvm::Value *offsets = callInst->getArgOperand(2);
|
|
llvm::Value *offsetScaleVec = lGetOffsetScaleVec(offsetScale, offsets->getType());
|
|
|
|
fullOffsets =
|
|
llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, offsets, "scaled_offsets", callInst);
|
|
}
|
|
|
|
Debug(SourcePos(), "GSToLoadStore: %s.", fullOffsets->getName().str().c_str());
|
|
llvm::Type *scalarType = (gatherInfo != NULL) ? gatherInfo->scalarType : scatterInfo->vecPtrType->getScalarType();
|
|
int typeScale = g->target->getDataLayout()->getTypeStoreSize(scalarType) /
|
|
g->target->getDataLayout()->getTypeStoreSize(base->getType()->getContainedType(0));
|
|
|
|
if (LLVMVectorValuesAllEqual(fullOffsets)) {
|
|
// If all the offsets are equal, then compute the single
|
|
// pointer they all represent based on the first one of them
|
|
// (arbitrarily).
|
|
if (gatherInfo != NULL) {
|
|
// A gather with everyone going to the same location is
|
|
// handled as a scalar load and broadcast across the lanes.
|
|
Debug(pos, "Transformed gather to scalar load and broadcast!");
|
|
llvm::Value *ptr;
|
|
// For gen targets we need to cast the base pointer first and only then
// compute the common pointer; otherwise the CM backend breaks on a bitcast
// from i8* to T* followed by a load. For this we re-calculate the offset
// based on the type sizes.
|
|
if (g->target->isGenXTarget()) {
|
|
base = new llvm::BitCastInst(base, llvm::PointerType::get(scalarType, 0), base->getName(), callInst);
|
|
ptr = lComputeCommonPointer(base, fullOffsets, callInst, typeScale);
|
|
} else {
|
|
ptr = lComputeCommonPointer(base, fullOffsets, callInst);
|
|
ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(scalarType, 0), base->getName(), callInst);
|
|
}
|
|
|
|
lCopyMetadata(ptr, callInst);
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
Assert(llvm::isa<llvm::PointerType>(ptr->getType()));
|
|
llvm::Value *scalarValue =
|
|
new llvm::LoadInst(llvm::dyn_cast<llvm::PointerType>(ptr->getType())->getPointerElementType(), ptr,
|
|
callInst->getName(), callInst);
|
|
#else
|
|
llvm::Value *scalarValue = new llvm::LoadInst(ptr, callInst->getName(), callInst);
|
|
#endif
|
|
|
|
// Generate the following sequence:
|
|
// %name123 = insertelement <4 x i32> undef, i32 %val, i32 0
|
|
// %name124 = shufflevector <4 x i32> %name123, <4 x i32> undef,
|
|
// <4 x i32> zeroinitializer
|
|
llvm::Value *undef1Value = llvm::UndefValue::get(callInst->getType());
|
|
llvm::Value *undef2Value = llvm::UndefValue::get(callInst->getType());
|
|
llvm::Value *insertVec =
|
|
llvm::InsertElementInst::Create(undef1Value, scalarValue, LLVMInt32(0), callInst->getName(), callInst);
|
|
llvm::Value *zeroMask =
|
|
#if ISPC_LLVM_VERSION < ISPC_LLVM_11_0
|
|
llvm::ConstantVector::getSplat(callInst->getType()->getVectorNumElements(),
|
|
#elif ISPC_LLVM_VERSION < ISPC_LLVM_12_0
|
|
llvm::ConstantVector::getSplat(
|
|
{llvm::dyn_cast<llvm::VectorType>(callInst->getType())->getNumElements(), false},
|
|
|
|
#else // LLVM 12.0+
|
|
llvm::ConstantVector::getSplat(
|
|
llvm::ElementCount::get(
|
|
llvm::dyn_cast<llvm::FixedVectorType>(callInst->getType())->getNumElements(), false),
|
|
#endif
|
|
llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
|
|
llvm::Value *shufValue = new llvm::ShuffleVectorInst(insertVec, undef2Value, zeroMask, callInst->getName());
|
|
|
|
lCopyMetadata(shufValue, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, llvm::dyn_cast<llvm::Instruction>(shufValue));
|
|
return true;
|
|
} else {
|
|
// A scatter with everyone going to the same location is
|
|
// undefined (if there's more than one program instance in
|
|
// the gang). Issue a warning.
|
|
if (g->target->getVectorWidth() > 1)
|
|
Warning(pos, "Undefined behavior: all program instances are "
|
|
"writing to the same location!");
|
|
|
|
// We could do something similar to the gather case, where
|
|
// we arbitrarily write one of the values, but we need to
|
|
// a) check to be sure the mask isn't all off and b) pick
|
|
// the value from an executing program instance in that
|
|
// case. We'll just let a bunch of the program instances
|
|
// do redundant writes, since this isn't important to make
|
|
// fast anyway...
|
|
return false;
|
|
}
|
|
} else {
|
|
int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
|
|
if (step > 0 && LLVMVectorIsLinear(fullOffsets, step)) {
|
|
// We have a linear sequence of memory locations being accessed,
// starting at the location given by the first offset element, with a
// stride equal to the element size (e.g., 4 or 8 bytes for 32-bit and
// 64-bit gathers/scatters, respectively).
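// As a hedged example with made-up offsets: for an i32 gather whose
// fullOffsets turn out to be <0, 4, 8, 12, ...>, the offsets are linear in
// the 4-byte element size, so the gather is rewritten below as a single
// masked (unaligned) vector load from the common pointer at the first
// offset.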
llvm::Value *ptr;
|
|
|
|
if (gatherInfo != NULL) {
|
|
if (g->target->isGenXTarget()) {
|
|
// For gen targets we need to cast the base pointer first and only then
// compute the common pointer; otherwise the CM backend breaks on a bitcast
// from i8* to T* followed by a load. For this we re-calculate the offset
// based on the type sizes.
// The second bitcast to void* below does not cause such a problem in the
// backend.
|
|
base =
|
|
new llvm::BitCastInst(base, llvm::PointerType::get(scalarType, 0), base->getName(), callInst);
|
|
ptr = lComputeCommonPointer(base, fullOffsets, callInst, typeScale);
|
|
ptr = new llvm::BitCastInst(ptr, LLVMTypes::Int8PointerType, base->getName(), callInst);
|
|
} else {
|
|
ptr = lComputeCommonPointer(base, fullOffsets, callInst);
|
|
}
|
|
lCopyMetadata(ptr, callInst);
|
|
Debug(pos, "Transformed gather to unaligned vector load!");
|
|
bool doBlendLoad = false;
|
|
#ifdef ISPC_GENX_ENABLED
|
|
doBlendLoad = g->target->isGenXTarget() && g->opt.enableGenXUnsafeMaskedLoad;
|
|
#endif
|
|
llvm::Instruction *newCall =
|
|
lCallInst(doBlendLoad ? gatherInfo->blendMaskedFunc : gatherInfo->loadMaskedFunc, ptr, mask,
|
|
llvm::Twine(ptr->getName()) + "_masked_load");
|
|
lCopyMetadata(newCall, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, newCall);
|
|
return true;
|
|
} else {
|
|
Debug(pos, "Transformed scatter to unaligned vector store!");
|
|
ptr = lComputeCommonPointer(base, fullOffsets, callInst);
|
|
ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast", callInst);
|
|
llvm::Instruction *newCall = lCallInst(scatterInfo->maskedStoreFunc, ptr, storeValue, mask, "");
|
|
lCopyMetadata(newCall, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, newCall);
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// MaskedStoreOptPass
|
|
|
|
#ifdef ISPC_GENX_ENABLED
|
|
static llvm::Function *lGenXMaskedInt8Inst(llvm::Instruction *inst, bool isStore) {
|
|
std::string maskedFuncName;
|
|
if (isStore) {
|
|
maskedFuncName = "masked_store_i8";
|
|
} else {
|
|
maskedFuncName = "masked_load_i8";
|
|
}
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(inst);
|
|
if (callInst != NULL && callInst->getCalledFunction()->getName().contains(maskedFuncName)) {
|
|
return NULL;
|
|
}
|
|
return m->module->getFunction("__" + maskedFuncName);
|
|
}
|
|
|
|
static llvm::CallInst *lGenXStoreInst(llvm::Value *val, llvm::Value *ptr, llvm::Instruction *inst) {
|
|
Assert(g->target->isGenXTarget());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
Assert(llvm::isa<llvm::FixedVectorType>(val->getType()));
|
|
llvm::FixedVectorType *valVecType = llvm::dyn_cast<llvm::FixedVectorType>(val->getType());
|
|
#else
|
|
Assert(llvm::isa<llvm::VectorType>(val->getType()));
|
|
llvm::VectorType *valVecType = llvm::dyn_cast<llvm::VectorType>(val->getType());
|
|
#endif
|
|
Assert(llvm::isPowerOf2_32(valVecType->getNumElements()));
|
|
Assert(valVecType->getPrimitiveSizeInBits() / 8 <= 8 * OWORD);
|
|
|
|
// The data write of an svm store must have a size that is a power of two,
// from 16 to 128 bytes. However, for the int8 type with simd width 8 the
// data write size is only 8 bytes, so we use the masked store function here
// instead of the svm store, since it handles the int8 type correctly.
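// For instance (assuming an 8-wide target): an <8 x i8> value is only
// 8 bytes, below the 16-byte minimum, so the masked-store path below is
// taken; an <8 x i32> value is 32 bytes and goes through the
// genx_svm_block_st intrinsic instead.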
if (valVecType->getPrimitiveSizeInBits() / 8 < 16) {
|
|
Assert(valVecType->getScalarType() == LLVMTypes::Int8Type);
|
|
if (llvm::Function *maskedFunc = lGenXMaskedInt8Inst(inst, true))
|
|
return llvm::dyn_cast<llvm::CallInst>(lCallInst(maskedFunc, ptr, val, LLVMMaskAllOn, ""));
|
|
else {
|
|
return NULL;
|
|
}
|
|
}
|
|
llvm::Instruction *svm_st_zext = new llvm::PtrToIntInst(ptr, LLVMTypes::Int64Type, "svm_st_ptrtoint", inst);
|
|
|
|
llvm::Type *argTypes[] = {svm_st_zext->getType(), val->getType()};
|
|
auto Fn = llvm::GenXIntrinsic::getGenXDeclaration(m->module, llvm::GenXIntrinsic::genx_svm_block_st, argTypes);
|
|
return llvm::CallInst::Create(Fn, {svm_st_zext, val}, inst->getName());
|
|
}
|
|
|
|
static llvm::CallInst *lGenXLoadInst(llvm::Value *ptr, llvm::Type *retType, llvm::Instruction *inst) {
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
Assert(llvm::isa<llvm::FixedVectorType>(retType));
|
|
llvm::FixedVectorType *retVecType = llvm::dyn_cast<llvm::FixedVectorType>(retType);
|
|
#else
|
|
Assert(llvm::isa<llvm::VectorType>(retType));
|
|
llvm::VectorType *retVecType = llvm::dyn_cast<llvm::VectorType>(retType);
|
|
#endif
|
|
Assert(llvm::isPowerOf2_32(retVecType->getNumElements()));
|
|
Assert(retVecType->getPrimitiveSizeInBits());
|
|
Assert(retVecType->getPrimitiveSizeInBits() / 8 <= 8 * OWORD);
|
|
// The data read of an svm load must have a size that is a power of two,
// from 16 to 128 bytes. However, for the int8 type with simd width 8 the
// data read size is only 8 bytes, so we use the masked load function here
// instead of the svm load, since it handles the int8 type correctly.
|
|
if (retVecType->getPrimitiveSizeInBits() / 8 < 16) {
|
|
Assert(retVecType->getScalarType() == LLVMTypes::Int8Type);
|
|
if (llvm::Function *maskedFunc = lGenXMaskedInt8Inst(inst, false))
|
|
return llvm::dyn_cast<llvm::CallInst>(lCallInst(maskedFunc, ptr, LLVMMaskAllOn, ""));
|
|
else {
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
llvm::Value *svm_ld_ptrtoint = new llvm::PtrToIntInst(ptr, LLVMTypes::Int64Type, "svm_ld_ptrtoint", inst);
|
|
|
|
auto Fn = llvm::GenXIntrinsic::getGenXDeclaration(m->module, llvm::GenXIntrinsic::genx_svm_block_ld_unaligned,
|
|
{retType, svm_ld_ptrtoint->getType()});
|
|
|
|
return llvm::CallInst::Create(Fn, svm_ld_ptrtoint, inst->getName());
|
|
}
|
|
#endif
|
|
/** Masked stores are generally more complex than regular stores; for
|
|
example, they require multiple instructions to simulate under SSE.
|
|
This optimization detects cases where masked stores can be replaced
|
|
with regular stores or removed entirely, for the cases of an 'all on'
|
|
mask and an 'all off' mask, respectively.
|
|
*/
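// Rough sketch of the rewrite (illustrative IR, not verbatim compiler
// output): a call like
//     __pseudo_masked_store_i32(ptr, value, mask)
// with a compile-time all-on mask becomes a plain
//     store <WIDTH x i32> value, <WIDTH x i32>* ptr
// while the same call with an all-off mask is erased entirely.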
static bool lImproveMaskedStore(llvm::CallInst *callInst) {
|
|
struct MSInfo {
|
|
MSInfo(const char *name, const int a) : align(a) {
|
|
func = m->module->getFunction(name);
|
|
Assert(func != NULL);
|
|
}
|
|
llvm::Function *func;
|
|
const int align;
|
|
};
|
|
|
|
MSInfo msInfo[] = {MSInfo("__pseudo_masked_store_i8", 1), MSInfo("__pseudo_masked_store_i16", 2),
|
|
MSInfo("__pseudo_masked_store_i32", 4), MSInfo("__pseudo_masked_store_float", 4),
|
|
MSInfo("__pseudo_masked_store_i64", 8), MSInfo("__pseudo_masked_store_double", 8),
|
|
MSInfo("__masked_store_blend_i8", 1), MSInfo("__masked_store_blend_i16", 2),
|
|
MSInfo("__masked_store_blend_i32", 4), MSInfo("__masked_store_blend_float", 4),
|
|
MSInfo("__masked_store_blend_i64", 8), MSInfo("__masked_store_blend_double", 8),
|
|
MSInfo("__masked_store_i8", 1), MSInfo("__masked_store_i16", 2),
|
|
MSInfo("__masked_store_i32", 4), MSInfo("__masked_store_float", 4),
|
|
MSInfo("__masked_store_i64", 8), MSInfo("__masked_store_double", 8)};
|
|
llvm::Function *called = callInst->getCalledFunction();
|
|
|
|
int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
|
|
MSInfo *info = NULL;
|
|
for (int i = 0; i < nMSFuncs; ++i) {
|
|
if (msInfo[i].func != NULL && called == msInfo[i].func) {
|
|
info = &msInfo[i];
|
|
break;
|
|
}
|
|
}
|
|
if (info == NULL)
|
|
return false;
|
|
|
|
// Got one; grab the operands
|
|
llvm::Value *lvalue = callInst->getArgOperand(0);
|
|
llvm::Value *rvalue = callInst->getArgOperand(1);
|
|
llvm::Value *mask = callInst->getArgOperand(2);
|
|
|
|
MaskStatus maskStatus = lGetMaskStatus(mask);
|
|
if (maskStatus == MaskStatus::all_off) {
|
|
// Zero mask - no-op, so remove the store completely. (This
|
|
// may in turn lead to being able to optimize out instructions
|
|
// that compute the rvalue...)
|
|
callInst->eraseFromParent();
|
|
return true;
|
|
} else if (maskStatus == MaskStatus::all_on) {
|
|
// The mask is all on, so turn this into a regular store
|
|
llvm::Type *rvalueType = rvalue->getType();
|
|
llvm::Instruction *store = NULL;
|
|
#ifdef ISPC_GENX_ENABLED
|
|
// The InternalLinkage check prevents generation of an SVM store when the
// pointer came from the caller: since it may be allocated in the caller, it
// may live in a register. A possible SVM store is resolved after inlining.
// TODO: problems may arise here in the case of stack calls.
|
|
if (g->target->isGenXTarget() && GetAddressSpace(lvalue) == AddressSpace::External &&
|
|
callInst->getParent()->getParent()->getLinkage() != llvm::GlobalValue::LinkageTypes::InternalLinkage) {
|
|
store = lGenXStoreInst(rvalue, lvalue, callInst);
|
|
} else if (!g->target->isGenXTarget() ||
|
|
(g->target->isGenXTarget() && GetAddressSpace(lvalue) == AddressSpace::Local))
|
|
#endif
|
|
{
|
|
llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
|
|
|
|
lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
|
|
lCopyMetadata(lvalue, callInst);
|
|
store = new llvm::StoreInst(
|
|
rvalue, lvalue, false /* not volatile */,
|
|
llvm::MaybeAlign(g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment() : info->align)
|
|
.valueOrOne());
|
|
}
|
|
if (store != NULL) {
|
|
lCopyMetadata(store, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, store);
|
|
return true;
|
|
}
|
|
#ifdef ISPC_GENX_ENABLED
|
|
} else {
|
|
if (g->target->isGenXTarget() && GetAddressSpace(lvalue) == AddressSpace::External) {
|
|
// In this case we use masked_store, which on the genx target causes scatter usage.
|
|
// Get the source position from the metadata attached to the call
|
|
// instruction so that we can issue PerformanceWarning()s below.
|
|
SourcePos pos;
|
|
bool gotPosition = lGetSourcePosFromMetadata(callInst, &pos);
|
|
if (gotPosition) {
|
|
PerformanceWarning(pos, "Scatter required to store value.");
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static bool lImproveMaskedLoad(llvm::CallInst *callInst, llvm::BasicBlock::iterator iter) {
|
|
struct MLInfo {
|
|
MLInfo(const char *name, const int a) : align(a) {
|
|
func = m->module->getFunction(name);
|
|
Assert(func != NULL);
|
|
}
|
|
llvm::Function *func;
|
|
const int align;
|
|
};
|
|
|
|
llvm::Function *called = callInst->getCalledFunction();
|
|
// TODO: we should use dynamic data structure for MLInfo and fill
|
|
// it differently for GenX and CPU targets. It will also help
|
|
// to avoid declaration of GenX intrinsics for CPU targets.
|
|
// It should be changed seamlessly here and in all similar places in this file.
|
|
MLInfo mlInfo[] = {MLInfo("__masked_load_i8", 1), MLInfo("__masked_load_i16", 2),
|
|
MLInfo("__masked_load_i32", 4), MLInfo("__masked_load_float", 4),
|
|
MLInfo("__masked_load_i64", 8), MLInfo("__masked_load_double", 8)};
|
|
MLInfo genxInfo[] = {MLInfo("__masked_load_i8", 1), MLInfo("__masked_load_i16", 2),
|
|
MLInfo("__masked_load_i32", 4), MLInfo("__masked_load_float", 4),
|
|
MLInfo("__masked_load_i64", 8), MLInfo("__masked_load_double", 8),
|
|
MLInfo("__masked_load_blend_i8", 1), MLInfo("__masked_load_blend_i16", 2),
|
|
MLInfo("__masked_load_blend_i32", 4), MLInfo("__masked_load_blend_float", 4),
|
|
MLInfo("__masked_load_blend_i64", 8), MLInfo("__masked_load_blend_double", 8)};
|
|
MLInfo *info = NULL;
|
|
if (g->target->isGenXTarget()) {
|
|
int nFuncs = sizeof(genxInfo) / sizeof(genxInfo[0]);
|
|
for (int i = 0; i < nFuncs; ++i) {
|
|
if (genxInfo[i].func != NULL && called == genxInfo[i].func) {
|
|
info = &genxInfo[i];
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
int nFuncs = sizeof(mlInfo) / sizeof(mlInfo[0]);
|
|
for (int i = 0; i < nFuncs; ++i) {
|
|
if (mlInfo[i].func != NULL && called == mlInfo[i].func) {
|
|
info = &mlInfo[i];
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (info == NULL)
|
|
return false;
|
|
|
|
// Got one; grab the operands
|
|
llvm::Value *ptr = callInst->getArgOperand(0);
|
|
llvm::Value *mask = callInst->getArgOperand(1);
|
|
|
|
MaskStatus maskStatus = lGetMaskStatus(mask);
|
|
if (maskStatus == MaskStatus::all_off) {
|
|
// Zero mask - no-op, so replace the load with an undef value
|
|
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), iter, llvm::UndefValue::get(callInst->getType()));
|
|
return true;
|
|
} else if (maskStatus == MaskStatus::all_on) {
|
|
// The mask is all on, so turn this into a regular load
|
|
llvm::Instruction *load = NULL;
|
|
#ifdef ISPC_GENX_ENABLED
|
|
// The InternalLinkage check prevents generation of an SVM load when the
// pointer came from the caller: since it may be allocated in the caller, it
// may live in a register. A possible SVM load is resolved after inlining.
// TODO: problems may arise here in the case of stack calls.
|
|
if (g->target->isGenXTarget() && GetAddressSpace(ptr) == AddressSpace::External &&
|
|
callInst->getParent()->getParent()->getLinkage() != llvm::GlobalValue::LinkageTypes::InternalLinkage) {
|
|
load = lGenXLoadInst(ptr, callInst->getType(), callInst);
|
|
} else if (!g->target->isGenXTarget() ||
|
|
(g->target->isGenXTarget() && GetAddressSpace(ptr) == AddressSpace::Local))
|
|
#endif
|
|
{
|
|
llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
|
|
ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load", callInst);
|
|
#if ISPC_LLVM_VERSION < ISPC_LLVM_11_0
|
|
load = new llvm::LoadInst(
|
|
ptr, callInst->getName(), false /* not volatile */,
|
|
llvm::MaybeAlign(g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment() : info->align)
|
|
.valueOrOne(),
|
|
(llvm::Instruction *)NULL);
|
|
#else // LLVM 11.0+
|
|
Assert(llvm::isa<llvm::PointerType>(ptr->getType()));
|
|
load = new llvm::LoadInst(
|
|
llvm::dyn_cast<llvm::PointerType>(ptr->getType())->getPointerElementType(), ptr, callInst->getName(),
|
|
false /* not volatile */,
|
|
llvm::MaybeAlign(g->opt.forceAlignedMemory ? g->target->getNativeVectorAlignment() : info->align)
|
|
.valueOrOne(),
|
|
(llvm::Instruction *)NULL);
|
|
#endif
|
|
}
|
|
if (load != NULL) {
|
|
lCopyMetadata(load, callInst);
|
|
llvm::ReplaceInstWithInst(callInst, load);
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool ImproveMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("ImproveMemoryOps");
|
|
|
|
bool modifiedAny = false;
|
|
restart:
|
|
// Iterate through all of the instructions in the basic block.
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
|
|
// If we don't have a call to one of the
|
|
// __pseudo_{gather,scatter}_* functions, then just go on to the
|
|
// next instruction.
|
|
if (callInst == NULL || callInst->getCalledFunction() == NULL)
|
|
continue;
|
|
if (lGSToGSBaseOffsets(callInst)) {
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
if (lGSBaseOffsetsGetMoreConst(callInst)) {
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
if (lGSToLoadStore(callInst)) {
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
if (lImproveMaskedStore(callInst)) {
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
if (lImproveMaskedLoad(callInst, iter)) {
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
|
|
DEBUG_END_PASS("ImproveMemoryOps");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool ImproveMemoryOpsPass::runOnFunction(llvm::Function &F) {
|
|
|
|
llvm::TimeTraceScope FuncScope("ImproveMemoryOpsPass::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateImproveMemoryOpsPass() { return new ImproveMemoryOpsPass; }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// GatherCoalescePass
|
|
|
|
// This pass implements two optimizations to improve the performance of
|
|
// gathers; currently only gathers of 32-bit values where it can be
|
|
// determined at compile time that the mask is all on are supported, though
|
|
// both of those limitations may be generalized in the future.
|
|
//
|
|
// First, for any single gather, see if it's worthwhile to break it into
|
|
// any of scalar, 2-wide (i.e. 64-bit), 4-wide, or 8-wide loads. Further,
|
|
// we generate code that shuffles these loads around. Doing fewer, larger
|
|
// loads in this manner, when possible, can be more efficient.
|
|
//
|
|
// Second, this pass can coalesce memory accesses across multiple
|
|
// gathers. If we have a series of gathers without any memory writes in
|
|
// the middle, then we try to analyze their reads collectively and choose
|
|
// an efficient set of loads for them. Not only does this help if
|
|
// different gathers reuse values from the same location in memory, but
|
|
// it's specifically helpful when data with AOS layout is being accessed;
|
|
// in this case, we're often able to generate wide vector loads and
|
|
// appropriate shuffles automatically.
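//
// As a hedged illustration (hypothetical ispc-style source, not from this
// file):
//
//     struct Point { float x, y, z; };
//     uniform Point pts[1024];
//     float x = pts[index].x, y = pts[index].y, z = pts[index].z;
//
// With AOS data laid out like this, the three 32-bit gathers share a base
// pointer and differ only in their constant offsets, so this pass can
// replace them with a few wide vector loads plus shuffles.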
class GatherCoalescePass : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
GatherCoalescePass() : FunctionPass(ID) {}
|
|
|
|
llvm::StringRef getPassName() const { return "Gather Coalescing"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
bool runOnFunction(llvm::Function &F);
|
|
};
|
|
|
|
char GatherCoalescePass::ID = 0;
|
|
|
|
/** Representation of a memory load that the gather coalescing code has
|
|
decided to generate.
|
|
*/
|
|
struct CoalescedLoadOp {
|
|
CoalescedLoadOp(int64_t s, int c) {
|
|
start = s;
|
|
count = c;
|
|
load = element0 = element1 = NULL;
|
|
}
|
|
|
|
/** Starting offset of the load from the common base pointer (in terms
|
|
of numbers of items of the underlying element type--*not* in terms
|
|
of bytes). */
|
|
int64_t start;
|
|
|
|
/** Number of elements to load at this location */
|
|
int count;
|
|
|
|
/** Value loaded from memory for this load op */
|
|
llvm::Value *load;
|
|
|
|
/** For 2-wide loads (i.e. 64-bit loads), these store the lower and
|
|
upper 32 bits of the result, respectively. */
|
|
llvm::Value *element0, *element1;
|
|
};
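// Hedged example of how the fields are used (see lEmitLoads() below; this
// assumes the little-endian layout of the targets this path supports): a
// CoalescedLoadOp with start == 4 and count == 2 is emitted as one 64-bit
// load covering elements 4 and 5; element0 then holds the low 32 bits
// (element 4) and element1 the high 32 bits (element 5).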
/** This function determines whether it makes sense (and is safe) to
|
|
generate a vector load of width vectorWidth, starting at *iter. It
|
|
returns true if so, setting *newIter to point to the next element in
|
|
the set that isn't taken care of by the generated load. If a vector
|
|
load of the given width doesn't make sense, then false is returned.
|
|
*/
|
|
static bool lVectorLoadIsEfficient(std::set<int64_t>::iterator iter, std::set<int64_t>::iterator end,
|
|
std::set<int64_t>::iterator *newIter, int vectorWidth) {
|
|
// We're considering a vector load of width vectorWidth, starting at
|
|
// the offset "start".
|
|
int64_t start = *iter;
|
|
|
|
// The basic idea is that we'll look at the subsequent elements in the
|
|
// load set after the initial one at start. As long as subsequent
|
|
// elements:
|
|
//
|
|
// 1. Aren't so far separated that they no longer fit into the range
|
|
// [start, start+vectorWidth)
|
|
//
|
|
// 2. And don't have too large a gap in between them (e.g., it's not
|
|
// worth generating an 8-wide load for two elements with offsets 0
|
|
// and 7, but no loads requested in between).
|
|
//
|
|
// Then we continue moving forward through the elements until we either
|
|
// fill up the vector or run out of elements.
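// Two small worked examples with made-up offsets: with vectorWidth == 4
// and offsets {0, 1, 2, 3}, the element at 3 makes the span equal to the
// vector width, so the 4-wide load is accepted. With offsets {0, 7}, the
// gap of 7 between consecutive offsets exceeds 3, so the load is rejected
// and the caller falls back to narrower (or scalar) loads.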
// lastAccepted holds the last offset we've processed and accepted as
|
|
// valid for the vector load under consideration
|
|
int64_t lastAccepted = start;
|
|
|
|
while (iter != end) {
|
|
// What is the separation in offset values from the last element we
|
|
// added to the set for this load?
|
|
int64_t delta = *iter - lastAccepted;
|
|
if (delta > 3)
|
|
// If there's too big a gap, then we won't issue the load
|
|
return false;
|
|
|
|
int64_t span = *iter - start + 1;
|
|
|
|
if (span == vectorWidth) {
|
|
// We've extended far enough that we have exactly filled up the
|
|
// entire vector width; we can't go any further, so return with
|
|
// success. (Update *newIter to point at the next element
|
|
// after the last one accepted here.)
|
|
*newIter = ++iter;
|
|
return true;
|
|
} else if (span > vectorWidth) {
|
|
// The current offset won't fit into a vectorWidth-wide load
|
|
// starting from start. It's still generally worthwhile
|
|
// issuing the load we've been considering, though, since it
|
|
// will provide values for a number of previous offsets. This
|
|
// load will have one or more elements at the end of its range
|
|
// that is not needed by any of the offsets under
|
|
// consideration. As such, there are three cases where issuing
|
|
// this load is a bad idea:
|
|
//
|
|
// 1. 2-wide loads: we know that we haven't completely filled
|
|
// the 2-wide vector, since otherwise the if() test above
|
|
// would have succeeded previously. Therefore, we must have
|
|
// a situation with offsets like (4,6,...); it would be a
|
|
// silly idea to issue a 2-wide load to get the value for
|
|
// the 4 offset, versus failing here and issuing a scalar
|
|
// load instead.
|
|
//
|
|
// 2. If there are too many unnecessary values at the end of
|
|
// the load extent (defined as more than half of them)--in
|
|
// this case, it'd be better to issue a vector load of
|
|
// smaller width anyway.
|
|
//
|
|
// 3. If the gap between the last accepted offset and the
|
|
// current one under consideration is more than the page
|
|
// size. In this case we can't be sure whether or not some
|
|
// of the unused elements at the end of the load will
|
|
// straddle a page boundary and thus lead to an undesirable
|
|
// fault. (It's hard to imagine this happening in practice,
|
|
// except under contrived circumstances, but better safe
|
|
// than sorry.)
|
|
const int pageSize = 4096;
|
|
if (vectorWidth != 2 && (lastAccepted - start) > (vectorWidth / 2) && (*iter - lastAccepted) < pageSize) {
|
|
*newIter = iter;
|
|
return true;
|
|
} else
|
|
return false;
|
|
}
|
|
|
|
// Continue moving forward
|
|
lastAccepted = *iter;
|
|
++iter;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/** Given a set of offsets from a common base pointer whose values we need
    loaded from memory, determine a reasonable set of load operations that
    loads all of the corresponding values (ideally using as many wider
    vector loads as possible rather than scalar loads). Return a
    CoalescedLoadOp for each one in the *loads array.
*/
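// For instance, with the illustrative offsets {0, 1, 2, 3, 9, 10} the
// selection below produces a 4-wide load starting at offset 0 (covering
// 0..3) followed by a 2-wide load starting at offset 9 (covering 9 and 10).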
static void lSelectLoads(const std::vector<int64_t> &loadOffsets, std::vector<CoalescedLoadOp> *loads) {
|
|
// First, get a sorted set of unique offsets to load from.
|
|
std::set<int64_t> allOffsets;
|
|
for (unsigned int i = 0; i < loadOffsets.size(); ++i)
|
|
allOffsets.insert(loadOffsets[i]);
|
|
|
|
std::set<int64_t>::iterator iter = allOffsets.begin();
|
|
while (iter != allOffsets.end()) {
|
|
Debug(SourcePos(), "Load needed at %" PRId64 ".", *iter);
|
|
++iter;
|
|
}
|
|
|
|
// Now, iterate over the offsets from low to high. Starting at the
|
|
// current offset, we see if a vector load starting from that offset
|
|
// will cover loads at subsequent offsets as well.
|
|
iter = allOffsets.begin();
|
|
while (iter != allOffsets.end()) {
|
|
// Consider vector loads of the width of each of the elements of
// vectorWidths[], in order.
|
|
int vectorWidths[] = {8, 4, 2};
|
|
int nVectorWidths = sizeof(vectorWidths) / sizeof(vectorWidths[0]);
|
|
bool gotOne = false;
|
|
for (int i = 0; i < nVectorWidths; ++i) {
|
|
// See if a vector load of width vectorWidths[i] would be
|
|
// effective (i.e. would cover a reasonable number of the
|
|
// offsets that need to be loaded from).
|
|
std::set<int64_t>::iterator newIter;
|
|
if (lVectorLoadIsEfficient(iter, allOffsets.end(), &newIter, vectorWidths[i])) {
|
|
// Yes: create the corresponding coalesced load and update
|
|
// the iterator to the returned iterator; doing so skips
|
|
// over the additional offsets that are taken care of by
|
|
// this load.
|
|
loads->push_back(CoalescedLoadOp(*iter, vectorWidths[i]));
|
|
iter = newIter;
|
|
gotOne = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (gotOne == false) {
|
|
// We couldn't find a vector load starting from this offset
|
|
// that made sense, so emit a scalar load and continue onward.
|
|
loads->push_back(CoalescedLoadOp(*iter, 1));
|
|
++iter;
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Print a performance message with the details of the result of
|
|
coalescing over a group of gathers. */
|
|
static void lCoalescePerfInfo(const std::vector<llvm::CallInst *> &coalesceGroup,
|
|
const std::vector<CoalescedLoadOp> &loadOps) {
|
|
SourcePos pos;
|
|
lGetSourcePosFromMetadata(coalesceGroup[0], &pos);
|
|
|
|
// Create a string that indicates the line numbers of the subsequent
|
|
// gathers from the first one that were coalesced here.
|
|
char otherPositions[512];
|
|
otherPositions[0] = '\0';
|
|
if (coalesceGroup.size() > 1) {
|
|
const char *plural = (coalesceGroup.size() > 2) ? "s" : "";
|
|
char otherBuf[32];
|
|
snprintf(otherBuf, sizeof(otherBuf), "(other%s at line%s ", plural, plural);
|
|
strncat(otherPositions, otherBuf, sizeof(otherPositions) - strlen(otherPositions) - 1);
|
|
|
|
for (int i = 1; i < (int)coalesceGroup.size(); ++i) {
|
|
SourcePos p;
|
|
bool ok = lGetSourcePosFromMetadata(coalesceGroup[i], &p);
|
|
if (ok) {
|
|
char buf[32];
|
|
snprintf(buf, sizeof(buf), "%d", p.first_line);
|
|
strncat(otherPositions, buf, sizeof(otherPositions) - strlen(otherPositions) - 1);
|
|
if (i < (int)coalesceGroup.size() - 1)
|
|
strncat(otherPositions, ", ", sizeof(otherPositions) - strlen(otherPositions) - 1);
|
|
}
|
|
}
|
|
strncat(otherPositions, ") ", sizeof(otherPositions) - strlen(otherPositions) - 1);
|
|
}
|
|
|
|
// Count how many loads of each size there were.
|
|
std::map<int, int> loadOpsCount;
|
|
for (int i = 0; i < (int)loadOps.size(); ++i)
|
|
++loadOpsCount[loadOps[i].count];
|
|
|
|
// Generate a string that describes the mix of load ops
|
|
char loadOpsInfo[512];
|
|
loadOpsInfo[0] = '\0';
|
|
std::map<int, int>::const_iterator iter = loadOpsCount.begin();
|
|
while (iter != loadOpsCount.end()) {
|
|
char buf[32];
|
|
snprintf(buf, sizeof(buf), "%d x %d-wide", iter->second, iter->first);
|
|
if ((strlen(loadOpsInfo) + strlen(buf)) >= 512) {
|
|
break;
|
|
}
|
|
strncat(loadOpsInfo, buf, sizeof(loadOpsInfo) - strlen(loadOpsInfo) - 1);
|
|
++iter;
|
|
if (iter != loadOpsCount.end())
|
|
strncat(loadOpsInfo, ", ", sizeof(loadOpsInfo) - strlen(loadOpsInfo) - 1);
|
|
}
|
|
|
|
if (g->opt.level > 0) {
|
|
if (coalesceGroup.size() == 1)
|
|
PerformanceWarning(pos, "Coalesced gather into %d load%s (%s).", (int)loadOps.size(),
|
|
(loadOps.size() > 1) ? "s" : "", loadOpsInfo);
|
|
else
|
|
PerformanceWarning(pos,
|
|
"Coalesced %d gathers starting here %sinto %d "
|
|
"load%s (%s).",
|
|
(int)coalesceGroup.size(), otherPositions, (int)loadOps.size(),
|
|
(loadOps.size() > 1) ? "s" : "", loadOpsInfo);
|
|
}
|
|
}
|
|
|
|
/** Utility routine that computes an offset from a base pointer and then
|
|
returns the result of a load of the given type from the resulting
|
|
location:
|
|
|
|
return *((type *)(basePtr + offset))
|
|
*/
|
|
llvm::Value *lGEPAndLoad(llvm::Value *basePtr, int64_t offset, int align, llvm::Instruction *insertBefore,
|
|
llvm::Type *type) {
|
|
llvm::Value *ptr = lGEPInst(basePtr, LLVMInt64(offset), "new_base", insertBefore);
|
|
ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(type, 0), "ptr_cast", insertBefore);
|
|
#if ISPC_LLVM_VERSION < ISPC_LLVM_11_0
|
|
return new llvm::LoadInst(ptr, "gather_load", false /* not volatile */, llvm::MaybeAlign(align), insertBefore);
|
|
#else // LLVM 11.0+
|
|
Assert(llvm::isa<llvm::PointerType>(ptr->getType()));
|
|
return new llvm::LoadInst(llvm::dyn_cast<llvm::PointerType>(ptr->getType())->getPointerElementType(), ptr,
|
|
"gather_load", false /* not volatile */, llvm::MaybeAlign(align).valueOrOne(),
|
|
insertBefore);
|
|
#endif
|
|
}
|
|
|
|
/* Having decided that we're going to emit a series of loads, as encoded in
|
|
the loadOps array, this function emits the corresponding load
|
|
instructions.
|
|
*/
|
|
static void lEmitLoads(llvm::Value *basePtr, std::vector<CoalescedLoadOp> &loadOps, int elementSize,
|
|
llvm::Instruction *insertBefore) {
|
|
Debug(SourcePos(), "Coalesce doing %d loads.", (int)loadOps.size());
|
|
for (int i = 0; i < (int)loadOps.size(); ++i) {
|
|
Debug(SourcePos(), "Load #%d @ %" PRId64 ", %d items", i, loadOps[i].start, loadOps[i].count);
|
|
|
|
// basePtr is an i8 *, so the offset from it should be in terms of
|
|
// bytes, not underlying i32 elements.
|
|
int64_t start = loadOps[i].start * elementSize;
|
|
|
|
int align = 4;
|
|
switch (loadOps[i].count) {
|
|
case 1:
|
|
// Single 32-bit scalar load
|
|
loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, LLVMTypes::Int32Type);
|
|
break;
|
|
case 2: {
|
|
// Emit 2 x i32 loads as i64 loads and then break the result
|
|
// into two 32-bit parts.
|
|
loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, LLVMTypes::Int64Type);
|
|
// element0 = (int32)value;
|
|
loadOps[i].element0 =
|
|
new llvm::TruncInst(loadOps[i].load, LLVMTypes::Int32Type, "load64_elt0", insertBefore);
|
|
// element1 = (int32)(value >> 32)
|
|
llvm::Value *shift = llvm::BinaryOperator::Create(llvm::Instruction::LShr, loadOps[i].load, LLVMInt64(32),
|
|
"load64_shift", insertBefore);
|
|
loadOps[i].element1 = new llvm::TruncInst(shift, LLVMTypes::Int32Type, "load64_elt1", insertBefore);
|
|
break;
|
|
}
|
|
case 4: {
|
|
// 4-wide vector load
|
|
if (g->opt.forceAlignedMemory) {
|
|
align = g->target->getNativeVectorAlignment();
|
|
}
|
|
llvm::VectorType *vt = LLVMVECTOR::get(LLVMTypes::Int32Type, 4);
|
|
loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, vt);
|
|
break;
|
|
}
|
|
case 8: {
|
|
// 8-wide vector load
|
|
if (g->opt.forceAlignedMemory) {
|
|
align = g->target->getNativeVectorAlignment();
|
|
}
|
|
llvm::VectorType *vt = LLVMVECTOR::get(LLVMTypes::Int32Type, 8);
|
|
loadOps[i].load = lGEPAndLoad(basePtr, start, align, insertBefore, vt);
|
|
break;
|
|
}
|
|
default:
|
|
FATAL("Unexpected load count in lEmitLoads()");
|
|
}
|
|
}
|
|
}
|
|
|
|
/** Convert any loads of 8-wide vectors into two 4-wide vectors
|
|
(logically). This allows the assembly code below to always operate on
|
|
4-wide vectors, which leads to better code. Returns a new vector of
|
|
load operations.
|
|
*/
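// Sketch of the splitting, based on the shuffle masks used below: an
// 8-wide CoalescedLoadOp at start S becomes two 4-wide ops, one at S whose
// load value is a shufflevector selecting lanes <0, 1, 2, 3> of the
// original 8-wide value, and one at S + 4 selecting lanes <4, 5, 6, 7>.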
static std::vector<CoalescedLoadOp> lSplit8WideLoads(const std::vector<CoalescedLoadOp> &loadOps,
|
|
llvm::Instruction *insertBefore) {
|
|
std::vector<CoalescedLoadOp> ret;
|
|
for (unsigned int i = 0; i < loadOps.size(); ++i) {
|
|
if (loadOps[i].count == 8) {
|
|
// Create fake CoalescedLoadOps, where the load llvm::Value is
|
|
// actually a shuffle that pulls either the first 4 or the last
|
|
// 4 values out of the original 8-wide loaded value.
|
|
int32_t shuf[2][4] = {{0, 1, 2, 3}, {4, 5, 6, 7}};
|
|
|
|
ret.push_back(CoalescedLoadOp(loadOps[i].start, 4));
|
|
ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load, shuf[0], 4, insertBefore);
|
|
|
|
ret.push_back(CoalescedLoadOp(loadOps[i].start + 4, 4));
|
|
ret.back().load = LLVMShuffleVectors(loadOps[i].load, loadOps[i].load, shuf[1], 4, insertBefore);
|
|
} else
|
|
ret.push_back(loadOps[i]);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
/** Given a 1-wide load of a 32-bit value, merge its value into the result
|
|
vector for any and all elements for which it applies.
|
|
*/
|
|
static llvm::Value *lApplyLoad1(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
|
|
llvm::Instruction *insertBefore) {
|
|
for (int elt = 0; elt < 4; ++elt) {
|
|
if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
|
|
Debug(SourcePos(),
|
|
"Load 1 @ %" PRId64 " matches for element #%d "
|
|
"(value %" PRId64 ")",
|
|
load.start, elt, offsets[elt]);
|
|
// If this load gives one of the values that we need, then we
|
|
// can just insert it in directly
|
|
Assert(set[elt] == false);
|
|
result = llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt), "insert_load", insertBefore);
|
|
set[elt] = true;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/** Similarly, incorporate the values from a 2-wide load into any vector
|
|
elements that they apply to. */
|
|
static llvm::Value *lApplyLoad2(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
|
|
llvm::Instruction *insertBefore) {
|
|
int elt = 0;
|
|
while (elt < 4) {
|
|
// First, try to do a 64-bit-wide insert into the result vector.
|
|
// We can do this when we're currently at an even element, when the
|
|
// current and next element have consecutive values, and where the
|
|
// original 64-bit load is at the offset needed by the current
|
|
// element.
|
|
if ((elt & 1) == 0 && offsets[elt] + 1 == offsets[elt + 1] && offsets[elt] == load.start) {
|
|
Debug(SourcePos(),
|
|
"Load 2 @ %" PRId64 " matches for elements #%d,%d "
|
|
"(values %" PRId64 ",%" PRId64 ")",
|
|
load.start, elt, elt + 1, offsets[elt], offsets[elt + 1]);
|
|
Assert(set[elt] == false && ((elt < 3) && set[elt + 1] == false));
|
|
|
|
// In this case, we bitcast from a 4xi32 to a 2xi64 vector
|
|
llvm::Type *vec2x64Type = LLVMVECTOR::get(LLVMTypes::Int64Type, 2);
|
|
result = new llvm::BitCastInst(result, vec2x64Type, "to2x64", insertBefore);
|
|
|
|
// And now we can insert the 64-bit wide value into the
|
|
// appropriate element
|
|
result = llvm::InsertElementInst::Create(result, load.load, LLVMInt32(elt / 2), "insert64", insertBefore);
|
|
|
|
// And back to 4xi32.
|
|
llvm::Type *vec4x32Type = LLVMVECTOR::get(LLVMTypes::Int32Type, 4);
|
|
result = new llvm::BitCastInst(result, vec4x32Type, "to4x32", insertBefore);
|
|
|
|
set[elt] = true;
|
|
if (elt < 3) {
|
|
set[elt + 1] = true;
|
|
}
|
|
// Advance elt one extra time, since we just took care of two
|
|
// elements
|
|
++elt;
|
|
} else if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
|
|
Debug(SourcePos(),
|
|
"Load 2 @ %" PRId64 " matches for element #%d "
|
|
"(value %" PRId64 ")",
|
|
load.start, elt, offsets[elt]);
|
|
// Otherwise, insert one of the 32-bit pieces into an element
|
|
// of the final vector
|
|
Assert(set[elt] == false);
|
|
llvm::Value *toInsert = (offsets[elt] == load.start) ? load.element0 : load.element1;
|
|
result = llvm::InsertElementInst::Create(result, toInsert, LLVMInt32(elt), "insert_load", insertBefore);
|
|
set[elt] = true;
|
|
}
|
|
++elt;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
#if 1
|
|
/* This approach works better with AVX, while the #else path generates
|
|
slightly better code with SSE. Need to continue to dig into performance
|
|
details with this stuff in general... */
|
|
|
|
/** And handle a 4-wide load */
|
|
static llvm::Value *lApplyLoad4(llvm::Value *result, const CoalescedLoadOp &load, const int64_t offsets[4], bool set[4],
|
|
llvm::Instruction *insertBefore) {
|
|
// Conceptually, we're going to consider doing a shufflevector with
|
|
// the 4-wide load and the 4-wide result we have so far to generate a
|
|
// new 4-wide vector. We'll start with shuffle indices that just
|
|
// select each element of the result so far for the result.
|
|
int32_t shuf[4] = {4, 5, 6, 7};
|
|
|
|
for (int elt = 0; elt < 4; ++elt) {
|
|
if (offsets[elt] >= load.start && offsets[elt] < load.start + load.count) {
|
|
Debug(SourcePos(),
|
|
"Load 4 @ %" PRId64 " matches for element #%d "
|
|
"(value %" PRId64 ")",
|
|
load.start, elt, offsets[elt]);
|
|
|
|
// If the current element falls within the range of locations
|
|
// that the 4-wide load covers, then compute the appropriate
|
|
// shuffle index that extracts the appropriate element from the
|
|
// load.
|
|
Assert(set[elt] == false);
|
|
shuf[elt] = int32_t(offsets[elt] - load.start);
|
|
set[elt] = true;
|
|
}
|
|
}
|
|
|
|
// Now, issue a shufflevector instruction if any of the values from the
|
|
// load we just considered were applicable.
|
|
if (shuf[0] != 4 || shuf[1] != 5 || shuf[2] != 6 || shuf[3] != 7)
|
|
result = LLVMShuffleVectors(load.load, result, shuf, 4, insertBefore);
|
|
|
|
return result;
|
|
}
|
|
|
|
/** We need to fill in the values for a 4-wide result vector. This
|
|
function looks at all of the generated loads and extracts the
|
|
appropriate elements from the appropriate loads to assemble the result.
|
|
Here the offsets[] parameter gives the 4 offsets from the base pointer
|
|
for the four elements of the result.
|
|
*/
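// As a small illustration with hypothetical inputs: if offsets[] is
// {0, 1, 2, 3} and loadOps contains a single 4-wide load at start 0, then
// lApplyLoad4() computes shuffle indices {0, 1, 2, 3} (each being
// offsets[elt] - load.start) and a single shufflevector yields the result.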
static llvm::Value *lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps, const int64_t offsets[4],
|
|
llvm::Instruction *insertBefore) {
|
|
llvm::Type *returnType = LLVMVECTOR::get(LLVMTypes::Int32Type, 4);
|
|
llvm::Value *result = llvm::UndefValue::get(returnType);
|
|
|
|
Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
|
|
offsets[1], offsets[2], offsets[3]);
|
|
|
|
// Track whether we have found a valid value for each of the four
|
|
// elements of the result
|
|
bool set[4] = {false, false, false, false};
|
|
|
|
// Loop over all of the loads and check each one to see if it provides
|
|
// a value that's applicable to the result
|
|
for (int load = 0; load < (int)loadOps.size(); ++load) {
|
|
const CoalescedLoadOp &li = loadOps[load];
|
|
|
|
switch (li.count) {
|
|
case 1:
|
|
result = lApplyLoad1(result, li, offsets, set, insertBefore);
|
|
break;
|
|
case 2:
|
|
result = lApplyLoad2(result, li, offsets, set, insertBefore);
|
|
break;
|
|
case 4:
|
|
result = lApplyLoad4(result, li, offsets, set, insertBefore);
|
|
break;
|
|
default:
|
|
FATAL("Unexpected load count in lAssemble4Vector()");
|
|
}
|
|
}
|
|
|
|
Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
|
|
offsets[1], offsets[2], offsets[3]);
|
|
|
|
for (int i = 0; i < 4; ++i)
|
|
Assert(set[i] == true);
|
|
|
|
return result;
|
|
}
|
|
|
|
#else
|
|
|
|
static llvm::Value *lApplyLoad4s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
|
|
const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore) {
|
|
int32_t firstMatchElements[4] = {-1, -1, -1, -1};
|
|
const CoalescedLoadOp *firstMatch = NULL;
|
|
|
|
Assert(llvm::isa<llvm::UndefValue>(result));
|
|
|
|
for (int load = 0; load < (int)loadOps.size(); ++load) {
|
|
const CoalescedLoadOp &loadop = loadOps[load];
|
|
if (loadop.count != 4)
|
|
continue;
|
|
|
|
int32_t matchElements[4] = {-1, -1, -1, -1};
|
|
bool anyMatched = false;
|
|
for (int elt = 0; elt < 4; ++elt) {
|
|
if (offsets[elt] >= loadop.start && offsets[elt] < loadop.start + loadop.count) {
|
|
Debug(SourcePos(),
|
|
"Load 4 @ %" PRId64 " matches for element #%d "
|
|
"(value %" PRId64 ")",
|
|
loadop.start, elt, offsets[elt]);
|
|
anyMatched = true;
|
|
Assert(set[elt] == false);
|
|
matchElements[elt] = offsets[elt] - loadop.start;
|
|
set[elt] = true;
|
|
}
|
|
}
|
|
|
|
if (anyMatched) {
|
|
if (llvm::isa<llvm::UndefValue>(result)) {
|
|
if (firstMatch == NULL) {
|
|
firstMatch = &loadop;
|
|
for (int i = 0; i < 4; ++i)
|
|
firstMatchElements[i] = matchElements[i];
|
|
} else {
|
|
int32_t shuffle[4] = {-1, -1, -1, -1};
|
|
for (int i = 0; i < 4; ++i) {
|
|
if (firstMatchElements[i] != -1)
|
|
shuffle[i] = firstMatchElements[i];
|
|
else
|
|
shuffle[i] = 4 + matchElements[i];
|
|
}
|
|
result = LLVMShuffleVectors(firstMatch->load, loadop.load, shuffle, 4, insertBefore);
|
|
firstMatch = NULL;
|
|
}
|
|
} else {
|
|
int32_t shuffle[4] = {-1, -1, -1, -1};
|
|
for (int i = 0; i < 4; ++i) {
|
|
if (matchElements[i] != -1)
|
|
shuffle[i] = 4 + matchElements[i];
|
|
else
|
|
shuffle[i] = i;
|
|
}
|
|
result = LLVMShuffleVectors(result, loadop.load, shuffle, 4, insertBefore);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (firstMatch != NULL && llvm::isa<llvm::UndefValue>(result))
|
|
return LLVMShuffleVectors(firstMatch->load, result, firstMatchElements, 4, insertBefore);
|
|
else
|
|
return result;
|
|
}
|
|
|
|
static llvm::Value *lApplyLoad12s(llvm::Value *result, const std::vector<CoalescedLoadOp> &loadOps,
|
|
const int64_t offsets[4], bool set[4], llvm::Instruction *insertBefore) {
|
|
// Loop over all of the loads and check each one to see if it provides
|
|
// a value that's applicable to the result
|
|
for (int load = 0; load < (int)loadOps.size(); ++load) {
|
|
const CoalescedLoadOp &loadop = loadOps[load];
|
|
Assert(loadop.count == 1 || loadop.count == 2 || loadop.count == 4);
|
|
|
|
if (loadop.count == 1)
|
|
result = lApplyLoad1(result, loadop, offsets, set, insertBefore);
|
|
else if (loadop.count == 2)
|
|
result = lApplyLoad2(result, loadop, offsets, set, insertBefore);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/** We need to fill in the values for a 4-wide result vector. This
|
|
function looks at all of the generated loads and extracts the
|
|
appropriate elements from the appropriate loads to assemble the result.
|
|
Here the offsets[] parameter gives the 4 offsets from the base pointer
|
|
for the four elements of the result.
|
|
*/
|
|
static llvm::Value *lAssemble4Vector(const std::vector<CoalescedLoadOp> &loadOps, const int64_t offsets[4],
|
|
llvm::Instruction *insertBefore) {
|
|
llvm::Type *returnType = LLVMVECTOR::get(LLVMTypes::Int32Type, 4);
|
|
llvm::Value *result = llvm::UndefValue::get(returnType);
|
|
|
|
Debug(SourcePos(), "Starting search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
|
|
offsets[1], offsets[2], offsets[3]);
|
|
|
|
// Track whether we have found a valid value for each of the four
|
|
// elements of the result
|
|
bool set[4] = {false, false, false, false};
|
|
|
|
result = lApplyLoad4s(result, loadOps, offsets, set, insertBefore);
|
|
result = lApplyLoad12s(result, loadOps, offsets, set, insertBefore);
|
|
|
|
Debug(SourcePos(), "Done with search for loads [%" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "].", offsets[0],
|
|
offsets[1], offsets[2], offsets[3]);
|
|
|
|
for (int i = 0; i < 4; ++i)
|
|
Assert(set[i] == true);
|
|
|
|
return result;
|
|
}
|
|
#endif
|
|
|
|
/** Given the set of loads that we've done and the set of result values to
|
|
be computed, this function computes the final llvm::Value *s for each
|
|
result vector.
|
|
*/
|
|
static void lAssembleResultVectors(const std::vector<CoalescedLoadOp> &loadOps,
|
|
const std::vector<int64_t> &constOffsets, std::vector<llvm::Value *> &results,
|
|
llvm::Instruction *insertBefore) {
|
|
// We work on 4-wide chunks of the final values, even when we're
|
|
// computing 8-wide or 16-wide vectors. This gives better code from
|
|
// LLVM's SSE/AVX code generators.
|
|
Assert((constOffsets.size() % 4) == 0);
|
|
std::vector<llvm::Value *> vec4s;
|
|
for (int i = 0; i < (int)constOffsets.size(); i += 4)
|
|
vec4s.push_back(lAssemble4Vector(loadOps, &constOffsets[i], insertBefore));
|
|
|
|
// And now concatenate 1, 2, or 4 of the 4-wide vectors computed above
|
|
// into 4, 8, or 16-wide final result vectors.
|
|
int numGathers = constOffsets.size() / g->target->getVectorWidth();
|
|
for (int i = 0; i < numGathers; ++i) {
|
|
llvm::Value *result = NULL;
|
|
switch (g->target->getVectorWidth()) {
|
|
case 4:
|
|
result = vec4s[i];
|
|
break;
|
|
case 8:
|
|
result = LLVMConcatVectors(vec4s[2 * i], vec4s[2 * i + 1], insertBefore);
|
|
break;
|
|
case 16: {
|
|
llvm::Value *v1 = LLVMConcatVectors(vec4s[4 * i], vec4s[4 * i + 1], insertBefore);
|
|
llvm::Value *v2 = LLVMConcatVectors(vec4s[4 * i + 2], vec4s[4 * i + 3], insertBefore);
|
|
result = LLVMConcatVectors(v1, v2, insertBefore);
|
|
break;
|
|
}
|
|
default:
|
|
FATAL("Unhandled vector width in lAssembleResultVectors()");
|
|
}
|
|
|
|
results.push_back(result);
|
|
}
|
|
}
|
|
|
|
/** Given a call to a gather function, extract the base pointer, the 2/4/8
    scale, and the first varying offsets value, and use them to compute the
    scalar base pointer that is shared by all of the gathers in the group.
    (Thus, this base pointer plus the constant offsets term for each gather
    gives the set of addresses to use for each gather.)
|
|
*/
|
|
static llvm::Value *lComputeBasePtr(llvm::CallInst *gatherInst, llvm::Instruction *insertBefore) {
|
|
llvm::Value *basePtr = gatherInst->getArgOperand(0);
|
|
llvm::Value *variableOffsets = gatherInst->getArgOperand(1);
|
|
llvm::Value *offsetScale = gatherInst->getArgOperand(2);
|
|
|
|
// All of the variable offsets values should be the same, due to
|
|
// checking for this in GatherCoalescePass::runOnBasicBlock(). Thus,
|
|
// extract the first value and use that as a scalar.
|
|
llvm::Value *variable = LLVMExtractFirstVectorElement(variableOffsets);
|
|
Assert(variable != NULL);
|
|
if (variable->getType() == LLVMTypes::Int64Type)
|
|
offsetScale = new llvm::ZExtInst(offsetScale, LLVMTypes::Int64Type, "scale_to64", insertBefore);
|
|
llvm::Value *offset =
|
|
llvm::BinaryOperator::Create(llvm::Instruction::Mul, variable, offsetScale, "offset", insertBefore);
|
|
|
|
return lGEPInst(basePtr, offset, "new_base", insertBefore);
|
|
}
|
|
|
|
/** Extract the constant offsets (from the common base pointer) from each
|
|
of the gathers in a set to be coalesced. These come in as byte
|
|
offsets, but we'll transform them into offsets in terms of the size of
|
|
the base scalar type being gathered. (e.g. for an i32 gather, we might
|
|
have offsets like <0,4,16,20>, which would be transformed to <0,1,4,5>
|
|
here.)
|
|
*/
|
|
static void lExtractConstOffsets(const std::vector<llvm::CallInst *> &coalesceGroup, int elementSize,
|
|
std::vector<int64_t> *constOffsets) {
|
|
int width = g->target->getVectorWidth();
|
|
*constOffsets = std::vector<int64_t>(coalesceGroup.size() * width, 0);
|
|
|
|
int64_t *endPtr = &((*constOffsets)[0]);
|
|
for (int i = 0; i < (int)coalesceGroup.size(); ++i, endPtr += width) {
|
|
llvm::Value *offsets = coalesceGroup[i]->getArgOperand(3);
|
|
int nElts;
|
|
bool ok = LLVMExtractVectorInts(offsets, endPtr, &nElts);
|
|
Assert(ok && nElts == width);
|
|
}
|
|
|
|
for (int i = 0; i < (int)constOffsets->size(); ++i)
|
|
(*constOffsets)[i] /= elementSize;
|
|
}
|
|
|
|
/** Actually do the coalescing. We have a set of gathers all accessing
|
|
addresses of the form:
|
|
|
|
(ptr + {1,2,4,8} * varyingOffset) + constOffset, a.k.a.
|
|
basePtr + constOffset
|
|
|
|
where varyingOffset actually has the same value across all of the SIMD
|
|
lanes and where the part in parenthesis has the same value for all of
|
|
the gathers in the group.
|
|
*/
|
|
static bool lCoalesceGathers(const std::vector<llvm::CallInst *> &coalesceGroup) {
|
|
llvm::Instruction *insertBefore = coalesceGroup[0];
|
|
|
|
// First, compute the shared base pointer for all of the gathers
|
|
llvm::Value *basePtr = lComputeBasePtr(coalesceGroup[0], insertBefore);
|
|
|
|
int elementSize = 0;
|
|
if (coalesceGroup[0]->getType() == LLVMTypes::Int32VectorType ||
|
|
coalesceGroup[0]->getType() == LLVMTypes::FloatVectorType)
|
|
elementSize = 4;
|
|
else if (coalesceGroup[0]->getType() == LLVMTypes::Int64VectorType ||
|
|
coalesceGroup[0]->getType() == LLVMTypes::DoubleVectorType)
|
|
elementSize = 8;
|
|
else
|
|
FATAL("Unexpected gather type in lCoalesceGathers");
|
|
|
|
// Extract the constant offsets from the gathers into the constOffsets
|
|
// vector: the first vectorWidth elements will be those for the first
|
|
// gather, the next vectorWidth those for the next gather, and so
|
|
// forth.
|
|
std::vector<int64_t> constOffsets;
|
|
lExtractConstOffsets(coalesceGroup, elementSize, &constOffsets);
|
|
|
|
// Determine a set of loads to perform to get all of the values we need
|
|
// loaded.
|
|
std::vector<CoalescedLoadOp> loadOps;
|
|
lSelectLoads(constOffsets, &loadOps);
|
|
|
|
lCoalescePerfInfo(coalesceGroup, loadOps);
|
|
|
|
// Actually emit load instructions for them
|
|
lEmitLoads(basePtr, loadOps, elementSize, insertBefore);
|
|
|
|
// Now, for any loads that give us <8 x i32> vectors, split their
|
|
// values into two <4 x i32> vectors; it turns out that LLVM gives us
|
|
// better code on AVX when we assemble the pieces from 4-wide vectors.
|
|
loadOps = lSplit8WideLoads(loadOps, insertBefore);
|
|
|
|
// Given all of these chunks of values, shuffle together a vector that
|
|
// gives us each result value; the i'th element of results[] gives the
|
|
// result for the i'th gather in coalesceGroup.
|
|
std::vector<llvm::Value *> results;
|
|
lAssembleResultVectors(loadOps, constOffsets, results, insertBefore);
|
|
|
|
// Finally, replace each of the original gathers with the instruction
|
|
// that gives the value from the coalescing process.
|
|
Assert(results.size() == coalesceGroup.size());
|
|
for (int i = 0; i < (int)results.size(); ++i) {
|
|
llvm::Instruction *ir = llvm::dyn_cast<llvm::Instruction>(results[i]);
|
|
Assert(ir != NULL);
|
|
|
|
llvm::Type *origType = coalesceGroup[i]->getType();
|
|
if (origType != ir->getType())
|
|
ir = new llvm::BitCastInst(ir, origType, ir->getName(), coalesceGroup[i]);
|
|
|
|
// Previously, all of the instructions to compute the final result
|
|
// were inserted into the basic block here; now we remove the very last one
|
|
// of them (that holds the final result) from the basic block.
|
|
// This way, the following ReplaceInstWithInst() call will operate
|
|
// successfully. (It expects that the second argument not be in any
|
|
// basic block.)
|
|
ir->removeFromParent();
|
|
|
|
llvm::ReplaceInstWithInst(coalesceGroup[i], ir);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/** Given an instruction, returns true if the instruction may write to
|
|
memory. This is a conservative test in that it may return true for
|
|
some instructions that don't actually end up writing to memory, but
|
|
should never return false for an instruction that does write to
|
|
memory. */
|
|
static bool lInstructionMayWriteToMemory(llvm::Instruction *inst) {
|
|
if (llvm::isa<llvm::StoreInst>(inst) || llvm::isa<llvm::AtomicRMWInst>(inst) ||
|
|
llvm::isa<llvm::AtomicCmpXchgInst>(inst))
|
|
// FIXME: we could be less conservative and try to allow stores if
|
|
// we are sure that the pointers don't overlap..
|
|
return true;
|
|
|
|
// Otherwise, any call instruction that doesn't have an attribute
|
|
// indicating it won't write to memory has to be treated as a potential
|
|
// store.
|
|
llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst);
|
|
if (ci != NULL) {
|
|
llvm::Function *calledFunc = ci->getCalledFunction();
|
|
if (calledFunc == NULL)
|
|
return true;
|
|
|
|
if (calledFunc->onlyReadsMemory() || calledFunc->doesNotAccessMemory())
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
bool GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("GatherCoalescePass");
|
|
|
|
llvm::Function *gatherFuncs[] = {
|
|
m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"),
|
|
m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"),
|
|
m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"),
|
|
m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"),
|
|
};
|
|
int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]);
|
|
|
|
bool modifiedAny = false;
|
|
|
|
restart:
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
// Iterate over all of the instructions and look for calls to
|
|
// __pseudo_gather_factored_base_offsets{32,64}_{i32,float} calls.
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
|
|
if (callInst == NULL)
|
|
continue;
|
|
|
|
llvm::Function *calledFunc = callInst->getCalledFunction();
|
|
if (calledFunc == NULL)
|
|
continue;
|
|
|
|
int i;
|
|
for (i = 0; i < nGatherFuncs; ++i)
|
|
if (gatherFuncs[i] != NULL && calledFunc == gatherFuncs[i])
|
|
break;
|
|
if (i == nGatherFuncs)
|
|
// Doesn't match any of the types of gathers we care about
|
|
continue;
|
|
|
|
SourcePos pos;
|
|
lGetSourcePosFromMetadata(callInst, &pos);
|
|
Debug(pos, "Checking for coalescable gathers starting here...");
|
|
|
|
llvm::Value *base = callInst->getArgOperand(0);
|
|
llvm::Value *variableOffsets = callInst->getArgOperand(1);
|
|
llvm::Value *offsetScale = callInst->getArgOperand(2);
|
|
llvm::Value *mask = callInst->getArgOperand(4);
|
|
|
|
// To apply this optimization, we need a set of one or more gathers
|
|
// that fulfill the following conditions:
|
|
//
|
|
// - Mask all on
|
|
// - The variable offsets to all have the same value (i.e., to be
|
|
// uniform).
|
|
// - Same base pointer, variable offsets, and offset scale (for
|
|
// more than one gather)
|
|
//
|
|
// Then and only then do we have a common base pointer with all
|
|
// offsets from that constants (in which case we can potentially
|
|
// coalesce).
|
|
if (lGetMaskStatus(mask) != MaskStatus::all_on)
|
|
continue;
|
|
|
|
if (!LLVMVectorValuesAllEqual(variableOffsets))
|
|
continue;
|
|
|
|
// coalesceGroup stores the set of gathers that we're going to try to
|
|
// coalesce over
|
|
std::vector<llvm::CallInst *> coalesceGroup;
|
|
coalesceGroup.push_back(callInst);
|
|
|
|
// Start iterating at the instruction after the initial gather;
|
|
// look at the remainder of instructions in the basic block (up
|
|
// until we reach a write to memory) to try to find any other
|
|
// gathers that can coalesce with this one.
|
|
llvm::BasicBlock::iterator fwdIter = iter;
|
|
++fwdIter;
|
|
for (; fwdIter != bb.end(); ++fwdIter) {
|
|
// Must stop once we come to an instruction that may write to
|
|
// memory; otherwise we could end up moving a read before this
|
|
// write.
|
|
if (lInstructionMayWriteToMemory(&*fwdIter))
|
|
break;
|
|
|
|
llvm::CallInst *fwdCall = llvm::dyn_cast<llvm::CallInst>(&*fwdIter);
|
|
if (fwdCall == NULL || fwdCall->getCalledFunction() != calledFunc)
|
|
continue;
|
|
|
|
SourcePos fwdPos;
|
|
// TODO: need to redesign metadata attached to pseudo calls,
|
|
// LLVM drops metadata frequently and it results in bad disgnostics.
|
|
lGetSourcePosFromMetadata(fwdCall, &fwdPos);
|
|
|
|
#ifndef ISPC_NO_DUMPS
|
|
if (g->debugPrint) {
|
|
if (base != fwdCall->getArgOperand(0)) {
|
|
Debug(fwdPos, "base pointers mismatch");
|
|
LLVMDumpValue(base);
|
|
LLVMDumpValue(fwdCall->getArgOperand(0));
|
|
}
|
|
if (variableOffsets != fwdCall->getArgOperand(1)) {
|
|
Debug(fwdPos, "varying offsets mismatch");
|
|
LLVMDumpValue(variableOffsets);
|
|
LLVMDumpValue(fwdCall->getArgOperand(1));
|
|
}
|
|
if (offsetScale != fwdCall->getArgOperand(2)) {
|
|
Debug(fwdPos, "offset scales mismatch");
|
|
LLVMDumpValue(offsetScale);
|
|
LLVMDumpValue(fwdCall->getArgOperand(2));
|
|
}
|
|
if (mask != fwdCall->getArgOperand(4)) {
|
|
Debug(fwdPos, "masks mismatch");
|
|
LLVMDumpValue(mask);
|
|
LLVMDumpValue(fwdCall->getArgOperand(4));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if (base == fwdCall->getArgOperand(0) && variableOffsets == fwdCall->getArgOperand(1) &&
|
|
offsetScale == fwdCall->getArgOperand(2) && mask == fwdCall->getArgOperand(4)) {
|
|
Debug(fwdPos, "This gather can be coalesced.");
|
|
coalesceGroup.push_back(fwdCall);
|
|
|
|
if (coalesceGroup.size() == 4)
|
|
// FIXME: untested heuristic: don't try to coalesce
|
|
// over a window of more than 4 gathers, so that we
|
|
// don't cause too much register pressure and end up
|
|
// spilling to memory anyway.
|
|
break;
|
|
} else
|
|
Debug(fwdPos, "This gather doesn't match the initial one.");
|
|
}
|
|
|
|
Debug(pos, "Done with checking for matching gathers");
|
|
|
|
// Now that we have a group of gathers, see if we can coalesce them
|
|
// into something more efficient than the original set of gathers.
|
|
if (lCoalesceGathers(coalesceGroup)) {
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
|
|
DEBUG_END_PASS("GatherCoalescePass");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool GatherCoalescePass::runOnFunction(llvm::Function &F) {
|
|
|
|
llvm::TimeTraceScope FuncScope("GatherCoalescePass::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateGatherCoalescePass() { return new GatherCoalescePass; }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// ReplacePseudoMemoryOpsPass
|
|
|
|
/** For any gathers and scatters remaining after the GSToLoadStorePass
|
|
runs, we need to turn them into actual native gathers and scatters.
|
|
This task is handled by the ReplacePseudoMemoryOpsPass here.
|
|
*/
|
|
class ReplacePseudoMemoryOpsPass : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
ReplacePseudoMemoryOpsPass() : FunctionPass(ID) {}
|
|
|
|
llvm::StringRef getPassName() const { return "Replace Pseudo Memory Ops"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
bool runOnFunction(llvm::Function &F);
|
|
};
|
|
|
|
char ReplacePseudoMemoryOpsPass::ID = 0;
|
|
|
|
/** This routine attempts to determine if the given pointer in lvalue is
|
|
pointing to stack-allocated memory. It's conservative in that it
|
|
should never return true for non-stack allocated memory, but may return
|
|
false for memory that actually is stack allocated. The basic strategy
|
|
is to traverse through the operands and see if the pointer originally
|
|
comes from an AllocaInst.
|
|
*/
|
|
static bool lIsSafeToBlend(llvm::Value *lvalue) {
|
|
llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(lvalue);
|
|
if (bc != NULL)
|
|
return lIsSafeToBlend(bc->getOperand(0));
|
|
else {
|
|
llvm::AllocaInst *ai = llvm::dyn_cast<llvm::AllocaInst>(lvalue);
|
|
if (ai) {
|
|
llvm::Type *type = ai->getType();
|
|
llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(type);
|
|
Assert(pt != NULL);
|
|
type = pt->getElementType();
|
|
llvm::ArrayType *at;
|
|
while ((at = llvm::dyn_cast<llvm::ArrayType>(type))) {
|
|
type = at->getElementType();
|
|
}
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
llvm::FixedVectorType *vt = llvm::dyn_cast<llvm::FixedVectorType>(type);
|
|
#else
|
|
llvm::VectorType *vt = llvm::dyn_cast<llvm::VectorType>(type);
|
|
#endif
|
|
return (vt != NULL && (int)vt->getNumElements() == g->target->getVectorWidth());
|
|
} else {
|
|
llvm::GetElementPtrInst *gep = llvm::dyn_cast<llvm::GetElementPtrInst>(lvalue);
|
|
if (gep != NULL)
|
|
return lIsSafeToBlend(gep->getOperand(0));
|
|
else
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
static bool lReplacePseudoMaskedStore(llvm::CallInst *callInst) {
|
|
struct LMSInfo {
|
|
LMSInfo(const char *pname, const char *bname, const char *msname) {
|
|
pseudoFunc = m->module->getFunction(pname);
|
|
blendFunc = m->module->getFunction(bname);
|
|
maskedStoreFunc = m->module->getFunction(msname);
|
|
Assert(pseudoFunc != NULL && blendFunc != NULL && maskedStoreFunc != NULL);
|
|
}
|
|
llvm::Function *pseudoFunc;
|
|
llvm::Function *blendFunc;
|
|
llvm::Function *maskedStoreFunc;
|
|
};
|
|
|
|
LMSInfo msInfo[] = {
|
|
LMSInfo("__pseudo_masked_store_i8", "__masked_store_blend_i8", "__masked_store_i8"),
|
|
LMSInfo("__pseudo_masked_store_i16", "__masked_store_blend_i16", "__masked_store_i16"),
|
|
LMSInfo("__pseudo_masked_store_i32", "__masked_store_blend_i32", "__masked_store_i32"),
|
|
LMSInfo("__pseudo_masked_store_float", "__masked_store_blend_float", "__masked_store_float"),
|
|
LMSInfo("__pseudo_masked_store_i64", "__masked_store_blend_i64", "__masked_store_i64"),
|
|
LMSInfo("__pseudo_masked_store_double", "__masked_store_blend_double", "__masked_store_double")};
|
|
LMSInfo *info = NULL;
|
|
for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
|
|
if (msInfo[i].pseudoFunc != NULL && callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
|
|
info = &msInfo[i];
|
|
break;
|
|
}
|
|
}
|
|
if (info == NULL)
|
|
return false;
|
|
|
|
llvm::Value *lvalue = callInst->getArgOperand(0);
|
|
llvm::Value *rvalue = callInst->getArgOperand(1);
|
|
llvm::Value *mask = callInst->getArgOperand(2);
|
|
|
|
// We need to choose between doing the load + blend + store trick,
|
|
// or serializing the masked store. Even on targets with a native
|
|
// masked store instruction, this is preferable since it lets us
|
|
// keep values in registers rather than going out to the stack.
|
|
bool doBlend = (!g->opt.disableBlendedMaskedStores && lIsSafeToBlend(lvalue));
|
|
|
|
// Generate the call to the appropriate masked store function and
|
|
// replace the __pseudo_* one with it.
|
|
llvm::Function *fms = doBlend ? info->blendFunc : info->maskedStoreFunc;
|
|
llvm::Instruction *inst = lCallInst(fms, lvalue, rvalue, mask, "", callInst);
|
|
lCopyMetadata(inst, callInst);
|
|
|
|
callInst->eraseFromParent();
|
|
return true;
|
|
}
|
|
|
|
static bool lReplacePseudoGS(llvm::CallInst *callInst) {
|
|
struct LowerGSInfo {
|
|
LowerGSInfo(const char *pName, const char *aName, bool ig, bool ip) : isGather(ig), isPrefetch(ip) {
|
|
pseudoFunc = m->module->getFunction(pName);
|
|
actualFunc = m->module->getFunction(aName);
|
|
}
|
|
llvm::Function *pseudoFunc;
|
|
llvm::Function *actualFunc;
|
|
const bool isGather;
|
|
const bool isPrefetch;
|
|
};
|
|
|
|
LowerGSInfo lgsInfo[] = {
|
|
LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true, false),
|
|
LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true, false),
|
|
LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true, false),
|
|
LowerGSInfo("__pseudo_gather32_float", "__gather32_float", true, false),
|
|
LowerGSInfo("__pseudo_gather32_i64", "__gather32_i64", true, false),
|
|
LowerGSInfo("__pseudo_gather32_double", "__gather32_double", true, false),
|
|
|
|
LowerGSInfo("__pseudo_gather64_i8", "__gather64_i8", true, false),
|
|
LowerGSInfo("__pseudo_gather64_i16", "__gather64_i16", true, false),
|
|
LowerGSInfo("__pseudo_gather64_i32", "__gather64_i32", true, false),
|
|
LowerGSInfo("__pseudo_gather64_float", "__gather64_float", true, false),
|
|
LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true, false),
|
|
LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true, false),
|
|
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i8", true, false),
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", "__gather_factored_base_offsets32_i16", true, false),
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i32", true, false),
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_float", true,
|
|
false),
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", "__gather_factored_base_offsets32_i64", true, false),
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", "__gather_factored_base_offsets32_double", true,
|
|
false),
|
|
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i8", true, false),
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", "__gather_factored_base_offsets64_i16", true, false),
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i32", true, false),
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_float", true,
|
|
false),
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", "__gather_factored_base_offsets64_i64", true, false),
|
|
LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", "__gather_factored_base_offsets64_double", true,
|
|
false),
|
|
|
|
LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true, false),
|
|
LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true, false),
|
|
LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true, false),
|
|
LowerGSInfo("__pseudo_gather_base_offsets32_float", "__gather_base_offsets32_float", true, false),
|
|
LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true, false),
|
|
LowerGSInfo("__pseudo_gather_base_offsets32_double", "__gather_base_offsets32_double", true, false),
|
|
|
|
LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true, false),
|
|
LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true, false),
|
|
LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true, false),
|
|
LowerGSInfo("__pseudo_gather_base_offsets64_float", "__gather_base_offsets64_float", true, false),
|
|
LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true, false),
|
|
LowerGSInfo("__pseudo_gather_base_offsets64_double", "__gather_base_offsets64_double", true, false),
|
|
|
|
LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false, false),
|
|
LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false, false),
|
|
LowerGSInfo("__pseudo_scatter32_i32", "__scatter32_i32", false, false),
|
|
LowerGSInfo("__pseudo_scatter32_float", "__scatter32_float", false, false),
|
|
LowerGSInfo("__pseudo_scatter32_i64", "__scatter32_i64", false, false),
|
|
LowerGSInfo("__pseudo_scatter32_double", "__scatter32_double", false, false),
|
|
|
|
LowerGSInfo("__pseudo_scatter64_i8", "__scatter64_i8", false, false),
|
|
LowerGSInfo("__pseudo_scatter64_i16", "__scatter64_i16", false, false),
|
|
LowerGSInfo("__pseudo_scatter64_i32", "__scatter64_i32", false, false),
|
|
LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false, false),
|
|
LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false, false),
|
|
LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false, false),
|
|
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i8", false,
|
|
false),
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", "__scatter_factored_base_offsets32_i16", false,
|
|
false),
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i32", false,
|
|
false),
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_float", false,
|
|
false),
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", "__scatter_factored_base_offsets32_i64", false,
|
|
false),
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", "__scatter_factored_base_offsets32_double",
|
|
false, false),
|
|
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i8", false,
|
|
false),
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", "__scatter_factored_base_offsets64_i16", false,
|
|
false),
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i32", false,
|
|
false),
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_float", false,
|
|
false),
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", "__scatter_factored_base_offsets64_i64", false,
|
|
false),
|
|
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", "__scatter_factored_base_offsets64_double",
|
|
false, false),
|
|
|
|
LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false, false),
|
|
LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false, false),
|
|
LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false, false),
|
|
LowerGSInfo("__pseudo_scatter_base_offsets32_float", "__scatter_base_offsets32_float", false, false),
|
|
LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false, false),
|
|
LowerGSInfo("__pseudo_scatter_base_offsets32_double", "__scatter_base_offsets32_double", false, false),
|
|
|
|
LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false, false),
|
|
LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false, false),
|
|
LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false, false),
|
|
LowerGSInfo("__pseudo_scatter_base_offsets64_float", "__scatter_base_offsets64_float", false, false),
|
|
LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false, false),
|
|
LowerGSInfo("__pseudo_scatter_base_offsets64_double", "__scatter_base_offsets64_double", false, false),
|
|
|
|
LowerGSInfo("__pseudo_prefetch_read_varying_1", "__prefetch_read_varying_1", false, true),
|
|
LowerGSInfo("__pseudo_prefetch_read_varying_1_native", "__prefetch_read_varying_1_native", false, true),
|
|
|
|
LowerGSInfo("__pseudo_prefetch_read_varying_2", "__prefetch_read_varying_2", false, true),
|
|
LowerGSInfo("__pseudo_prefetch_read_varying_2_native", "__prefetch_read_varying_2_native", false, true),
|
|
|
|
LowerGSInfo("__pseudo_prefetch_read_varying_3", "__prefetch_read_varying_3", false, true),
|
|
LowerGSInfo("__pseudo_prefetch_read_varying_3_native", "__prefetch_read_varying_3_native", false, true),
|
|
|
|
LowerGSInfo("__pseudo_prefetch_read_varying_nt", "__prefetch_read_varying_nt", false, true),
|
|
LowerGSInfo("__pseudo_prefetch_read_varying_nt_native", "__prefetch_read_varying_nt_native", false, true),
|
|
};
|
|
|
|
llvm::Function *calledFunc = callInst->getCalledFunction();
|
|
|
|
LowerGSInfo *info = NULL;
|
|
for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
|
|
if (lgsInfo[i].pseudoFunc != NULL && calledFunc == lgsInfo[i].pseudoFunc) {
|
|
info = &lgsInfo[i];
|
|
break;
|
|
}
|
|
}
|
|
if (info == NULL)
|
|
return false;
|
|
|
|
Assert(info->actualFunc != NULL);
|
|
|
|
// Get the source position from the metadata attached to the call
|
|
// instruction so that we can issue PerformanceWarning()s below.
|
|
SourcePos pos;
|
|
bool gotPosition = lGetSourcePosFromMetadata(callInst, &pos);
|
|
|
|
callInst->setCalledFunction(info->actualFunc);
|
|
// Check for alloca and if not alloca - generate __gather and change arguments
|
|
if (gotPosition && (g->target->getVectorWidth() > 1) && (g->opt.level > 0)) {
|
|
if (info->isGather)
|
|
PerformanceWarning(pos, "Gather required to load value.");
|
|
else if (!info->isPrefetch)
|
|
PerformanceWarning(pos, "Scatter required to store value.");
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool ReplacePseudoMemoryOpsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("ReplacePseudoMemoryOpsPass");
|
|
|
|
bool modifiedAny = false;
|
|
|
|
restart:
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
|
|
if (callInst == NULL || callInst->getCalledFunction() == NULL)
|
|
continue;
|
|
|
|
if (lReplacePseudoGS(callInst)) {
|
|
modifiedAny = true;
|
|
goto restart;
|
|
} else if (lReplacePseudoMaskedStore(callInst)) {
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
|
|
DEBUG_END_PASS("ReplacePseudoMemoryOpsPass");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool ReplacePseudoMemoryOpsPass::runOnFunction(llvm::Function &F) {
|
|
|
|
llvm::TimeTraceScope FuncScope("ReplacePseudoMemoryOpsPass::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateReplacePseudoMemoryOpsPass() { return new ReplacePseudoMemoryOpsPass; }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// IsCompileTimeConstantPass
|
|
|
|
/** LLVM IR implementations of target-specific functions may include calls
|
|
to the functions "bool __is_compile_time_constant_*(...)"; these allow
|
|
them to have specialied code paths for where the corresponding value is
|
|
known at compile time. For masks, for example, this allows them to not
|
|
incur the cost of a MOVMSK call at runtime to compute its value in
|
|
cases where the mask value isn't known until runtime.
|
|
|
|
This pass resolves these calls into either 'true' or 'false' values so
|
|
that later optimization passes can operate with these as constants.
|
|
|
|
See stdlib.m4 for a number of uses of this idiom.
|
|
*/
|
|
|
|
class IsCompileTimeConstantPass : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
IsCompileTimeConstantPass(bool last = false) : FunctionPass(ID) { isLastTry = last; }
|
|
|
|
llvm::StringRef getPassName() const { return "Resolve \"is compile time constant\""; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
bool runOnFunction(llvm::Function &F);
|
|
|
|
bool isLastTry;
|
|
};
|
|
|
|
char IsCompileTimeConstantPass::ID = 0;
|
|
|
|
bool IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("IsCompileTimeConstantPass");
|
|
|
|
llvm::Function *funcs[] = {m->module->getFunction("__is_compile_time_constant_mask"),
|
|
m->module->getFunction("__is_compile_time_constant_uniform_int32"),
|
|
m->module->getFunction("__is_compile_time_constant_varying_int32")};
|
|
|
|
bool modifiedAny = false;
|
|
restart:
|
|
for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
|
|
// Iterate through the instructions looking for calls to the
|
|
// __is_compile_time_constant_*() functions
|
|
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
|
|
if (callInst == NULL)
|
|
continue;
|
|
|
|
int j;
|
|
int nFuncs = sizeof(funcs) / sizeof(funcs[0]);
|
|
for (j = 0; j < nFuncs; ++j) {
|
|
if (funcs[j] != NULL && callInst->getCalledFunction() == funcs[j])
|
|
break;
|
|
}
|
|
if (j == nFuncs)
|
|
// not a __is_compile_time_constant_* function
|
|
continue;
|
|
|
|
// This optimization pass can be disabled with both the (poorly
|
|
// named) disableGatherScatterFlattening option and
|
|
// disableMaskAllOnOptimizations.
|
|
if (g->opt.disableGatherScatterFlattening || g->opt.disableMaskAllOnOptimizations) {
|
|
llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
|
|
// Is it a constant? Bingo, turn the call's value into a constant
|
|
// true value.
|
|
llvm::Value *operand = callInst->getArgOperand(0);
|
|
if (llvm::isa<llvm::Constant>(operand)) {
|
|
llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
|
|
// This pass runs multiple times during optimization. Up until the
|
|
// very last time, it only replaces the call with a 'true' if the
|
|
// value is known to be constant and otherwise leaves the call
|
|
// alone, in case further optimization passes can help resolve its
|
|
// value. The last time through, it eventually has to give up, and
|
|
// replaces any remaining ones with 'false' constants.
|
|
if (isLastTry) {
|
|
llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMFalse);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
|
|
DEBUG_END_PASS("IsCompileTimeConstantPass");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool IsCompileTimeConstantPass::runOnFunction(llvm::Function &F) {
|
|
|
|
llvm::TimeTraceScope FuncScope("IsCompileTimeConstantPass::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry) { return new IsCompileTimeConstantPass(isLastTry); }
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// DebugPass
|
|
|
|
/** This pass is added in list of passes after optimizations which
|
|
we want to debug and print dump of LLVM IR in stderr. Also it
|
|
prints name and number of previous optimization.
|
|
*/
|
|
#ifndef ISPC_NO_DUMPS
|
|
class DebugPass : public llvm::ModulePass {
|
|
public:
|
|
static char ID;
|
|
DebugPass(char *output) : ModulePass(ID) { snprintf(str_output, sizeof(str_output), "%s", output); }
|
|
|
|
llvm::StringRef getPassName() const { return "Dump LLVM IR"; }
|
|
bool runOnModule(llvm::Module &m);
|
|
|
|
private:
|
|
char str_output[100];
|
|
};
|
|
|
|
char DebugPass::ID = 0;
|
|
|
|
bool DebugPass::runOnModule(llvm::Module &module) {
|
|
fprintf(stderr, "%s", str_output);
|
|
fflush(stderr);
|
|
module.dump();
|
|
return true;
|
|
}
|
|
|
|
static llvm::Pass *CreateDebugPass(char *output) { return new DebugPass(output); }
|
|
#endif
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// DebugPassFile
|
|
|
|
/** This pass is added in list of passes after optimizations which
|
|
we want to debug and print dump of LLVM IR to file.
|
|
*/
|
|
#ifndef ISPC_NO_DUMPS
|
|
class DebugPassFile : public llvm::ModulePass {
|
|
public:
|
|
static char ID;
|
|
DebugPassFile(int number, llvm::StringRef name) : ModulePass(ID), pnum(number), pname(name) {}
|
|
|
|
llvm::StringRef getPassName() const { return "Dump LLVM IR"; }
|
|
bool runOnModule(llvm::Module &m);
|
|
bool doInitialization(llvm::Module &m);
|
|
|
|
private:
|
|
void run(llvm::Module &m, bool init);
|
|
int pnum;
|
|
llvm::StringRef pname;
|
|
};
|
|
|
|
char DebugPassFile::ID = 0;
|
|
|
|
/**
|
|
* Strips all non-alphanumeric characters from given string.
|
|
*/
|
|
std::string sanitize(std::string in) {
|
|
llvm::Regex r("[^[:alnum:]]");
|
|
while (r.match(in))
|
|
in = r.sub("", in);
|
|
return in;
|
|
}
|
|
|
|
void DebugPassFile::run(llvm::Module &module, bool init) {
|
|
std::error_code EC;
|
|
char fname[100];
|
|
snprintf(fname, sizeof(fname), "%s_%d_%s.ll", init ? "init" : "ir", pnum, sanitize(std::string(pname)).c_str());
|
|
llvm::raw_fd_ostream OS(fname, EC, llvm::sys::fs::F_None);
|
|
Assert(!EC && "IR dump file creation failed!");
|
|
module.print(OS, 0);
|
|
}
|
|
|
|
bool DebugPassFile::runOnModule(llvm::Module &module) {
|
|
run(module, false);
|
|
return true;
|
|
}
|
|
|
|
bool DebugPassFile::doInitialization(llvm::Module &module) {
|
|
run(module, true);
|
|
return true;
|
|
}
|
|
|
|
static llvm::Pass *CreateDebugPassFile(int number, llvm::StringRef name) { return new DebugPassFile(number, name); }
|
|
#endif
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// MakeInternalFuncsStaticPass
|
|
|
|
/** There are a number of target-specific functions that we use during
|
|
these optimization passes. By the time we are done with optimization,
|
|
any uses of these should be inlined and no calls to these functions
|
|
should remain. This pass marks all of these functions as having
|
|
private linkage so that subsequent passes can eliminate them as dead
|
|
code, thus cleaning up the final code output by the compiler. We can't
|
|
just declare these as static from the start, however, since then they
|
|
end up being eliminated as dead code during early optimization passes
|
|
even though we may need to generate calls to them during later
|
|
optimization passes.
|
|
*/
|
|
class MakeInternalFuncsStaticPass : public llvm::ModulePass {
|
|
public:
|
|
static char ID;
|
|
MakeInternalFuncsStaticPass(bool last = false) : ModulePass(ID) {}
|
|
|
|
void getAnalysisUsage(llvm::AnalysisUsage &AU) const { AU.setPreservesCFG(); }
|
|
|
|
llvm::StringRef getPassName() const { return "Make internal funcs \"static\""; }
|
|
bool runOnModule(llvm::Module &m);
|
|
};
|
|
|
|
char MakeInternalFuncsStaticPass::ID = 0;
|
|
|
|
bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
|
|
const char *names[] = {
|
|
"__avg_up_uint8",
|
|
"__avg_up_int8",
|
|
"__avg_up_uint16",
|
|
"__avg_up_int16",
|
|
"__avg_down_uint8",
|
|
"__avg_down_int8",
|
|
"__avg_down_uint16",
|
|
"__avg_down_int16",
|
|
"__fast_masked_vload",
|
|
"__gather_factored_base_offsets32_i8",
|
|
"__gather_factored_base_offsets32_i16",
|
|
"__gather_factored_base_offsets32_i32",
|
|
"__gather_factored_base_offsets32_i64",
|
|
"__gather_factored_base_offsets32_float",
|
|
"__gather_factored_base_offsets32_double",
|
|
"__gather_factored_base_offsets64_i8",
|
|
"__gather_factored_base_offsets64_i16",
|
|
"__gather_factored_base_offsets64_i32",
|
|
"__gather_factored_base_offsets64_i64",
|
|
"__gather_factored_base_offsets64_float",
|
|
"__gather_factored_base_offsets64_double",
|
|
"__gather_base_offsets32_i8",
|
|
"__gather_base_offsets32_i16",
|
|
"__gather_base_offsets32_i32",
|
|
"__gather_base_offsets32_i64",
|
|
"__gather_base_offsets32_float",
|
|
"__gather_base_offsets32_double",
|
|
"__gather_base_offsets64_i8",
|
|
"__gather_base_offsets64_i16",
|
|
"__gather_base_offsets64_i32",
|
|
"__gather_base_offsets64_i64",
|
|
"__gather_base_offsets64_float",
|
|
"__gather_base_offsets64_double",
|
|
"__gather32_i8",
|
|
"__gather32_i16",
|
|
"__gather32_i32",
|
|
"__gather32_i64",
|
|
"__gather32_float",
|
|
"__gather32_double",
|
|
"__gather64_i8",
|
|
"__gather64_i16",
|
|
"__gather64_i32",
|
|
"__gather64_i64",
|
|
"__gather64_float",
|
|
"__gather64_double",
|
|
"__gather_elt32_i8",
|
|
"__gather_elt32_i16",
|
|
"__gather_elt32_i32",
|
|
"__gather_elt32_i64",
|
|
"__gather_elt32_float",
|
|
"__gather_elt32_double",
|
|
"__gather_elt64_i8",
|
|
"__gather_elt64_i16",
|
|
"__gather_elt64_i32",
|
|
"__gather_elt64_i64",
|
|
"__gather_elt64_float",
|
|
"__gather_elt64_double",
|
|
"__masked_load_i8",
|
|
"__masked_load_i16",
|
|
"__masked_load_i32",
|
|
"__masked_load_i64",
|
|
"__masked_load_float",
|
|
"__masked_load_double",
|
|
"__masked_store_i8",
|
|
"__masked_store_i16",
|
|
"__masked_store_i32",
|
|
"__masked_store_i64",
|
|
"__masked_store_float",
|
|
"__masked_store_double",
|
|
"__masked_store_blend_i8",
|
|
"__masked_store_blend_i16",
|
|
"__masked_store_blend_i32",
|
|
"__masked_store_blend_i64",
|
|
"__masked_store_blend_float",
|
|
"__masked_store_blend_double",
|
|
"__scatter_factored_base_offsets32_i8",
|
|
"__scatter_factored_base_offsets32_i16",
|
|
"__scatter_factored_base_offsets32_i32",
|
|
"__scatter_factored_base_offsets32_i64",
|
|
"__scatter_factored_base_offsets32_float",
|
|
"__scatter_factored_base_offsets32_double",
|
|
"__scatter_factored_base_offsets64_i8",
|
|
"__scatter_factored_base_offsets64_i16",
|
|
"__scatter_factored_base_offsets64_i32",
|
|
"__scatter_factored_base_offsets64_i64",
|
|
"__scatter_factored_base_offsets64_float",
|
|
"__scatter_factored_base_offsets64_double",
|
|
"__scatter_base_offsets32_i8",
|
|
"__scatter_base_offsets32_i16",
|
|
"__scatter_base_offsets32_i32",
|
|
"__scatter_base_offsets32_i64",
|
|
"__scatter_base_offsets32_float",
|
|
"__scatter_base_offsets32_double",
|
|
"__scatter_base_offsets64_i8",
|
|
"__scatter_base_offsets64_i16",
|
|
"__scatter_base_offsets64_i32",
|
|
"__scatter_base_offsets64_i64",
|
|
"__scatter_base_offsets64_float",
|
|
"__scatter_base_offsets64_double",
|
|
"__scatter_elt32_i8",
|
|
"__scatter_elt32_i16",
|
|
"__scatter_elt32_i32",
|
|
"__scatter_elt32_i64",
|
|
"__scatter_elt32_float",
|
|
"__scatter_elt32_double",
|
|
"__scatter_elt64_i8",
|
|
"__scatter_elt64_i16",
|
|
"__scatter_elt64_i32",
|
|
"__scatter_elt64_i64",
|
|
"__scatter_elt64_float",
|
|
"__scatter_elt64_double",
|
|
"__scatter32_i8",
|
|
"__scatter32_i16",
|
|
"__scatter32_i32",
|
|
"__scatter32_i64",
|
|
"__scatter32_float",
|
|
"__scatter32_double",
|
|
"__scatter64_i8",
|
|
"__scatter64_i16",
|
|
"__scatter64_i32",
|
|
"__scatter64_i64",
|
|
"__scatter64_float",
|
|
"__scatter64_double",
|
|
"__prefetch_read_varying_1",
|
|
"__prefetch_read_varying_2",
|
|
"__prefetch_read_varying_3",
|
|
"__prefetch_read_varying_nt",
|
|
"__keep_funcs_live",
|
|
#ifdef ISPC_GENX_ENABLED
|
|
"__masked_load_blend_i8",
|
|
"__masked_load_blend_i16",
|
|
"__masked_load_blend_i32",
|
|
"__masked_load_blend_i64",
|
|
"__masked_load_blend_float",
|
|
"__masked_load_blend_double",
|
|
#endif
|
|
};
|
|
|
|
bool modifiedAny = false;
|
|
int count = sizeof(names) / sizeof(names[0]);
|
|
for (int i = 0; i < count; ++i) {
|
|
llvm::Function *f = m->module->getFunction(names[i]);
|
|
if (f != NULL && f->empty() == false) {
|
|
f->setLinkage(llvm::GlobalValue::InternalLinkage);
|
|
modifiedAny = true;
|
|
}
|
|
}
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateMakeInternalFuncsStaticPass() { return new MakeInternalFuncsStaticPass; }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// PeepholePass
|
|
|
|
class PeepholePass : public llvm::FunctionPass {
|
|
public:
|
|
PeepholePass();
|
|
|
|
llvm::StringRef getPassName() const { return "Peephole Optimizations"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
bool runOnFunction(llvm::Function &F);
|
|
|
|
static char ID;
|
|
};
|
|
|
|
char PeepholePass::ID = 0;
|
|
|
|
PeepholePass::PeepholePass() : FunctionPass(ID) {}
|
|
|
|
using namespace llvm::PatternMatch;
|
|
|
|
template <typename Op_t, unsigned Opcode> struct CastClassTypes_match {
|
|
Op_t Op;
|
|
const llvm::Type *fromType, *toType;
|
|
|
|
CastClassTypes_match(const Op_t &OpMatch, const llvm::Type *f, const llvm::Type *t)
|
|
: Op(OpMatch), fromType(f), toType(t) {}
|
|
|
|
template <typename OpTy> bool match(OpTy *V) {
|
|
if (llvm::Operator *O = llvm::dyn_cast<llvm::Operator>(V))
|
|
return (O->getOpcode() == Opcode && Op.match(O->getOperand(0)) && O->getType() == toType &&
|
|
O->getOperand(0)->getType() == fromType);
|
|
return false;
|
|
}
|
|
};
|
|
|
|
template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::SExt> m_SExt8To16(const OpTy &Op) {
|
|
return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(Op, LLVMTypes::Int8VectorType,
|
|
LLVMTypes::Int16VectorType);
|
|
}
|
|
|
|
template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt> m_ZExt8To16(const OpTy &Op) {
|
|
return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(Op, LLVMTypes::Int8VectorType,
|
|
LLVMTypes::Int16VectorType);
|
|
}
|
|
|
|
template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc> m_Trunc16To8(const OpTy &Op) {
|
|
return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(Op, LLVMTypes::Int16VectorType,
|
|
LLVMTypes::Int8VectorType);
|
|
}
|
|
|
|
template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::SExt> m_SExt16To32(const OpTy &Op) {
|
|
return CastClassTypes_match<OpTy, llvm::Instruction::SExt>(Op, LLVMTypes::Int16VectorType,
|
|
LLVMTypes::Int32VectorType);
|
|
}
|
|
|
|
template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::ZExt> m_ZExt16To32(const OpTy &Op) {
|
|
return CastClassTypes_match<OpTy, llvm::Instruction::ZExt>(Op, LLVMTypes::Int16VectorType,
|
|
LLVMTypes::Int32VectorType);
|
|
}
|
|
|
|
template <typename OpTy> inline CastClassTypes_match<OpTy, llvm::Instruction::Trunc> m_Trunc32To16(const OpTy &Op) {
|
|
return CastClassTypes_match<OpTy, llvm::Instruction::Trunc>(Op, LLVMTypes::Int32VectorType,
|
|
LLVMTypes::Int16VectorType);
|
|
}
|
|
|
|
template <typename Op_t> struct UDiv2_match {
|
|
Op_t Op;
|
|
|
|
UDiv2_match(const Op_t &OpMatch) : Op(OpMatch) {}
|
|
|
|
template <typename OpTy> bool match(OpTy *V) {
|
|
llvm::BinaryOperator *bop;
|
|
llvm::ConstantDataVector *cdv;
|
|
if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
|
|
(cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) && cdv->getSplatValue() != NULL) {
|
|
const llvm::APInt &apInt = cdv->getUniqueInteger();
|
|
|
|
switch (bop->getOpcode()) {
|
|
case llvm::Instruction::UDiv:
|
|
// divide by 2
|
|
return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
|
|
case llvm::Instruction::LShr:
|
|
// shift left by 1
|
|
return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
};
|
|
|
|
template <typename V> inline UDiv2_match<V> m_UDiv2(const V &v) { return UDiv2_match<V>(v); }
|
|
|
|
template <typename Op_t> struct SDiv2_match {
|
|
Op_t Op;
|
|
|
|
SDiv2_match(const Op_t &OpMatch) : Op(OpMatch) {}
|
|
|
|
template <typename OpTy> bool match(OpTy *V) {
|
|
llvm::BinaryOperator *bop;
|
|
llvm::ConstantDataVector *cdv;
|
|
if ((bop = llvm::dyn_cast<llvm::BinaryOperator>(V)) &&
|
|
(cdv = llvm::dyn_cast<llvm::ConstantDataVector>(bop->getOperand(1))) && cdv->getSplatValue() != NULL) {
|
|
const llvm::APInt &apInt = cdv->getUniqueInteger();
|
|
|
|
switch (bop->getOpcode()) {
|
|
case llvm::Instruction::SDiv:
|
|
// divide by 2
|
|
return (apInt.isIntN(2) && Op.match(bop->getOperand(0)));
|
|
case llvm::Instruction::AShr:
|
|
// shift left by 1
|
|
return (apInt.isIntN(1) && Op.match(bop->getOperand(0)));
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
};
|
|
|
|
template <typename V> inline SDiv2_match<V> m_SDiv2(const V &v) { return SDiv2_match<V>(v); }
|
|
|
|
// Returns true if the given function has a call to an intrinsic function
|
|
// in its definition.
|
|
static bool lHasIntrinsicInDefinition(llvm::Function *func) {
|
|
llvm::Function::iterator bbiter = func->begin();
|
|
for (; bbiter != func->end(); ++bbiter) {
|
|
for (llvm::BasicBlock::iterator institer = bbiter->begin(); institer != bbiter->end(); ++institer) {
|
|
if (llvm::isa<llvm::IntrinsicInst>(institer))
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static llvm::Instruction *lGetBinaryIntrinsic(const char *name, llvm::Value *opa, llvm::Value *opb) {
|
|
llvm::Function *func = m->module->getFunction(name);
|
|
Assert(func != NULL);
|
|
|
|
// Make sure that the definition of the llvm::Function has a call to an
|
|
// intrinsic function in its instructions; otherwise we will generate
|
|
// infinite loops where we "helpfully" turn the default implementations
|
|
// of target builtins like __avg_up_uint8 that are implemented with plain
|
|
// arithmetic ops into recursive calls to themselves.
|
|
if (lHasIntrinsicInDefinition(func))
|
|
return lCallInst(func, opa, opb, name);
|
|
else
|
|
return NULL;
|
|
}
|
|
|
|
//////////////////////////////////////////////////
|
|
|
|
static llvm::Instruction *lMatchAvgUpUInt8(llvm::Value *inst) {
|
|
// (unsigned int8)(((unsigned int16)a + (unsigned int16)b + 1)/2)
|
|
llvm::Value *opa, *opb;
|
|
const llvm::APInt *delta;
|
|
if (match(inst, m_Trunc16To8(m_UDiv2(m_CombineOr(
|
|
m_CombineOr(m_Add(m_ZExt8To16(m_Value(opa)), m_Add(m_ZExt8To16(m_Value(opb)), m_APInt(delta))),
|
|
m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_APInt(delta)), m_ZExt8To16(m_Value(opb)))),
|
|
m_Add(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))), m_APInt(delta))))))) {
|
|
if (delta->isIntN(1) == false)
|
|
return NULL;
|
|
|
|
return lGetBinaryIntrinsic("__avg_up_uint8", opa, opb);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static llvm::Instruction *lMatchAvgDownUInt8(llvm::Value *inst) {
|
|
// (unsigned int8)(((unsigned int16)a + (unsigned int16)b)/2)
|
|
llvm::Value *opa, *opb;
|
|
if (match(inst, m_Trunc16To8(m_UDiv2(m_Add(m_ZExt8To16(m_Value(opa)), m_ZExt8To16(m_Value(opb))))))) {
|
|
return lGetBinaryIntrinsic("__avg_down_uint8", opa, opb);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static llvm::Instruction *lMatchAvgUpUInt16(llvm::Value *inst) {
|
|
// (unsigned int16)(((unsigned int32)a + (unsigned int32)b + 1)/2)
|
|
llvm::Value *opa, *opb;
|
|
const llvm::APInt *delta;
|
|
if (match(inst,
|
|
m_Trunc32To16(m_UDiv2(m_CombineOr(
|
|
m_CombineOr(m_Add(m_ZExt16To32(m_Value(opa)), m_Add(m_ZExt16To32(m_Value(opb)), m_APInt(delta))),
|
|
m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_APInt(delta)), m_ZExt16To32(m_Value(opb)))),
|
|
m_Add(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))), m_APInt(delta))))))) {
|
|
if (delta->isIntN(1) == false)
|
|
return NULL;
|
|
|
|
return lGetBinaryIntrinsic("__avg_up_uint16", opa, opb);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static llvm::Instruction *lMatchAvgDownUInt16(llvm::Value *inst) {
|
|
// (unsigned int16)(((unsigned int32)a + (unsigned int32)b)/2)
|
|
llvm::Value *opa, *opb;
|
|
if (match(inst, m_Trunc32To16(m_UDiv2(m_Add(m_ZExt16To32(m_Value(opa)), m_ZExt16To32(m_Value(opb))))))) {
|
|
return lGetBinaryIntrinsic("__avg_down_uint16", opa, opb);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static llvm::Instruction *lMatchAvgUpInt8(llvm::Value *inst) {
|
|
// (int8)(((int16)a + (int16)b + 1)/2)
|
|
llvm::Value *opa, *opb;
|
|
const llvm::APInt *delta;
|
|
if (match(inst, m_Trunc16To8(m_SDiv2(m_CombineOr(
|
|
m_CombineOr(m_Add(m_SExt8To16(m_Value(opa)), m_Add(m_SExt8To16(m_Value(opb)), m_APInt(delta))),
|
|
m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_APInt(delta)), m_SExt8To16(m_Value(opb)))),
|
|
m_Add(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))), m_APInt(delta))))))) {
|
|
if (delta->isIntN(1) == false)
|
|
return NULL;
|
|
|
|
return lGetBinaryIntrinsic("__avg_up_int8", opa, opb);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static llvm::Instruction *lMatchAvgDownInt8(llvm::Value *inst) {
|
|
// (int8)(((int16)a + (int16)b)/2)
|
|
llvm::Value *opa, *opb;
|
|
if (match(inst, m_Trunc16To8(m_SDiv2(m_Add(m_SExt8To16(m_Value(opa)), m_SExt8To16(m_Value(opb))))))) {
|
|
return lGetBinaryIntrinsic("__avg_down_int8", opa, opb);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static llvm::Instruction *lMatchAvgUpInt16(llvm::Value *inst) {
|
|
// (int16)(((int32)a + (int32)b + 1)/2)
|
|
llvm::Value *opa, *opb;
|
|
const llvm::APInt *delta;
|
|
if (match(inst,
|
|
m_Trunc32To16(m_SDiv2(m_CombineOr(
|
|
m_CombineOr(m_Add(m_SExt16To32(m_Value(opa)), m_Add(m_SExt16To32(m_Value(opb)), m_APInt(delta))),
|
|
m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_APInt(delta)), m_SExt16To32(m_Value(opb)))),
|
|
m_Add(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))), m_APInt(delta))))))) {
|
|
if (delta->isIntN(1) == false)
|
|
return NULL;
|
|
|
|
return lGetBinaryIntrinsic("__avg_up_int16", opa, opb);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
static llvm::Instruction *lMatchAvgDownInt16(llvm::Value *inst) {
|
|
// (int16)(((int32)a + (int32)b)/2)
|
|
llvm::Value *opa, *opb;
|
|
if (match(inst, m_Trunc32To16(m_SDiv2(m_Add(m_SExt16To32(m_Value(opa)), m_SExt16To32(m_Value(opb))))))) {
|
|
return lGetBinaryIntrinsic("__avg_down_int16", opa, opb);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
bool PeepholePass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("PeepholePass");
|
|
|
|
bool modifiedAny = false;
|
|
restart:
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
llvm::Instruction *inst = &*iter;
|
|
|
|
llvm::Instruction *builtinCall = lMatchAvgUpUInt8(inst);
|
|
if (!builtinCall)
|
|
builtinCall = lMatchAvgUpUInt16(inst);
|
|
if (!builtinCall)
|
|
builtinCall = lMatchAvgDownUInt8(inst);
|
|
if (!builtinCall)
|
|
builtinCall = lMatchAvgDownUInt16(inst);
|
|
if (!builtinCall)
|
|
builtinCall = lMatchAvgUpInt8(inst);
|
|
if (!builtinCall)
|
|
builtinCall = lMatchAvgUpInt16(inst);
|
|
if (!builtinCall)
|
|
builtinCall = lMatchAvgDownInt8(inst);
|
|
if (!builtinCall)
|
|
builtinCall = lMatchAvgDownInt16(inst);
|
|
if (builtinCall != NULL) {
|
|
llvm::ReplaceInstWithInst(inst, builtinCall);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
|
|
DEBUG_END_PASS("PeepholePass");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool PeepholePass::runOnFunction(llvm::Function &F) {
|
|
|
|
llvm::TimeTraceScope FuncScope("PeepholePass::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreatePeepholePass() { return new PeepholePass; }
|
|
|
|
/** Given an llvm::Value known to be an integer, return its value as
|
|
an int64_t.
|
|
*/
|
|
static int64_t lGetIntValue(llvm::Value *offset) {
|
|
llvm::ConstantInt *intOffset = llvm::dyn_cast<llvm::ConstantInt>(offset);
|
|
Assert(intOffset && (intOffset->getBitWidth() == 32 || intOffset->getBitWidth() == 64));
|
|
return intOffset->getSExtValue();
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// ReplaceStdlibShiftPass
|
|
|
|
class ReplaceStdlibShiftPass : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
ReplaceStdlibShiftPass() : FunctionPass(ID) {}
|
|
|
|
llvm::StringRef getPassName() const { return "Resolve \"replace extract insert chains\""; }
|
|
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
|
|
bool runOnFunction(llvm::Function &F);
|
|
};
|
|
|
|
char ReplaceStdlibShiftPass::ID = 0;
|
|
|
|
// This pass replaces shift() with ShuffleVector when the offset is a constant.
|
|
// rotate() which is similar in functionality has a slightly different
|
|
// implementation. This is due to LLVM(createInstructionCombiningPass)
|
|
// optimizing rotate() implementation better when similar implementations
|
|
// are used for both. This is a hack to produce similarly optimized code for
|
|
// shift.
|
|
bool ReplaceStdlibShiftPass::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("ReplaceStdlibShiftPass");
|
|
bool modifiedAny = false;
|
|
|
|
llvm::Function *shifts[6];
|
|
shifts[0] = m->module->getFunction("shift___vytuni");
|
|
shifts[1] = m->module->getFunction("shift___vysuni");
|
|
shifts[2] = m->module->getFunction("shift___vyiuni");
|
|
shifts[3] = m->module->getFunction("shift___vyIuni");
|
|
shifts[4] = m->module->getFunction("shift___vyfuni");
|
|
shifts[5] = m->module->getFunction("shift___vyduni");
|
|
|
|
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
|
|
llvm::Instruction *inst = &*iter;
|
|
|
|
if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
|
llvm::Function *func = ci->getCalledFunction();
|
|
for (int i = 0; i < 6; i++) {
|
|
if (shifts[i] && (shifts[i] == func)) {
|
|
// we matched a call
|
|
llvm::Value *shiftedVec = ci->getArgOperand(0);
|
|
llvm::Value *shiftAmt = ci->getArgOperand(1);
|
|
if (llvm::isa<llvm::Constant>(shiftAmt)) {
|
|
int vectorWidth = g->target->getVectorWidth();
|
|
int *shuffleVals = new int[vectorWidth];
|
|
int shiftInt = lGetIntValue(shiftAmt);
|
|
for (int i = 0; i < vectorWidth; i++) {
|
|
int s = i + shiftInt;
|
|
s = (s < 0) ? vectorWidth : s;
|
|
s = (s >= vectorWidth) ? vectorWidth : s;
|
|
shuffleVals[i] = s;
|
|
}
|
|
llvm::Value *shuffleIdxs = LLVMInt32Vector(shuffleVals);
|
|
llvm::Value *zeroVec = llvm::ConstantAggregateZero::get(shiftedVec->getType());
|
|
llvm::Value *shuffle =
|
|
new llvm::ShuffleVectorInst(shiftedVec, zeroVec, shuffleIdxs, "vecShift", ci);
|
|
ci->replaceAllUsesWith(shuffle);
|
|
modifiedAny = true;
|
|
delete[] shuffleVals;
|
|
} else if (g->opt.level > 0) {
|
|
PerformanceWarning(SourcePos(), "Stdlib shift() called without constant shift amount.");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
DEBUG_END_PASS("ReplaceStdlibShiftPass");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool ReplaceStdlibShiftPass::runOnFunction(llvm::Function &F) {
|
|
|
|
llvm::TimeTraceScope FuncScope("ReplaceStdlibShiftPass::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateReplaceStdlibShiftPass() { return new ReplaceStdlibShiftPass(); }
|
|
|
|
#ifdef ISPC_GENX_ENABLED
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// GenXGatherCoalescingPass
|
|
|
|
/** This pass performs gather coalescing for GenX target.
|
|
*/
|
|
class GenXGatherCoalescing : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
GenXGatherCoalescing() : FunctionPass(ID) {}
|
|
|
|
llvm::StringRef getPassName() const { return "GenX Gather Coalescing"; }
|
|
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
|
|
bool runOnFunction(llvm::Function &Fn);
|
|
};
|
|
|
|
char GenXGatherCoalescing::ID = 0;
|
|
|
|
// Returns pointer to pseudo_gather CallInst if inst is
|
|
// actually a pseudo_gather
|
|
static llvm::CallInst *lGetPseudoGather(llvm::Instruction *inst) {
|
|
if (auto CI = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
|
if (CI->getCalledFunction()->getName().startswith("__pseudo_gather_base_offsets"))
|
|
return CI;
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
// Returns type of GEP users: nullptr in case when type is
|
|
// different.
|
|
//
|
|
// TODO: currently only few instructions are handled:
|
|
// - load (scalar, see TODO below)
|
|
// - bitcast (checking it users)
|
|
// - pseudo_gather: see lGetPseudoGather
|
|
static llvm::Type *lGetLoadsType(llvm::Instruction *inst) {
|
|
llvm::Type *ty = nullptr;
|
|
|
|
// Bad user was provided
|
|
if (!inst)
|
|
return nullptr;
|
|
|
|
for (auto ui = inst->use_begin(), ue = inst->use_end(); ui != ue; ++ui) {
|
|
|
|
llvm::Instruction *user = llvm::dyn_cast<llvm::Instruction>(ui->getUser());
|
|
llvm::Type *curr_type = nullptr;
|
|
|
|
// Bitcast appeared, go through it
|
|
if (auto BCI = llvm::dyn_cast<llvm::BitCastInst>(user)) {
|
|
if (!(curr_type = lGetLoadsType(BCI)))
|
|
return nullptr;
|
|
} else {
|
|
// Load is OK if it is scalar
|
|
// TODO: perform full investigation on situation
|
|
// with non-scalar loads
|
|
if (auto LI = llvm::dyn_cast<llvm::LoadInst>(user)) {
|
|
if (LI->getType()->isVectorTy())
|
|
return nullptr;
|
|
curr_type = LI->getType();
|
|
} else if (lGetPseudoGather(user)) {
|
|
curr_type = user->getType();
|
|
} else
|
|
continue;
|
|
}
|
|
|
|
if (!curr_type || (ty && ty != curr_type))
|
|
return nullptr;
|
|
|
|
ty = curr_type;
|
|
}
|
|
|
|
return ty;
|
|
}
|
|
|
|
// Struct that contains info about memory users
|
|
struct PtrUse {
|
|
llvm::Instruction *user; // Ptr
|
|
|
|
std::vector<int64_t> idxs; // Users a presented in this list like it was Ptr[idx]
|
|
// TODO: current implementation depends on traverse order:
|
|
// see lCreateBlockLDUse and lCollectIdxs
|
|
|
|
llvm::Type *type; // Currently loads are grouped by types.
|
|
// TODO: it can be optimized
|
|
|
|
int64_t &getScalarIdx() { return idxs[0]; }
|
|
};
|
|
|
|
// Create users for newly created block ld
|
|
// TODO: current implementation depends on traverse order: see lCollectIdxs
|
|
static void lCreateBlockLDUse(llvm::Instruction *currInst, std::vector<llvm::ExtractElementInst *> EEIs, PtrUse &ptrUse,
|
|
int &curr_idx, std::vector<llvm::Instruction *> &dead) {
|
|
for (auto ui = currInst->use_begin(), ue = currInst->use_end(); ui != ue; ++ui) {
|
|
if (auto BCI = llvm::dyn_cast<llvm::BitCastInst>(ui->getUser())) {
|
|
// Go through bitcast
|
|
lCreateBlockLDUse(BCI, EEIs, ptrUse, curr_idx, dead);
|
|
} else if (auto LI = llvm::dyn_cast<llvm::LoadInst>(ui->getUser())) {
|
|
// Only scalar loads are supported now
|
|
llvm::ExtractElementInst *EEI = EEIs[ptrUse.idxs[curr_idx++]];
|
|
LI->replaceAllUsesWith(EEI);
|
|
lCopyMetadata(EEI, LI);
|
|
dead.push_back(LI);
|
|
} else if (auto gather = lGetPseudoGather(llvm::dyn_cast<llvm::Instruction>(ui->getUser()))) {
|
|
// Collect idxs from gather and fix users
|
|
llvm::Value *res = llvm::UndefValue::get(gather->getType());
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
for (unsigned i = 0, end = llvm::dyn_cast<llvm::FixedVectorType>(res->getType())->getNumElements(); i < end;
|
|
++i)
|
|
#else
|
|
for (unsigned i = 0, end = llvm::dyn_cast<llvm::VectorType>(res->getType())->getNumElements(); i < end; ++i)
|
|
#endif
|
|
{
|
|
// Get element via IEI
|
|
res = llvm::InsertElementInst::Create(res, EEIs[ptrUse.idxs[curr_idx++]],
|
|
llvm::ConstantInt::get(LLVMTypes::Int32Type, i),
|
|
"coalesced_gather_iei", gather);
|
|
}
|
|
gather->replaceAllUsesWith(res);
|
|
lCopyMetadata(res, gather);
|
|
dead.push_back(gather);
|
|
} else {
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Perform optimization from GEPs
|
|
static bool lVectorizeGEPs(llvm::Value *ptr, std::vector<PtrUse> &ptrUses, std::vector<llvm::Instruction *> &dead) {
|
|
// Calculate memory width for GEPs
|
|
int64_t min_idx = INT64_MAX;
|
|
int64_t max_idx = INT64_MIN;
|
|
for (auto elem : ptrUses) {
|
|
for (auto i : elem.idxs) {
|
|
int64_t idx = i;
|
|
max_idx = std::max(idx, max_idx);
|
|
min_idx = std::min(idx, min_idx);
|
|
}
|
|
}
|
|
llvm::Type *type = ptrUses[0].type;
|
|
llvm::Type *scalar_type = type;
|
|
llvm::Instruction *insertBefore = ptrUses[0].user;
|
|
|
|
// Calculate element size in bytes
|
|
uint64_t t_size = type->getPrimitiveSizeInBits() >> 3;
|
|
|
|
// Adjust values for vector load
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
if (auto vecTy = llvm::dyn_cast<llvm::FixedVectorType>(type))
|
|
#else
|
|
if (auto vecTy = llvm::dyn_cast<llvm::VectorType>(type))
|
|
#endif
|
|
{
|
|
// Get single element size
|
|
t_size /= vecTy->getNumElements();
|
|
// Get single element type
|
|
scalar_type = vecTy->getScalarType();
|
|
}
|
|
|
|
// Pointer load should be done via inttoptr: replace type
|
|
bool loadingPtr = false;
|
|
llvm::Type *originalType = scalar_type;
|
|
if (auto pTy = llvm::dyn_cast<llvm::PointerType>(scalar_type)) {
|
|
scalar_type = g->target->is32Bit() ? LLVMTypes::Int32Type : LLVMTypes::Int64Type;
|
|
t_size = scalar_type->getPrimitiveSizeInBits() >> 3;
|
|
loadingPtr = true;
|
|
}
|
|
|
|
// Calculate length of array that needs to be loaded.
|
|
// Idxs are in bytes now.
|
|
uint64_t data_size = max_idx - min_idx + t_size;
|
|
|
|
unsigned loads_needed = 1;
|
|
uint64_t reqSize = 1;
|
|
|
|
// Required load size. It can be 1, 2, 4, 8 OWORDs
|
|
while (reqSize < data_size)
|
|
reqSize <<= 1;
|
|
|
|
// Adjust number of loads.
|
|
// TODO: it is not clear if we should adjust
|
|
// load size here
|
|
if (reqSize > 8 * OWORD) {
|
|
loads_needed = reqSize / (8 * OWORD);
|
|
reqSize = 8 * OWORD;
|
|
}
|
|
|
|
// TODO: it is not clear if we should skip it or not
|
|
if (reqSize < OWORD) {
|
|
// Skip it for now
|
|
return false;
|
|
}
|
|
|
|
// Coalesce loads/gathers
|
|
std::vector<llvm::ExtractElementInst *> EEIs;
|
|
for (unsigned i = 0; i < loads_needed; ++i) {
|
|
llvm::Constant *offset = llvm::ConstantInt::get(LLVMTypes::Int64Type, min_idx + i * reqSize);
|
|
llvm::PtrToIntInst *ptrToInt =
|
|
new llvm::PtrToIntInst(ptr, LLVMTypes::Int64Type, "vectorized_ptrtoint", insertBefore);
|
|
llvm::Instruction *addr = llvm::BinaryOperator::CreateAdd(ptrToInt, offset, "vectorized_address", insertBefore);
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
llvm::Type *retType = llvm::FixedVectorType::get(scalar_type, reqSize / t_size);
|
|
#else
|
|
llvm::Type *retType = llvm::VectorType::get(scalar_type, reqSize / t_size);
|
|
#endif
|
|
llvm::Function *fn = llvm::GenXIntrinsic::getGenXDeclaration(
|
|
m->module, llvm::GenXIntrinsic::genx_svm_block_ld_unaligned, {retType, addr->getType()});
|
|
llvm::Instruction *ld = llvm::CallInst::Create(fn, {addr}, "vectorized_ld", insertBefore);
|
|
|
|
if (loadingPtr) {
|
|
// Cast int to ptr via inttoptr
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
ld = new llvm::IntToPtrInst(ld, llvm::FixedVectorType::get(originalType, reqSize / t_size),
|
|
"vectorized_inttoptr", insertBefore);
|
|
#else
|
|
ld = new llvm::IntToPtrInst(ld, llvm::VectorType::get(originalType, reqSize / t_size),
|
|
"vectorized_inttoptr", insertBefore);
|
|
#endif
|
|
}
|
|
|
|
// Scalar extracts for all loaded elements
|
|
for (unsigned j = 0; j < reqSize / t_size; ++j) {
|
|
auto EEI = llvm::ExtractElementInst::Create(ld, llvm::ConstantInt::get(LLVMTypes::Int64Type, j),
|
|
"vectorized_extr_elem", ld->getParent());
|
|
EEIs.push_back(EEI);
|
|
EEI->moveAfter(ld);
|
|
}
|
|
}
|
|
|
|
// Change value in users and delete redundant insts
|
|
for (auto elem : ptrUses) {
|
|
// Recalculate idx: we are going to use extractelement
|
|
for (auto &index : elem.idxs) {
|
|
index -= min_idx;
|
|
index /= t_size;
|
|
}
|
|
int curr_idx = 0;
|
|
lCreateBlockLDUse(elem.user, EEIs, elem, curr_idx, dead);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// Get idxs from current instruction users
|
|
// TODO: current implementation depends on traverse order: see lCreateBlockLDUse
|
|
static void lCollectIdxs(llvm::Instruction *inst, std::vector<int64_t> &idxs, int64_t original_idx) {
|
|
for (auto ui = inst->use_begin(), ue = inst->use_end(); ui != ue; ++ui) {
|
|
|
|
llvm::Instruction *user = llvm::dyn_cast<llvm::Instruction>(ui->getUser());
|
|
|
|
if (auto BCI = llvm::dyn_cast<llvm::BitCastInst>(user)) {
|
|
// Bitcast appeared, go through it
|
|
lCollectIdxs(BCI, idxs, original_idx);
|
|
} else {
|
|
if (auto LI = llvm::dyn_cast<llvm::LoadInst>(user)) {
|
|
// Only scalar loads are supported now. It should be checked earlier.
|
|
Assert(!LI->getType()->isVectorTy());
|
|
idxs.push_back(original_idx);
|
|
} else if (auto gather = lGetPseudoGather(user)) {
|
|
// Collect gather's idxs
|
|
auto scale_c = llvm::dyn_cast<llvm::ConstantInt>(gather->getOperand(1));
|
|
int64_t scale = scale_c->getSExtValue();
|
|
auto offset = llvm::dyn_cast<llvm::ConstantDataVector>(gather->getOperand(2));
|
|
|
|
for (unsigned i = 0, size = offset->getType()->getNumElements(); i < size; ++i) {
|
|
int64_t curr_offset =
|
|
llvm::dyn_cast<llvm::ConstantInt>(offset->getElementAsConstant(i))->getSExtValue() * scale +
|
|
original_idx;
|
|
idxs.push_back(curr_offset);
|
|
}
|
|
} else {
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update GEP index in case of nested GEP interaction
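// Illustrative example:
//   %bypass = getelementptr i8, i8* %base, i64 0
//   %gep    = getelementptr i8, i8* %bypass, i64 %idx
// The zero-index %bypass is recognized below and %gep is rewritten to address %base directly.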
|
|
static void lSetRealPtr(llvm::GetElementPtrInst *GEP) {
|
|
llvm::GetElementPtrInst *predGEP = llvm::dyn_cast<llvm::GetElementPtrInst>(GEP->getPointerOperand());
|
|
// The deepest GEP
|
|
if (!predGEP)
|
|
return;
|
|
// Pred is a bypass
|
|
if (predGEP->hasAllConstantIndices() && predGEP->getNumOperands() == 2 &&
|
|
llvm::dyn_cast<llvm::Constant>(predGEP->getOperand(1))->isNullValue()) {
|
|
// Go through several bypasses
|
|
lSetRealPtr(predGEP);
|
|
GEP->setOperand(0, predGEP->getPointerOperand());
|
|
}
|
|
}
|
|
|
|
static bool lSkipInst(llvm::Instruction *inst) {
|
|
// TODO: this variable should have different
|
|
// value for scatter coalescing
|
|
unsigned min_operands = 1;
|
|
|
|
// Skip phis
|
|
if (llvm::isa<llvm::PHINode>(inst))
|
|
return true;
|
|
// Skip obviously wrong instruction
|
|
if (inst->getNumOperands() < min_operands)
|
|
return true;
|
|
// Skip GEPs: handle its users instead
|
|
if (llvm::isa<llvm::GetElementPtrInst>(inst))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
// Prepare GEPs for further optimization:
|
|
// - Simplify complex GEPs
|
|
// - Copy GEPs that are located in other BBs
|
|
// - Resolve possible duplications in current BB
|
|
// TODO: it looks possible to find common pointers in
|
|
// some cases. Not implemented now.
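// Rough sketch of the lowering performed below: a multi-index GEP whose first index is not a
// constant is split into
//   %lowered_gep      = getelementptr ... %ptr, %idx
//   %lowered_gep_succ = getelementptr ... %lowered_gep, 0, <remaining indices>
// so that GEPs sharing the same (pointer, first index) pair reuse one %lowered_gep, and exact
// two-operand duplicates are simply replaced with the instruction seen first.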
|
|
static bool lPrepareGEPs(llvm::BasicBlock &bb) {
|
|
bool changed = true;
|
|
bool modified = false;
|
|
// TODO: this variable should have different
|
|
// value for scatter coalescing
|
|
unsigned gep_operand_index = 0;
|
|
|
|
while (changed) {
|
|
changed = false;
|
|
|
|
std::map<llvm::Value *, std::map<llvm::Value *, llvm::Value *>> ptrData;
|
|
std::vector<llvm::Instruction *> dead;
|
|
std::map<llvm::Value *, llvm::Value *> movedGEPs;
|
|
for (auto bi = bb.begin(), be = bb.end(); bi != be; ++bi) {
|
|
llvm::Instruction *inst = &*bi;
|
|
|
|
if (lSkipInst(inst))
|
|
continue;
|
|
|
|
llvm::GetElementPtrInst *GEP = llvm::dyn_cast<llvm::GetElementPtrInst>(inst->getOperand(gep_operand_index));
|
|
if (!GEP) {
|
|
continue;
|
|
}
|
|
|
|
// Set real pointer in order to ignore possible bypasses
|
|
lSetRealPtr(GEP);
|
|
|
|
// Copy GEP to current BB
|
|
if (GEP->getParent() != &bb) {
|
|
auto it = movedGEPs.end();
|
|
if ((it = movedGEPs.find(GEP)) == movedGEPs.end()) {
|
|
auto newGEP = llvm::dyn_cast<llvm::GetElementPtrInst>(GEP->clone());
|
|
Assert(newGEP != nullptr);
|
|
newGEP->insertBefore(inst);
|
|
inst->setOperand(gep_operand_index, newGEP);
|
|
movedGEPs[GEP] = newGEP;
|
|
dead.push_back(GEP);
|
|
GEP = newGEP;
|
|
changed = true;
|
|
} else {
|
|
GEP = llvm::dyn_cast<llvm::GetElementPtrInst>(it->second);
|
|
Assert(GEP != nullptr);
|
|
inst->setOperand(gep_operand_index, GEP);
|
|
// Do not handle GEP during one BB run twice
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// GEP is ready for optimization
|
|
if (GEP->hasAllConstantIndices())
|
|
continue;
|
|
// TODO: Currently skipping constants at first index
|
|
// We can handle const-(const)*-arg-(arg)*-const-(const)*
|
|
// pattern if the first 4 fields are the same for some GEPs
|
|
if (llvm::dyn_cast<llvm::Constant>(GEP->getOperand(1)))
|
|
continue;
|
|
// Not interested in it
|
|
if (!GEP->getNumUses())
|
|
continue;
|
|
|
|
llvm::Value *ptr = GEP->getPointerOperand();
|
|
llvm::Value *idx = GEP->getOperand(1);
|
|
llvm::Value *foundGEP = nullptr;
|
|
auto it = ptrData[ptr].find(idx);
|
|
|
|
if (it != ptrData[ptr].end()) {
|
|
foundGEP = it->second;
|
|
// Duplication: just move uses
|
|
if (GEP->getNumOperands() == 2) {
|
|
if (GEP != foundGEP) {
|
|
GEP->replaceAllUsesWith(foundGEP);
|
|
lCopyMetadata(foundGEP, GEP);
|
|
dead.push_back(GEP);
|
|
changed = true;
|
|
}
|
|
|
|
continue;
|
|
}
|
|
} else {
|
|
// Don't need to create new GEP, use this one
|
|
if (GEP->getNumOperands() == 2)
|
|
foundGEP = GEP;
|
|
else
|
|
foundGEP = llvm::GetElementPtrInst::Create(nullptr, ptr, {idx}, "lowered_gep", GEP);
|
|
|
|
ptrData[ptr][idx] = foundGEP;
|
|
}
|
|
|
|
if (foundGEP == GEP)
|
|
continue;
|
|
|
|
std::vector<llvm::Value *> args;
|
|
args.push_back(llvm::Constant::getNullValue(GEP->getOperand(1)->getType()));
|
|
for (unsigned i = 2, end = GEP->getNumOperands(); i < end; ++i)
|
|
args.push_back(GEP->getOperand(i));
|
|
auto foundGEPUser = llvm::GetElementPtrInst::Create(nullptr, foundGEP, args, "lowered_gep_succ", GEP);
|
|
GEP->replaceAllUsesWith(foundGEPUser);
|
|
lCopyMetadata(foundGEPUser, GEP);
|
|
dead.push_back(GEP);
|
|
changed = true;
|
|
}
|
|
|
|
for (auto inst : dead) {
|
|
llvm::RecursivelyDeleteTriviallyDeadInstructions(inst);
|
|
}
|
|
|
|
if (changed)
|
|
modified = true;
|
|
}
|
|
|
|
return modified;
|
|
}
|
|
|
|
// Return true if there is a possibility of
|
|
// load-store-load sequence due to current instruction
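// Example of why this matters: if a store (or a call that may write memory) to the external
// address space sits between two loads that would otherwise be coalesced, merging them into
// one block load could observe a stale value, so the caller flushes the collected GEPs and
// runs the coalescing early before such an instruction.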
|
|
static bool lCheckForPossibleStore(llvm::Instruction *inst) {
|
|
if (auto CI = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
|
// Such call may introduce load-store-load sequence
|
|
if (!CI->onlyReadsMemory())
|
|
return true;
|
|
} else if (auto SI = llvm::dyn_cast<llvm::StoreInst>(inst)) {
|
|
// May introduce load-store-load sequence
|
|
if (GetAddressSpace(SI->getPointerOperand()) == AddressSpace::External)
|
|
return true;
|
|
}
|
|
|
|
// No possible bad store detected
|
|
return false;
|
|
}
|
|
|
|
// Run optimization for all collected GEPs
|
|
static bool lRunGenXCoalescing(std::map<llvm::Value *, std::map<llvm::Type *, std::vector<PtrUse>>> &ptrUses,
|
|
std::vector<llvm::Instruction *> &dead) {
|
|
bool modifiedAny = false;
|
|
|
|
for (auto data : ptrUses) {
|
|
for (auto data_t : data.second)
|
|
modifiedAny |= lVectorizeGEPs(data.first, data_t.second, dead);
|
|
}
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
// Analyze and optimize loads/gathers in current BB
|
|
static bool lVectorizeLoads(llvm::BasicBlock &bb) {
|
|
bool modifiedAny = false;
|
|
|
|
// TODO: this variable should have different
|
|
// value for scatter coalescing
|
|
unsigned gep_operand_index = 0;
|
|
|
|
std::map<llvm::Value *, std::map<llvm::Type *, std::vector<PtrUse>>> ptrUses;
|
|
std::vector<llvm::Instruction *> dead;
|
|
std::set<llvm::GetElementPtrInst *> handledGEPs;
|
|
for (auto bi = bb.begin(), be = bb.end(); bi != be; ++bi) {
|
|
llvm::Instruction *inst = &*bi;
|
|
|
|
if (lSkipInst(inst))
|
|
continue;
|
|
|
|
// Check for possible load-store-load sequence
|
|
if (lCheckForPossibleStore(inst)) {
|
|
// Perform early run of the optimization
|
|
modifiedAny |= lRunGenXCoalescing(ptrUses, dead);
|
|
// All current changes were applied
|
|
ptrUses.clear();
|
|
handledGEPs.clear();
|
|
continue;
|
|
}
|
|
|
|
// Get GEP operand: current algorithm is based on GEPs
|
|
// TODO: this fact leads to a problem with PTR[0]: it can be
|
|
// accessed without GEP. The possible solution is to build GEP
|
|
// during preprocessing GEPs.
|
|
llvm::GetElementPtrInst *GEP = llvm::dyn_cast<llvm::GetElementPtrInst>(inst->getOperand(gep_operand_index));
|
|
if (!GEP)
|
|
continue;
|
|
|
|
// Since gathers work with SVM, this optimization
|
|
// should be applied to external address space only
|
|
llvm::Value *ptr = GEP->getPointerOperand();
|
|
if (GetAddressSpace(ptr) != AddressSpace::External)
|
|
continue;
|
|
|
|
// GEP was already added to the list for optimization
|
|
if (handledGEPs.count(GEP))
|
|
continue;
|
|
handledGEPs.insert(GEP);
|
|
|
|
// All GEPs that are located in other BB should be copied during preprocessing
|
|
Assert(GEP->getParent() == &bb);
|
|
|
|
// Create bypass for bad GEPs
|
|
// TODO: that can be applied to several GEPs, so they can be grouped.
|
|
// It is not done now.
|
|
if (!GEP->hasAllConstantIndices()) {
|
|
llvm::GetElementPtrInst *newGEP = llvm::GetElementPtrInst::Create(
|
|
nullptr, GEP, {llvm::dyn_cast<llvm::ConstantInt>(llvm::ConstantInt::get(LLVMTypes::Int32Type, 0))},
|
|
"ptr_bypass", GEP->getParent());
|
|
newGEP->moveAfter(GEP);
|
|
GEP->replaceAllUsesWith(newGEP);
|
|
lCopyMetadata(newGEP, GEP);
|
|
newGEP->setOperand(0, GEP);
|
|
GEP = newGEP;
|
|
continue;
|
|
}
|
|
|
|
// Calculate GEP offset
|
|
llvm::APInt acc(g->target->is32Bit() ? 32 : 64, 0, true);
|
|
bool checker = GEP->accumulateConstantOffset(GEP->getParent()->getParent()->getParent()->getDataLayout(), acc);
|
|
Assert(checker);
|
|
|
|
// Skip unsuitable GEPs
|
|
llvm::Type *ty = nullptr;
|
|
if (!(ty = lGetLoadsType(GEP)))
|
|
continue;
|
|
|
|
// Collect used idxs
|
|
int64_t idx = acc.getSExtValue();
|
|
std::vector<int64_t> idxs;
|
|
lCollectIdxs(GEP, idxs, idx);
|
|
|
|
// Store data for GEP
|
|
ptrUses[ptr][ty].push_back({GEP, idxs, ty});
|
|
}
|
|
|
|
// Run optimization
|
|
modifiedAny |= lRunGenXCoalescing(ptrUses, dead);
|
|
|
|
// TODO: perform investigation on compilation failure with it
|
|
#if 0
|
|
for (auto d : dead) {
|
|
llvm::RecursivelyDeleteTriviallyDeadInstructions(d);
|
|
}
|
|
#endif
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool GenXGatherCoalescing::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("GenXGatherCoalescing");
|
|
|
|
bool modifiedAny = lPrepareGEPs(bb);
|
|
|
|
modifiedAny |= lVectorizeLoads(bb);
|
|
|
|
DEBUG_END_PASS("GenXGatherCoalescing");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool GenXGatherCoalescing::runOnFunction(llvm::Function &F) {
|
|
llvm::TimeTraceScope FuncScope("GenXGatherCoalescing::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateGenXGatherCoalescingPass() { return new GenXGatherCoalescing; }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// ReplaceLLVMIntrinsics
|
|
|
|
/** This pass replaces LLVM intrinsics unsupported on GenX
|
|
*/
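// Current replacements performed in runOnBasicBlock below:
//   - llvm.trap                                          -> genx.raw.send.noresult message
//   - llvm.assume, llvm.experimental.noalias.scope.decl  -> removed
//   - llvm.abs                                           -> genx.absi / genx.absf, by operand type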
|
|
|
|
class ReplaceLLVMIntrinsics : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
ReplaceLLVMIntrinsics() : FunctionPass(ID) {}
|
|
llvm::StringRef getPassName() const { return "LLVM intrinsics replacement"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
bool runOnFunction(llvm::Function &F);
|
|
};
|
|
|
|
char ReplaceLLVMIntrinsics::ID = 0;
|
|
|
|
bool ReplaceLLVMIntrinsics::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("LLVM intrinsics replacement");
|
|
std::vector<llvm::AllocaInst *> Allocas;
|
|
|
|
bool modifiedAny = false;
|
|
|
|
restart:
|
|
for (llvm::BasicBlock::iterator I = bb.begin(), E = --bb.end(); I != E; ++I) {
|
|
llvm::Instruction *inst = &*I;
|
|
if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
|
llvm::Function *func = ci->getCalledFunction();
|
|
if (func == NULL || !func->isIntrinsic())
|
|
continue;
|
|
|
|
if (func->getName().equals("llvm.trap")) {
|
|
llvm::Type *argTypes[] = {LLVMTypes::Int1VectorType, LLVMTypes::Int16VectorType};
|
|
// Description of parameters for genx_raw_send_noresult can be found in target-genx.ll
|
|
auto Fn = llvm::GenXIntrinsic::getGenXDeclaration(
|
|
m->module, llvm::GenXIntrinsic::genx_raw_send_noresult, argTypes);
|
|
llvm::SmallVector<llvm::Value *, 8> Args;
|
|
Args.push_back(llvm::ConstantInt::get(LLVMTypes::Int32Type, 0));
|
|
Args.push_back(llvm::ConstantVector::getSplat(
|
|
#if ISPC_LLVM_VERSION < ISPC_LLVM_11_0
|
|
g->target->getNativeVectorWidth(),
|
|
#elif ISPC_LLVM_VERSION < ISPC_LLVM_12_0
|
|
{static_cast<unsigned int>(g->target->getNativeVectorWidth()), false},
|
|
#else // LLVM 12.0+
|
|
llvm::ElementCount::get(static_cast<unsigned int>(g->target->getNativeVectorWidth()), false),
|
|
#endif
|
|
llvm::ConstantInt::getTrue(*g->ctx)));
|
|
|
|
Args.push_back(llvm::ConstantInt::get(LLVMTypes::Int32Type, 39));
|
|
Args.push_back(llvm::ConstantInt::get(LLVMTypes::Int32Type, 33554448));
|
|
llvm::Value *zeroMask = llvm::ConstantVector::getSplat(
|
|
#if ISPC_LLVM_VERSION < ISPC_LLVM_11_0
|
|
g->target->getNativeVectorWidth(),
|
|
#elif ISPC_LLVM_VERSION < ISPC_LLVM_12_0
|
|
{static_cast<unsigned int>(g->target->getNativeVectorWidth()), false},
|
|
#else // LLVM 12.0+
|
|
llvm::ElementCount::get(static_cast<unsigned int>(g->target->getNativeVectorWidth()), false),
|
|
#endif
|
|
llvm::Constant::getNullValue(llvm::Type::getInt16Ty(*g->ctx)));
|
|
Args.push_back(zeroMask);
|
|
|
|
llvm::Instruction *newInst = llvm::CallInst::Create(Fn, Args, ci->getName());
|
|
if (newInst != NULL) {
|
|
llvm::ReplaceInstWithInst(ci, newInst);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
} else if (func->getName().equals("llvm.assume") ||
|
|
func->getName().equals("llvm.experimental.noalias.scope.decl")) {
|
|
// These intrinsics are not supported by backend so remove them.
|
|
ci->eraseFromParent();
|
|
modifiedAny = true;
|
|
goto restart;
|
|
} else if (func->getName().contains("llvm.abs")) {
|
|
// Replace llvm.abs with the llvm.genx.absi/absf alternative
|
|
Assert(ci->getOperand(0));
|
|
llvm::Type *argType = ci->getOperand(0)->getType();
|
|
|
|
llvm::Type *Tys[2];
|
|
Tys[0] = func->getReturnType(); // return type
|
|
Tys[1] = argType; // value type
|
|
|
|
llvm::GenXIntrinsic::ID genxAbsID =
|
|
argType->isIntOrIntVectorTy() ? llvm::GenXIntrinsic::genx_absi : llvm::GenXIntrinsic::genx_absf;
|
|
auto Fn = llvm::GenXIntrinsic::getGenXDeclaration(m->module, genxAbsID, Tys);
|
|
Assert(Fn);
|
|
llvm::Instruction *newInst = llvm::CallInst::Create(Fn, ci->getOperand(0), "");
|
|
if (newInst != NULL) {
|
|
lCopyMetadata(newInst, ci);
|
|
llvm::ReplaceInstWithInst(ci, newInst);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
DEBUG_END_PASS("LLVM intrinsics replacement");
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool ReplaceLLVMIntrinsics::runOnFunction(llvm::Function &F) {
|
|
|
|
llvm::TimeTraceScope FuncScope("ReplaceLLVMIntrinsics::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateReplaceLLVMIntrinsics() { return new ReplaceLLVMIntrinsics(); }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// FixDivisionInstructions
|
|
|
|
/** This pass replaces instructions supported by LLVM IR, but not supported by GenX backend.
|
|
There is IR for i64 div, but there is no div i64 in VISA and GenX backend doesn't handle
|
|
this situation, so ISPC must not generate IR with i64 div.
|
|
*/
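// Illustrative example of the rewrite for a scalar unsigned division:
//   %q = udiv i64 %a, %b
// becomes a call to the corresponding stdlib helper
//   %q = call i64 @__divus_ui64(i64 %a, i64 %b)
// (the signed/remainder and vector cases map to the other __div*/__rem* names listed below).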
|
|
|
|
class FixDivisionInstructions : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
FixDivisionInstructions() : FunctionPass(ID) {}
|
|
llvm::StringRef getPassName() const { return "Fix division instructions unsupported by VC backend"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
bool runOnFunction(llvm::Function &F);
|
|
};
|
|
|
|
char FixDivisionInstructions::ID = 0;
|
|
|
|
bool FixDivisionInstructions::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("FixDivisionInstructions");
|
|
bool modifiedAny = false;
|
|
|
|
std::string name = "";
|
|
llvm::Function *func;
|
|
for (llvm::BasicBlock::iterator I = bb.begin(), E = --bb.end(); I != E; ++I) {
|
|
llvm::Instruction *inst = &*I;
|
|
// for now, all replaced inst have 2 operands
|
|
if (inst->getNumOperands() > 1) {
|
|
auto type = inst->getOperand(0)->getType();
|
|
// for now, all replaced inst have i64 operands
|
|
if ((type == LLVMTypes::Int64Type) || (type == LLVMTypes::Int64VectorType)) {
|
|
switch (inst->getOpcode()) {
|
|
case llvm::Instruction::BinaryOps::UDiv:
|
|
if (type == LLVMTypes::Int64Type) {
|
|
name = "__divus_ui64";
|
|
} else {
|
|
name = "__divus_vi64";
|
|
}
|
|
break;
|
|
case llvm::Instruction::BinaryOps::SDiv:
|
|
if (type == LLVMTypes::Int64Type) {
|
|
name = "__divs_ui64";
|
|
} else {
|
|
name = "__divs_vi64";
|
|
}
|
|
break;
|
|
case llvm::Instruction::BinaryOps::URem:
|
|
if (type == LLVMTypes::Int64Type) {
|
|
name = "__remus_ui64";
|
|
} else {
|
|
name = "__remus_vi64";
|
|
}
|
|
break;
|
|
case llvm::Instruction::BinaryOps::SRem:
|
|
if (type == LLVMTypes::Int64Type) {
|
|
name = "__rems_ui64";
|
|
} else {
|
|
name = "__rems_vi64";
|
|
}
|
|
break;
|
|
default:
|
|
name = "";
|
|
break;
|
|
}
|
|
if (name != "") {
|
|
func = m->module->getFunction(name);
|
|
Assert(func != NULL && "FixDivisionInstructions: Can't find correct function!!!");
|
|
llvm::SmallVector<llvm::Value *, 8> args;
|
|
args.push_back(inst->getOperand(0));
|
|
args.push_back(inst->getOperand(1));
|
|
llvm::Instruction *newInst = llvm::CallInst::Create(func, args, name);
|
|
llvm::ReplaceInstWithInst(inst, newInst);
|
|
modifiedAny = true;
|
|
I = newInst->getIterator();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
DEBUG_END_PASS("FixDivisionInstructions");
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool FixDivisionInstructions::runOnFunction(llvm::Function &F) {
|
|
|
|
llvm::TimeTraceScope FuncScope("FixDivisionInstructions::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateFixDivisionInstructions() { return new FixDivisionInstructions(); }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// CheckUnsupportedInsts
|
|
|
|
/** This pass checks if there are any functions used which are not supported currently for gen target,
|
|
reports error and stops compilation.
|
|
*/
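// For example, a genx.lsc.prefetch.stateless call on hardware without prefetch support, or any
// use of a double-typed operand on a target without FP64 support, is reported as an error at
// the source position recovered from the instruction's metadata.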
|
|
|
|
class CheckUnsupportedInsts : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
CheckUnsupportedInsts(bool last = false) : FunctionPass(ID) {}
|
|
|
|
llvm::StringRef getPassName() const { return "Check unsupported instructions for gen target"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
bool runOnFunction(llvm::Function &F);
|
|
};
|
|
|
|
char CheckUnsupportedInsts::ID = 0;
|
|
|
|
bool CheckUnsupportedInsts::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("CheckUnsupportedInsts");
|
|
bool modifiedAny = false;
|
|
// This list contains regex expr for unsupported function names
|
|
// To be extended
|
|
|
|
for (llvm::BasicBlock::iterator I = bb.begin(), E = --bb.end(); I != E; ++I) {
|
|
llvm::Instruction *inst = &*I;
|
|
SourcePos pos;
|
|
lGetSourcePosFromMetadata(inst, &pos);
|
|
if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
|
llvm::Function *func = ci->getCalledFunction();
|
|
// Report error that prefetch is not supported on SKL and TGLLP
|
|
if (func && func->getName().contains("genx.lsc.prefetch.stateless")) {
|
|
if (!g->target->hasGenxPrefetch()) {
|
|
Error(pos, "\'prefetch\' is not supported by %s\n", g->target->getCPU().c_str());
|
|
}
|
|
}
|
|
}
|
|
// Report error if double type is not supported by the target
|
|
if (!g->target->hasFp64Support()) {
|
|
for (int i = 0; i < (int)inst->getNumOperands(); ++i) {
|
|
llvm::Type *t = inst->getOperand(i)->getType();
|
|
if (t == LLVMTypes::DoubleType || t == LLVMTypes::DoublePointerType ||
|
|
t == LLVMTypes::DoubleVectorType || t == LLVMTypes::DoubleVectorPointerType) {
|
|
Error(pos, "\'double\' type is not supported by the target\n");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
DEBUG_END_PASS("CheckUnsupportedInsts");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool CheckUnsupportedInsts::runOnFunction(llvm::Function &F) {
|
|
llvm::TimeTraceScope FuncScope("CheckUnsupportedInsts::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateCheckUnsupportedInsts() { return new CheckUnsupportedInsts(); }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// MangleOpenCLBuiltins
|
|
|
|
/** This pass mangles SPIR-V OpenCL builtins used in gen target file
|
|
*/
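// Illustrative example (the builtin name here is hypothetical): a call to
// __spirv_ocl_sqrt_DvWIDTH returning a double vector has the "_DvWIDTH" suffix stripped and the
// remaining name mangled via mangleOpenClBuiltin() for a double-vector argument; scalar double
// versions are mangled for a plain double argument instead.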
|
|
|
|
class MangleOpenCLBuiltins : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
MangleOpenCLBuiltins(bool last = false) : FunctionPass(ID) {}
|
|
|
|
llvm::StringRef getPassName() const { return "Mangle OpenCL builtins"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
bool runOnFunction(llvm::Function &F);
|
|
};
|
|
|
|
char MangleOpenCLBuiltins::ID = 0;
|
|
|
|
bool MangleOpenCLBuiltins::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("MangleOpenCLBuiltins");
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock::iterator I = bb.begin(), E = --bb.end(); I != E; ++I) {
|
|
llvm::Instruction *inst = &*I;
|
|
if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst)) {
|
|
llvm::Function *func = ci->getCalledFunction();
|
|
if (func == NULL)
|
|
continue;
|
|
if (func->getName().startswith("__spirv_ocl")) {
|
|
std::string mangledName;
|
|
llvm::Type *retType = func->getReturnType();
|
|
std::string funcName = func->getName().str();
|
|
std::vector<llvm::Type *> ArgTy;
|
|
// spirv OpenCL builtins are used for double types only
|
|
Assert(retType->isVectorTy() &&
|
|
llvm::dyn_cast<llvm::VectorType>(retType)->getElementType()->isDoubleTy() ||
|
|
retType->isSingleValueType() && retType->isDoubleTy());
|
|
if (retType->isVectorTy() &&
|
|
llvm::dyn_cast<llvm::VectorType>(retType)->getElementType()->isDoubleTy()) {
|
|
ArgTy.push_back(LLVMTypes::DoubleVectorType);
|
|
// _DvWIDTH suffix is used in target file to differentiate scalar
|
|
// and vector versions of intrinsics. Here we remove this
|
|
// suffix and mangle the name.
|
|
size_t pos = funcName.find("_DvWIDTH");
|
|
if (pos != std::string::npos) {
|
|
funcName.erase(pos, 8);
|
|
}
|
|
} else if (retType->isSingleValueType() && retType->isDoubleTy()) {
|
|
ArgTy.push_back(LLVMTypes::DoubleType);
|
|
}
|
|
mangleOpenClBuiltin(funcName, ArgTy, mangledName);
|
|
func->setName(mangledName);
|
|
modifiedAny = true;
|
|
}
|
|
}
|
|
}
|
|
DEBUG_END_PASS("MangleOpenCLBuiltins");
|
|
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool MangleOpenCLBuiltins::runOnFunction(llvm::Function &F) {
|
|
llvm::TimeTraceScope FuncScope("MangleOpenCLBuiltins::runOnFunction", F.getName());
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateMangleOpenCLBuiltins() { return new MangleOpenCLBuiltins(); }
|
|
|
|
///////////////////////////////////////////////////////////////////////////
|
|
// FixAddressSpace
|
|
|
|
/** This pass should go close to the end of the optimization pipeline. It fixes address space issues
caused by function inlining and LLVM optimizations.
TODO: the implementation is not complete yet; it just unblocks work on the openvkl kernel.
1. fix where needed svm.block.ld -> load, svm.block.st -> store
|
|
2. svm gathers/scatters <-> private gathers/scatters
|
|
3. ideally generate LLVM vector loads/stores in ImproveOptMemory pass, fix address space here
|
|
*/
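// Rough sketch of the current rewrites (see the process* helpers below): an LLVM vector
// load/store whose pointer is in the external (SVM) address space becomes a
// genx.svm.block.ld.unaligned / genx.svm.block.st intrinsic, while an SVM block load/store
// whose 64-bit address is actually private is turned back into a plain LLVM load/store through
// an inttoptr of that address.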
|
|
|
|
class FixAddressSpace : public llvm::FunctionPass {
|
|
llvm::Instruction *processVectorLoad(llvm::LoadInst *LI);
|
|
llvm::Instruction *processSVMVectorLoad(llvm::Instruction *LI);
|
|
llvm::Instruction *processVectorStore(llvm::StoreInst *SI);
|
|
llvm::Instruction *processSVMVectorStore(llvm::Instruction *LI);
|
|
void applyReplace(llvm::Instruction *Inst, llvm::Instruction *ToReplace);
|
|
|
|
public:
|
|
static char ID;
|
|
FixAddressSpace() : FunctionPass(ID) {}
|
|
llvm::StringRef getPassName() const { return "Fix address space"; }
|
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
|
bool runOnFunction(llvm::Function &F);
|
|
};
|
|
|
|
char FixAddressSpace::ID = 0;
|
|
|
|
void FixAddressSpace::applyReplace(llvm::Instruction *Inst, llvm::Instruction *ToReplace) {
|
|
lCopyMetadata(Inst, ToReplace);
|
|
llvm::ReplaceInstWithInst(ToReplace, Inst);
|
|
}
|
|
|
|
llvm::Instruction *FixAddressSpace::processVectorLoad(llvm::LoadInst *LI) {
|
|
llvm::Value *ptr = LI->getOperand(0);
|
|
llvm::Type *retType = LI->getType();
|
|
|
|
Assert(llvm::isa<llvm::VectorType>(LI->getType()));
|
|
|
|
if (GetAddressSpace(ptr) != AddressSpace::External)
|
|
return NULL;
|
|
|
|
// Ptr load should be done via inttoptr
|
|
bool isPtrLoad = false;
|
|
if (retType->getScalarType()->isPointerTy()) {
|
|
isPtrLoad = true;
|
|
auto scalarType = g->target->is32Bit() ? LLVMTypes::Int32Type : LLVMTypes::Int64Type;
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
retType =
|
|
llvm::FixedVectorType::get(scalarType, llvm::dyn_cast<llvm::FixedVectorType>(retType)->getNumElements());
|
|
#else
|
|
retType = llvm::VectorType::get(scalarType, retType->getVectorNumElements());
|
|
#endif
|
|
}
|
|
llvm::Instruction *res = lGenXLoadInst(ptr, retType, llvm::dyn_cast<llvm::Instruction>(LI));
|
|
Assert(res);
|
|
|
|
if (isPtrLoad) {
|
|
res->insertBefore(LI);
|
|
res = new llvm::IntToPtrInst(res, LI->getType(), "svm_ld_inttoptr");
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
llvm::Instruction *FixAddressSpace::processSVMVectorLoad(llvm::Instruction *CI) {
|
|
llvm::Value *ptr = CI->getOperand(0);
|
|
llvm::Type *retType = CI->getType();
|
|
|
|
Assert(llvm::isa<llvm::VectorType>(retType));
|
|
|
|
if (GetAddressSpace(ptr) == AddressSpace::External)
|
|
return NULL;
|
|
|
|
// Convert int64 ptr to pointer
|
|
ptr = new llvm::IntToPtrInst(ptr, llvm::PointerType::get(retType, 0), CI->getName() + "_inttoptr", CI);
|
|
llvm::Instruction *loadInst = NULL;
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
Assert(llvm::isa<llvm::PointerType>(ptr->getType()));
|
|
loadInst = new llvm::LoadInst(llvm::dyn_cast<llvm::PointerType>(ptr->getType())->getPointerElementType(), ptr,
|
|
CI->getName(), false /* not volatile */,
|
|
llvm::MaybeAlign(g->target->getNativeVectorAlignment()).valueOrOne(),
|
|
(llvm::Instruction *)NULL);
|
|
#else
|
|
loadInst = new llvm::LoadInst(ptr, CI->getName(), false,
|
|
llvm::MaybeAlign(g->target->getNativeVectorAlignment()).valueOrOne(),
|
|
(llvm::Instruction *)NULL);
|
|
#endif
|
|
|
|
Assert(loadInst);
|
|
return loadInst;
|
|
}
|
|
|
|
llvm::Instruction *FixAddressSpace::processVectorStore(llvm::StoreInst *SI) {
|
|
llvm::Value *ptr = SI->getOperand(1);
|
|
llvm::Value *val = SI->getOperand(0);
|
|
Assert(ptr != NULL);
|
|
Assert(val != NULL);
|
|
|
|
llvm::Type *valType = val->getType();
|
|
|
|
Assert(llvm::isa<llvm::VectorType>(valType));
|
|
|
|
if (GetAddressSpace(ptr) != AddressSpace::External)
|
|
return NULL;
|
|
|
|
// Ptr store should be done via ptrtoint
|
|
// Note: it doesn't look like a normal case for GenX target
|
|
if (valType->getScalarType()->isPointerTy()) {
|
|
auto scalarType = g->target->is32Bit() ? LLVMTypes::Int32Type : LLVMTypes::Int64Type;
|
|
#if ISPC_LLVM_VERSION >= ISPC_LLVM_11_0
|
|
valType =
|
|
llvm::FixedVectorType::get(scalarType, llvm::dyn_cast<llvm::FixedVectorType>(valType)->getNumElements());
|
|
#else
|
|
valType = llvm::VectorType::get(scalarType, valType->getVectorNumElements());
|
|
#endif
|
|
val = new llvm::PtrToIntInst(val, valType, "svm_st_val_ptrtoint", SI);
|
|
}
|
|
|
|
return lGenXStoreInst(val, ptr, llvm::dyn_cast<llvm::Instruction>(SI));
|
|
}
|
|
|
|
llvm::Instruction *FixAddressSpace::processSVMVectorStore(llvm::Instruction *CI) {
|
|
llvm::Value *ptr = CI->getOperand(0);
|
|
llvm::Value *val = CI->getOperand(1);
|
|
|
|
Assert(ptr != NULL);
|
|
Assert(val != NULL);
|
|
|
|
llvm::Type *valType = val->getType();
|
|
|
|
Assert(llvm::isa<llvm::VectorType>(valType));
|
|
|
|
if (GetAddressSpace(ptr) == AddressSpace::External)
|
|
return NULL;
|
|
|
|
// Convert int64 ptr to pointer
|
|
ptr = new llvm::IntToPtrInst(ptr, llvm::PointerType::get(valType, 0), CI->getName() + "_inttoptr", CI);
|
|
|
|
llvm::Instruction *storeInst = NULL;
|
|
storeInst = new llvm::StoreInst(val, ptr, (llvm::Instruction *)NULL,
|
|
llvm::MaybeAlign(g->target->getNativeVectorAlignment()).valueOrOne());
|
|
Assert(storeInst);
|
|
return storeInst;
|
|
}
|
|
|
|
bool FixAddressSpace::runOnBasicBlock(llvm::BasicBlock &bb) {
|
|
DEBUG_START_PASS("FixAddressSpace");
|
|
bool modifiedAny = false;
|
|
|
|
restart:
|
|
for (llvm::BasicBlock::iterator I = bb.begin(), E = --bb.end(); I != E; ++I) {
|
|
llvm::Instruction *inst = &*I;
|
|
if (llvm::LoadInst *ld = llvm::dyn_cast<llvm::LoadInst>(inst)) {
|
|
if (llvm::isa<llvm::VectorType>(ld->getType())) {
|
|
llvm::Instruction *load_inst = processVectorLoad(ld);
|
|
if (load_inst != NULL) {
|
|
applyReplace(load_inst, ld);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
} else if (llvm::StoreInst *st = llvm::dyn_cast<llvm::StoreInst>(inst)) {
|
|
llvm::Value *val = st->getOperand(0);
|
|
Assert(val != NULL);
|
|
if (llvm::isa<llvm::VectorType>(val->getType())) {
|
|
llvm::Instruction *store_inst = processVectorStore(st);
|
|
if (store_inst != NULL) {
|
|
applyReplace(store_inst, st);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
} else if (llvm::GenXIntrinsic::getGenXIntrinsicID(inst) == llvm::GenXIntrinsic::genx_svm_block_ld_unaligned) {
|
|
llvm::Instruction *load_inst = processSVMVectorLoad(inst);
|
|
if (load_inst != NULL) {
|
|
applyReplace(load_inst, inst);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
} else if (llvm::GenXIntrinsic::getGenXIntrinsicID(inst) == llvm::GenXIntrinsic::genx_svm_block_st) {
|
|
llvm::Instruction *store_inst = processSVMVectorStore(inst);
|
|
if (store_inst != NULL) {
|
|
applyReplace(store_inst, inst);
|
|
modifiedAny = true;
|
|
goto restart;
|
|
}
|
|
}
|
|
}
|
|
DEBUG_END_PASS("Fix address space");
|
|
return modifiedAny;
|
|
}
|
|
|
|
bool FixAddressSpace::runOnFunction(llvm::Function &F) {
|
|
// Transformations are correct when the function is not internal.
|
|
// This is due to the address space calculation algorithm.
|
|
// TODO: problems can be met in case of Stack Calls
|
|
llvm::TimeTraceScope FuncScope("FixAddressSpace::runOnFunction", F.getName());
|
|
if (F.getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage)
|
|
return false;
|
|
|
|
bool modifiedAny = false;
|
|
for (llvm::BasicBlock &BB : F) {
|
|
modifiedAny |= runOnBasicBlock(BB);
|
|
}
|
|
return modifiedAny;
|
|
}
|
|
|
|
static llvm::Pass *CreateFixAddressSpace() { return new FixAddressSpace(); }
|
|
|
|
class DemotePHIs : public llvm::FunctionPass {
|
|
public:
|
|
static char ID;
|
|
DemotePHIs() : FunctionPass(ID) {}
|
|
llvm::StringRef getPassName() const { return "Demote PHI nodes"; }
|
|
bool runOnFunction(llvm::Function &F);
|
|
};
|
|
|
|
char DemotePHIs::ID = 0;
|
|
|
|
bool DemotePHIs::runOnFunction(llvm::Function &F) {
|
|
llvm::TimeTraceScope FuncScope("DemotePHIs::runOnFunction", F.getName());
|
|
if (F.isDeclaration() || skipFunction(F))
|
|
return false;
|
|
std::vector<llvm::Instruction *> WorkList;
|
|
for (auto &ibb : F)
|
|
for (llvm::BasicBlock::iterator iib = ibb.begin(), iie = ibb.end(); iib != iie; ++iib)
|
|
if (llvm::isa<llvm::PHINode>(iib))
|
|
WorkList.push_back(&*iib);
|
|
|
|
// Demote phi nodes
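// DemotePHIToStack (an LLVM utility) routes each phi through a stack slot: every predecessor
// stores its incoming value to an alloca and the phi itself is replaced by a load from that
// slot; this is a description of the utility's behavior, not logic implemented in this pass.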
|
|
for (auto *ilb : llvm::reverse(WorkList))
|
|
DemotePHIToStack(llvm::cast<llvm::PHINode>(ilb), nullptr);
|
|
|
|
return !WorkList.empty();
|
|
}
|
|
|
|
static llvm::Pass *CreateDemotePHIs() { return new DemotePHIs(); }
|
|
|
|
#endif
|