/*
  Copyright (c) 2010-2024, Intel Corporation

  SPDX-License-Identifier: BSD-3-Clause
*/

/** @file ctx.cpp

    @brief Implementation of the FunctionEmitContext class
*/

#include "ctx.h"
|
|
#include "builtins-decl.h"
|
|
#include "expr.h"
|
|
#include "func.h"
|
|
#include "llvmutil.h"
|
|
#include "module.h"
|
|
#include "stmt.h"
|
|
#include "sym.h"
|
|
#include "type.h"
|
|
#include "util.h"
|
|
|
|
#include <map>
|
|
|
|
#include <llvm/BinaryFormat/Dwarf.h>
|
|
#include <llvm/IR/DerivedTypes.h>
|
|
#include <llvm/IR/Instructions.h>
|
|
#include <llvm/IR/Metadata.h>
|
|
#include <llvm/IR/Module.h>
|
|
|
|
#ifdef ISPC_XE_ENABLED
|
|
#include <llvm/GenXIntrinsics/GenXIntrinsics.h>
|
|
#endif
|
|
|
|
namespace ispc {

/** This is a small utility structure that records information related to one
    level of nested control flow. It's mostly used in correctly restoring
    the mask and other state as we exit control flow nesting levels.
*/
struct CFInfo {
    /** Returns a new instance of the structure that represents entering an
        'if' statement */
    static CFInfo *GetIf(bool isUniform, bool isUniformEmulated, llvm::Value *savedMask);

    /** Returns a new instance of the structure that represents entering a
        loop. */
    static CFInfo *GetLoop(bool isUniform, bool isUniformEmulated, llvm::BasicBlock *breakTarget,
                           llvm::BasicBlock *continueTarget, AddressInfo *savedBreakLanesAddressInfo,
                           AddressInfo *savedContinueLanesAddressInfo, llvm::Value *savedMask,
                           llvm::Value *savedBlockEntryMask);

    static CFInfo *GetForeach(bool isUniformEmulated, FunctionEmitContext::ForeachType ft,
                              llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget,
                              AddressInfo *savedBreakLanesAddressInfo, AddressInfo *savedContinueLanesAddressInfo,
                              llvm::Value *savedMask, llvm::Value *savedBlockEntryMask);

    static CFInfo *GetSwitch(bool isUniform, bool isUniformEmulated, llvm::BasicBlock *breakTarget,
                             llvm::BasicBlock *continueTarget, AddressInfo *savedBreakLanesAddressInfo,
                             AddressInfo *savedContinueLanesAddressInfo, llvm::Value *savedMask,
                             llvm::Value *savedBlockEntryMask, llvm::Value *switchExpr,
                             AddressInfo *savedFallThroughMaskPtr, llvm::BasicBlock *bbDefault,
                             const std::vector<std::pair<int, llvm::BasicBlock *>> *bbCases,
                             const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbNext, bool scUniform);

    bool IsIf() { return type == If; }
    bool IsLoop() { return type == Loop; }
    bool IsForeach() { return (type == ForeachRegular || type == ForeachActive || type == ForeachUnique); }
    bool IsSwitch() { return type == Switch; }
    bool IsVarying() { return !isUniform; }
    bool IsUniform() { return isUniform; }
    bool IsUniformEmulated() { return isUniformEmulated; }

    enum CFType { If, Loop, ForeachRegular, ForeachActive, ForeachUnique, Switch };
    CFType type;
    bool isUniform;
    bool isUniformEmulated;
    llvm::BasicBlock *savedBreakTarget, *savedContinueTarget;
    AddressInfo *savedBreakLanesAddressInfo, *savedContinueLanesAddressInfo;
    llvm::Value *savedMask, *savedBlockEntryMask;
    llvm::Value *savedSwitchExpr;
    AddressInfo *savedSwitchFallThroughMaskAddressInfo;
    llvm::BasicBlock *savedDefaultBlock;
    const std::vector<std::pair<int, llvm::BasicBlock *>> *savedCaseBlocks;
    const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNextBlocks;
    bool savedSwitchConditionWasUniform;

  private:
    CFInfo(CFType t, bool uniformIf, bool uniformEmu, llvm::Value *sm) {
        Assert(t == If);
        type = t;
        isUniform = uniformIf;
        isUniformEmulated = uniformEmu;
        savedBreakTarget = savedContinueTarget = nullptr;
        savedBreakLanesAddressInfo = savedContinueLanesAddressInfo = nullptr;
        savedMask = savedBlockEntryMask = sm;
        savedSwitchExpr = nullptr;
        savedSwitchFallThroughMaskAddressInfo = nullptr;
        savedDefaultBlock = nullptr;
        savedCaseBlocks = nullptr;
        savedNextBlocks = nullptr;
        savedSwitchConditionWasUniform = false;
    }
    CFInfo(CFType t, bool iu, bool uniformEmulated, llvm::BasicBlock *bt, llvm::BasicBlock *ct, AddressInfo *sb,
           AddressInfo *sc, llvm::Value *sm, llvm::Value *lm, llvm::Value *sse = nullptr, AddressInfo *ssftmp = nullptr,
           llvm::BasicBlock *bbd = nullptr, const std::vector<std::pair<int, llvm::BasicBlock *>> *bbc = nullptr,
           const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *bbn = nullptr, bool scu = false) {
        Assert(t == Loop || t == Switch);
        type = t;
        isUniform = iu;
        isUniformEmulated = uniformEmulated;
        savedBreakTarget = bt;
        savedContinueTarget = ct;
        savedBreakLanesAddressInfo = sb;
        savedContinueLanesAddressInfo = sc;
        savedMask = sm;
        savedBlockEntryMask = lm;
        savedSwitchExpr = sse;
        savedSwitchFallThroughMaskAddressInfo = ssftmp;
        savedDefaultBlock = bbd;
        savedCaseBlocks = bbc;
        savedNextBlocks = bbn;
        savedSwitchConditionWasUniform = scu;
    }
    CFInfo(CFType t, bool uniformEmulated, llvm::BasicBlock *bt, llvm::BasicBlock *ct, AddressInfo *sb, AddressInfo *sc,
           llvm::Value *sm, llvm::Value *lm) {
        Assert(t == ForeachRegular || t == ForeachActive || t == ForeachUnique);
        type = t;
        isUniform = uniformEmulated;
        isUniformEmulated = uniformEmulated;
        savedBreakTarget = bt;
        savedContinueTarget = ct;
        savedBreakLanesAddressInfo = sb;
        savedContinueLanesAddressInfo = sc;
        savedMask = sm;
        savedBlockEntryMask = lm;
        savedSwitchExpr = nullptr;
        savedSwitchFallThroughMaskAddressInfo = nullptr;
        savedDefaultBlock = nullptr;
        savedCaseBlocks = nullptr;
        savedNextBlocks = nullptr;
        savedSwitchConditionWasUniform = false;
    }
};

CFInfo *CFInfo::GetIf(bool isUniform, bool isUniformEmulated, llvm::Value *savedMask) {
    return new CFInfo(If, isUniform, isUniformEmulated, savedMask);
}

CFInfo *CFInfo::GetLoop(bool isUniform, bool isUniformEmulated, llvm::BasicBlock *breakTarget,
                        llvm::BasicBlock *continueTarget, AddressInfo *savedBreakLanesAddressInfo,
                        AddressInfo *savedContinueLanesAddressInfo, llvm::Value *savedMask,
                        llvm::Value *savedBlockEntryMask) {
    return new CFInfo(Loop, isUniform, isUniformEmulated, breakTarget, continueTarget, savedBreakLanesAddressInfo,
                      savedContinueLanesAddressInfo, savedMask, savedBlockEntryMask);
}

CFInfo *CFInfo::GetForeach(bool isUniformEmulated, FunctionEmitContext::ForeachType ft, llvm::BasicBlock *breakTarget,
                           llvm::BasicBlock *continueTarget, AddressInfo *savedBreakLanesAddressInfo,
                           AddressInfo *savedContinueLanesAddressInfo, llvm::Value *savedMask,
                           llvm::Value *savedForeachMask) {
    CFType cfType;
    switch (ft) {
    case FunctionEmitContext::FOREACH_REGULAR:
        cfType = ForeachRegular;
        break;
    case FunctionEmitContext::FOREACH_ACTIVE:
        cfType = ForeachActive;
        break;
    case FunctionEmitContext::FOREACH_UNIQUE:
        cfType = ForeachUnique;
        break;
    default:
        FATAL("Unhandled foreach type");
        return nullptr;
    }

    return new CFInfo(cfType, isUniformEmulated, breakTarget, continueTarget, savedBreakLanesAddressInfo,
                      savedContinueLanesAddressInfo, savedMask, savedForeachMask);
}

CFInfo *CFInfo::GetSwitch(bool isUniform, bool isUniformEmulated, llvm::BasicBlock *breakTarget,
                          llvm::BasicBlock *continueTarget, AddressInfo *savedBreakLanesAddressInfo,
                          AddressInfo *savedContinueLanesAddressInfo, llvm::Value *savedMask,
                          llvm::Value *savedBlockEntryMask, llvm::Value *savedSwitchExpr,
                          AddressInfo *savedSwitchFallThroughMaskAddressInfo, llvm::BasicBlock *savedDefaultBlock,
                          const std::vector<std::pair<int, llvm::BasicBlock *>> *savedCases,
                          const std::map<llvm::BasicBlock *, llvm::BasicBlock *> *savedNext,
                          bool savedSwitchConditionUniform) {
    return new CFInfo(Switch, isUniform, isUniformEmulated, breakTarget, continueTarget, savedBreakLanesAddressInfo,
                      savedContinueLanesAddressInfo, savedMask, savedBlockEntryMask, savedSwitchExpr,
                      savedSwitchFallThroughMaskAddressInfo, savedDefaultBlock, savedCases, savedNext,
                      savedSwitchConditionUniform);
}

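// Illustrative sketch (not code from this file): the emit context keeps a
// stack of CFInfo records, one per nesting level. Entering a construct
// pushes the state that must survive it, and popCFState() restores it:
//
//     ctx->StartLoop(bBreak, bContinue, /* uniformCF = */ false, false);
//     // ...emit the loop body...
//     ctx->EndLoop(); // pops the CFInfo and restores the saved mask state
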
///////////////////////////////////////////////////////////////////////////

AddressInfo::AddressInfo(llvm::Value *p, llvm::Type *t) : pointer(p), elementType(t), ispcType(nullptr) {
    Assert(pointer != nullptr && "Pointer cannot be null");
    Assert(elementType != nullptr && "Element type cannot be null");
}

AddressInfo::AddressInfo(llvm::Value *p, const Type *t) : pointer(p), ispcType(t) {
    Assert(pointer != nullptr && "Pointer cannot be null");
    Assert(ispcType != nullptr && "ISPC type cannot be null");
    // Get the LLVM pointer element type based on the ISPC type.
    // TODO: need more testing
    if (CastType<ReferenceType>(t) != nullptr) {
        PointerType *pType = PointerType::GetUniform(t->GetReferenceTarget());
        elementType = pType->GetBaseType()->LLVMStorageType(g->ctx);
    } else if (CastType<PointerType>(t) != nullptr) {
        elementType = t->GetBaseType()->LLVMStorageType(g->ctx);
    } else {
        elementType = t->LLVMStorageType(g->ctx);
    }
    Assert(elementType != nullptr && "Element type cannot be null");
}

llvm::Type *AddressInfo::GetPointeeLLVMType(const PointerType *pt) {
    Assert(pt != nullptr && "ISPC type cannot be null");
    llvm::Type *type = pt->GetBaseType()->LLVMStorageType(g->ctx);
    Assert(type != nullptr && "LLVM pointer element type cannot be null");
    return type;
}

///////////////////////////////////////////////////////////////////////////

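// A note on AddressInfo (summarizing the code above): with LLVM's opaque
// pointers, a pointer value no longer carries its pointee type, so loads and
// stores need the element type supplied separately. AddressInfo bundles the
// pointer with that element type (and, optionally, the ISPC type it was
// derived from) so the emission routines below can pass both around together.
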
FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym, llvm::Function *lf, SourcePos firstStmtPos) {
    function = func;
    llvmFunction = lf;
    switchConditionWasUniform = false;

    /* Create a new basic block to store all of the allocas */
    allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", llvmFunction, 0);
    bblock = llvm::BasicBlock::Create(*g->ctx, "entry", llvmFunction, 0);
    /* But jump from it immediately into the real entry block */
    llvm::BranchInst::Create(bblock, allocaBlock);

    funcStartPos = funSym->pos;

    internalMaskAddressInfo = AllocaInst(LLVMTypes::MaskType, "internal_mask_memory");
    StoreInst(LLVMMaskAllOn, internalMaskAddressInfo);

    // If the function doesn't have a __mask parameter, there is no need to
    // maintain a function mask.
    if (((func->GetType()->isExported || func->GetType()->IsISPCExternal()) &&
         (lf->getFunctionType()->getNumParams() == func->GetType()->GetNumParameters())) ||
        (func->GetType()->isUnmasked) || func->GetType()->isTask) {
        functionMaskValue = nullptr;
        fullMaskAddressInfo = nullptr;
    } else {
        functionMaskValue = LLVMMaskAllOn;
        fullMaskAddressInfo = AllocaInst(LLVMTypes::MaskType, "full_mask_memory");
        StoreInst(LLVMMaskAllOn, fullMaskAddressInfo);
    }

    blockEntryMask = nullptr;
    breakLanesAddressInfo = continueLanesAddressInfo = nullptr;
    breakTarget = continueTarget = nullptr;

    switchExpr = nullptr;
    caseBlocks = nullptr;
    defaultBlock = nullptr;
    nextBlocks = nullptr;

    returnedLanesAddressInfo = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory");
    StoreInst(LLVMMaskAllOff, returnedLanesAddressInfo);

    launchedTasks = false;
    launchGroupHandleAddressInfo = AllocaInst(LLVMTypes::VoidPointerType, "launch_group_handle");
    StoreInst(llvm::Constant::getNullValue(LLVMTypes::VoidPointerType), launchGroupHandleAddressInfo);

    disableGSWarningCount = 0;

    const Type *returnType = function->GetReturnType();
    if (!returnType || returnType->IsVoidType())
        returnValueAddressInfo = nullptr;
    else {
        returnValueAddressInfo = AllocaInst(returnType, "return_value_memory");
    }

#ifdef ISPC_XE_ENABLED
    if (emitXeHardwareMask()) {
        /* Create return point for Xe */
        returnPoint = llvm::BasicBlock::Create(*g->ctx, "return_point", llvmFunction, 0);
        /* Load return value and return it */
        if (returnValueAddressInfo != nullptr) {
            // We have value(s) to return; load them from their storage
            // location.
            // Note that LoadInst() needs to be used instead of direct llvm
            // instruction generation to handle bool values correctly (they
            // need an extra conversion, as their memory representation is
            // i8, while in SSA form they are i1).
            auto bb = GetCurrentBasicBlock();
            SetCurrentBasicBlock(returnPoint);
            llvm::Value *retVal = LoadInst(returnValueAddressInfo, returnType, "return_value");
            SetCurrentBasicBlock(bb);
            llvm::ReturnInst::Create(*g->ctx, retVal, returnPoint);
        } else {
            llvm::ReturnInst::Create(*g->ctx, returnPoint);
        }
    }
#endif

    if (g->opt.disableMaskAllOnOptimizations) {
        // This is really disgusting. We want to fool the compiler so that
        // it can't reason that the mask is all on, but we don't want to
        // pay too much of a price at the start of each function to do so.
        //
        // Therefore: first, we declare a module-static __all_on_mask
        // variable that will hold an "all on" mask value. At the start of
        // each function, we'll load its value and call SetInternalMaskAnd
        // with the result to set the current internal execution mask.
        // (This is a no-op at runtime.)
        //
        // Then, to fool the optimizer into thinking that the value of
        // __all_on_mask can't be guaranteed to be "all on", we emit a
        // dummy function that sets __all_on_mask to be "all off". (That
        // function is never actually called.)
        llvm::Value *globalAllOnMaskPtr = m->module->getNamedGlobal("__all_on_mask");
        if (globalAllOnMaskPtr == nullptr) {
            globalAllOnMaskPtr =
                new llvm::GlobalVariable(*m->module, LLVMTypes::MaskType, false, llvm::GlobalValue::InternalLinkage,
                                         LLVMMaskAllOn, "__all_on_mask");

            char buf[256];
            snprintf(buf, sizeof(buf), "__off_all_on_mask_%s", g->target->GetISAString());

            llvm::FunctionCallee offFuncCallee = m->module->getOrInsertFunction(buf, LLVMTypes::VoidType);
            llvm::Constant *offFunc = llvm::cast<llvm::Constant>(offFuncCallee.getCallee());
            AssertPos(currentPos, llvm::isa<llvm::Function>(offFunc));
            llvm::BasicBlock *offBB = llvm::BasicBlock::Create(*g->ctx, "entry", (llvm::Function *)offFunc, 0);
            llvm::StoreInst *inst = new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB);
            if (g->opt.forceAlignedMemory) {
                inst->setAlignment(llvm::MaybeAlign(g->target->getNativeVectorAlignment()).valueOrOne());
            }
            llvm::ReturnInst::Create(*g->ctx, offBB);
        }

        llvm::Value *allOnMask =
            LoadInst(new AddressInfo(globalAllOnMaskPtr, LLVMTypes::MaskType), nullptr, "all_on_mask");
        SetInternalMaskAnd(LLVMMaskAllOn, allOnMask);
    }

    // If a reset of the FTZ/DAZ flags is requested and we're inside an
    // external function, allocate memory for keeping the old FTZ/DAZ value.
    if ((g->opt.resetFTZ_DAZ) &&
        (func->GetType()->isExported || func->GetType()->isExternC || func->GetType()->isExternSYCL ||
         func->GetType()->IsISPCKernel()) &&
        // The condition below checks that the function doesn't have an additional `__mask` parameter.
        // If it has `__mask`, it's an internal version of an export function and we don't need to set
        // FTZ/DAZ flags there.
        (lf->getFunctionType()->getNumParams() == func->GetType()->GetNumParameters())) {
        // On ARM the size of the register with the FTZ/DAZ flags is platform
        // dependent; on other platforms it's always i32.
        functionFTZ_DAZValue = AllocaInst(
            g->target->getArch() == Arch::aarch64 ? LLVMTypes::Int64Type : LLVMTypes::Int32Type, "func_entry_ftz");
    } else {
        functionFTZ_DAZValue = nullptr;
    }

    if (m->diBuilder) {
        currentPos = funSym->pos;

        /* If debugging is enabled, tell the debug information emission
           code about this new function */
        diFile = funcStartPos.GetDIFile();
        diSpace = funcStartPos.GetDINamespace();
        llvm::DIScope *scope = m->diCompileUnit;
        llvm::DIType *diSubprogramType = nullptr;

        const FunctionType *functionType = function->GetType();
        if (functionType == nullptr)
            AssertPos(currentPos, m->errorCount > 0);
        else {
            diSubprogramType = functionType->GetDIType(scope);
        }

        Assert(llvm::isa<llvm::DISubroutineType>(diSubprogramType));
        llvm::DISubroutineType *diSubprogramType_n = llvm::cast<llvm::DISubroutineType>(diSubprogramType);
        llvm::DINode::DIFlags flags = llvm::DINode::FlagPrototyped;

        std::string mangledName = std::string(llvmFunction->getName());
        if (mangledName == funSym->name)
            mangledName = "";

        bool isStatic = (funSym->storageClass == SC_STATIC);
        bool isOptimized = (g->opt.level > 0);
        int firstLine = funcStartPos.first_line;

        /* isDefinition is always set to 'true' */
        llvm::DISubprogram::DISPFlags SPFlags = llvm::DISubprogram::SPFlagDefinition;
        if (isOptimized)
            SPFlags |= llvm::DISubprogram::SPFlagOptimized;
        if (isStatic)
            SPFlags |= llvm::DISubprogram::SPFlagLocalToUnit;

        diSubprogram = m->diBuilder->createFunction(diSpace /* scope */, funSym->name, mangledName, diFile, firstLine,
                                                    diSubprogramType_n, firstLine, flags, SPFlags);
        llvmFunction->setSubprogram(diSubprogram);

        /* And start a scope representing the initial function scope */
        StartScope();
    } else {
        diSubprogram = nullptr;
        diFile = nullptr;
        diSpace = nullptr;
    }
}

FunctionEmitContext::~FunctionEmitContext() {
    AssertPos(currentPos, controlFlowInfo.size() == 0);
    AssertPos(currentPos, debugScopes.size() == (m->diBuilder ? 1 : 0));
}

const Function *FunctionEmitContext::GetFunction() const { return function; }

llvm::BasicBlock *FunctionEmitContext::GetCurrentBasicBlock() { return bblock; }

void FunctionEmitContext::SetCurrentBasicBlock(llvm::BasicBlock *bb) { bblock = bb; }

llvm::Value *FunctionEmitContext::GetFunctionMask() { return fullMaskAddressInfo ? functionMaskValue : LLVMMaskAllOn; }

llvm::Value *FunctionEmitContext::GetInternalMask() { return LoadInst(internalMaskAddressInfo, nullptr, "load_mask"); }

llvm::Value *FunctionEmitContext::GetFullMask() {
    return fullMaskAddressInfo ? BinaryOperator(llvm::Instruction::And, GetInternalMask(), functionMaskValue,
                                                WrapSemantics::None, "internal_mask&function_mask")
                               : GetInternalMask();
}

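// Worked example (illustrative): on a 4-wide target, if the mask at function
// entry is <1,0,1,1> (the function mask) and a varying 'if' has since turned
// off lane 1 (internal mask <1,1,0,1>), GetFullMask() returns their AND,
// <1,0,0,1> -- the set of lanes that should actually execute side effects.
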
AddressInfo *FunctionEmitContext::GetFullMaskAddressInfo() {
    return fullMaskAddressInfo ? fullMaskAddressInfo : internalMaskAddressInfo;
}

void FunctionEmitContext::SetFunctionMask(llvm::Value *value) {
    if (fullMaskAddressInfo != nullptr) {
        functionMaskValue = value;
        if (bblock != nullptr)
            StoreInst(GetFullMask(), fullMaskAddressInfo);
    }
}

void FunctionEmitContext::SetBlockEntryMask(llvm::Value *value) { blockEntryMask = value; }

void FunctionEmitContext::SetInternalMask(llvm::Value *value) {
    StoreInst(value, internalMaskAddressInfo);
    // kludge so that __mask returns the right value in ispc code.
    if (fullMaskAddressInfo)
        StoreInst(GetFullMask(), fullMaskAddressInfo);
}

void FunctionEmitContext::SetInternalMaskAnd(llvm::Value *oldMask, llvm::Value *test) {
    llvm::Value *mask = BinaryOperator(llvm::Instruction::And, oldMask, test, WrapSemantics::None, "oldMask&test");
    SetInternalMask(mask);
}

void FunctionEmitContext::SetInternalMaskAndNot(llvm::Value *oldMask, llvm::Value *test) {
    llvm::Value *notTest = BinaryOperator(llvm::Instruction::Xor, test, LLVMMaskAllOn, WrapSemantics::None, "~test");
    llvm::Value *mask = BinaryOperator(llvm::Instruction::And, oldMask, notTest, WrapSemantics::None, "oldMask&~test");
    SetInternalMask(mask);
}

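// Usage sketch (illustrative, not code from this file): a varying 'if'
// typically sets up its two branches with complementary masks,
//
//     ctx->SetInternalMaskAnd(oldMask, test);    // 'then' clause: lanes where test is true
//     // ...emit 'then' clause...
//     ctx->SetInternalMaskAndNot(oldMask, test); // 'else' clause: lanes where test is false
//     // ...emit 'else' clause...
//
// so every lane of oldMask executes exactly one of the two clauses.
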
llvm::Instruction *FunctionEmitContext::BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) {
    AssertPos(currentPos, bblock != nullptr);
    llvm::Value *any = Any(GetFullMask());
    llvm::Instruction *bInst = BranchInst(btrue, bfalse, any);
    // It's illegal to add any additional instructions to the basic block
    // now that it's terminated, so set bblock to nullptr to be safe
    bblock = nullptr;
    return bInst;
}

void FunctionEmitContext::BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) {
    AssertPos(currentPos, bblock != nullptr);
    llvm::Value *all = All(GetFullMask());
    BranchInst(btrue, bfalse, all);
    // It's illegal to add any additional instructions to the basic block
    // now that it's terminated, so set bblock to nullptr to be safe
    bblock = nullptr;
}

void FunctionEmitContext::StartUniformIf(bool emulateUniform) {
    controlFlowInfo.push_back(CFInfo::GetIf(true, emulateUniform, GetInternalMask()));
}

void FunctionEmitContext::StartVaryingIf(llvm::Value *oldMask) {
    controlFlowInfo.push_back(CFInfo::GetIf(false, false, oldMask));
}

void FunctionEmitContext::EndIf() {
    std::unique_ptr<CFInfo> ci(popCFState());
    // Make sure we match up with a Start{Uniform,Varying}If().
    AssertPos(currentPos, ci->IsIf());

    // 'uniform' ifs don't change the mask, so we only need to restore the
    // mask going into the if for 'varying' if statements
    if (ci->IsUniform() || bblock == nullptr)
        return;

    // We can't just restore the mask as it was going into the 'if'
    // statement. First we have to take into account any program
    // instances that have executed 'return' statements; the restored
    // mask must be off for those lanes.
    restoreMaskGivenReturns(ci->savedMask);

    // If the 'if' statement is inside a loop with a 'varying'
    // condition, we also need to account for any break or continue
    // statements that executed inside the 'if' statement; we must
    // leave the lane masks for the program instances that ran those
    // off after we restore the mask after the 'if'. The code below
    // ends up being optimized out in the case that there were no break
    // or continue statements (and breakLanesAddressInfo and continueLanesAddressInfo
    // have their initial 'all off' values), so we don't need to check
    // for that here.
    //
    // There are three general cases to deal with here:
    // - Loops: both break and continue are allowed, and thus the corresponding
    //   lane mask pointers are non-nullptr
    // - Foreach: only continueLanesAddressInfo may be non-nullptr
    // - Switch: only breakLanesAddressInfo may be non-nullptr
    if (continueLanesAddressInfo != nullptr || breakLanesAddressInfo != nullptr) {
        // We want to compute:
        //     newMask = (oldMask & ~(breakLanes | continueLanes)),
        // treating breakLanes or continueLanes as "all off" if the
        // corresponding pointer is nullptr.
        llvm::Value *bcLanes = nullptr;

        if (continueLanesAddressInfo != nullptr)
            bcLanes = LoadInst(continueLanesAddressInfo, nullptr, "continue_lanes");
        else
            bcLanes = LLVMMaskAllOff;

        if (breakLanesAddressInfo != nullptr) {
            llvm::Value *breakLanes = LoadInst(breakLanesAddressInfo, nullptr, "break_lanes");
            bcLanes = BinaryOperator(llvm::Instruction::Or, bcLanes, breakLanes, WrapSemantics::None, "|break_lanes");
        }

        llvm::Value *notBreakOrContinue = BinaryOperator(llvm::Instruction::Xor, bcLanes, LLVMMaskAllOn,
                                                         WrapSemantics::None, "!(break|continue)_lanes");
        llvm::Value *oldMask = GetInternalMask();
        llvm::Value *newMask =
            BinaryOperator(llvm::Instruction::And, oldMask, notBreakOrContinue, WrapSemantics::None, "new_mask");
        SetInternalMask(newMask);
    }
}

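// Worked example (illustrative): suppose a 4-wide varying 'if' was entered
// with savedMask = <1,1,1,1>, lane 0 executed 'return' and lane 3 executed
// 'break' inside the body. At EndIf(), restoreMaskGivenReturns() yields
// <0,1,1,1>, and the break/continue fixup above then computes
//     newMask = <0,1,1,1> & ~<0,0,0,1> = <0,1,1,0>,
// so only lanes 1 and 2 keep executing after the 'if'.
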
void FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct, bool uniformCF,
                                    bool isEmulatedUniform) {
    // Store the current values of various loop-related state so that we
    // can restore it when we exit this loop.
    llvm::Value *oldMask = GetInternalMask();
    controlFlowInfo.push_back(CFInfo::GetLoop(uniformCF, isEmulatedUniform, breakTarget, continueTarget,
                                              breakLanesAddressInfo, continueLanesAddressInfo, oldMask,
                                              blockEntryMask));
    if (uniformCF)
        // If the loop has a uniform condition, we don't need to track
        // which lanes 'break' or 'continue'; all of the running ones go
        // together, so we just jump
        breakLanesAddressInfo = continueLanesAddressInfo = nullptr;
    else {
        // For loops with varying conditions, allocate space to store masks
        // that record which lanes have done these
        continueLanesAddressInfo = AllocaInst(LLVMTypes::MaskType, "continue_lanes_memory");
        StoreInst(LLVMMaskAllOff, continueLanesAddressInfo);
        breakLanesAddressInfo = AllocaInst(LLVMTypes::MaskType, "break_lanes_memory");
        StoreInst(LLVMMaskAllOff, breakLanesAddressInfo);
    }

    breakTarget = bt;
    continueTarget = ct;
    blockEntryMask = nullptr; // this better be set by the loop!
}

void FunctionEmitContext::EndLoop() {
    std::unique_ptr<CFInfo> ci(popCFState());
    AssertPos(currentPos, ci->IsLoop());

    if (!ci->IsUniform())
        // If the loop had a 'uniform' test, then it didn't make any
        // changes to the mask so there's nothing to restore. If it had a
        // varying test, we need to restore the mask to what it was going
        // into the loop, but still leaving off any lanes that executed a
        // 'return' statement.
        restoreMaskGivenReturns(ci->savedMask);
}

void FunctionEmitContext::StartForeach(ForeachType ft, bool isEmulatedUniform) {
    // Issue an error if we're in a nested foreach...
    if (ft == FOREACH_REGULAR) {
        for (int i = 0; i < (int)controlFlowInfo.size(); ++i) {
            if (controlFlowInfo[i]->type == CFInfo::ForeachRegular) {
                Error(currentPos, "Nested \"foreach\" statements are currently "
                                  "illegal.");
                break;
                // Don't return here, however, and in turn allow the caller to
                // do the rest of its codegen and then call EndForeach()
                // normally--the idea being that this gives a chance to find
                // any other errors inside the body of the foreach loop...
            }
        }
    }

    // Store the current values of various loop-related state so that we
    // can restore it when we exit this loop.
    llvm::Value *oldMask = GetInternalMask();
    controlFlowInfo.push_back(CFInfo::GetForeach(isEmulatedUniform, ft, breakTarget, continueTarget,
                                                 breakLanesAddressInfo, continueLanesAddressInfo, oldMask,
                                                 blockEntryMask));
    breakLanesAddressInfo = nullptr;
    breakTarget = nullptr;

    continueLanesAddressInfo = nullptr;
    if (!isEmulatedUniform) {
        continueLanesAddressInfo = AllocaInst(LLVMTypes::MaskType, "foreach_continue_lanes");
        StoreInst(LLVMMaskAllOff, continueLanesAddressInfo);
    }

    continueTarget = nullptr; // should be set by SetContinueTarget()

    blockEntryMask = nullptr;
}

void FunctionEmitContext::EndForeach() {
    std::unique_ptr<CFInfo> ci(popCFState());
    AssertPos(currentPos, ci->IsForeach());
}

void FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) {
    if (!bblock)
        return;

    // Restore the mask to the given old mask, but leave off any lanes that
    // executed a return statement.
    // newMask = (oldMask & ~returnedLanes)
    llvm::Value *returnedLanes = LoadInst(returnedLanesAddressInfo, nullptr, "returned_lanes");
    llvm::Value *notReturned =
        BinaryOperator(llvm::Instruction::Xor, returnedLanes, LLVMMaskAllOn, WrapSemantics::None, "~returned_lanes");
    llvm::Value *newMask =
        BinaryOperator(llvm::Instruction::And, oldMask, notReturned, WrapSemantics::None, "new_mask");
    SetInternalMask(newMask);
}

/** Returns "true" if the first enclosing non-if control flow expression is
|
|
a "switch" statement.
|
|
*/
|
|
bool FunctionEmitContext::inSwitchStatement() const {
|
|
// Go backwards through controlFlowInfo, since we add new nested scopes
|
|
// to the back.
|
|
int i = controlFlowInfo.size() - 1;
|
|
while (i >= 0 && controlFlowInfo[i]->IsIf())
|
|
--i;
|
|
// Got to the first non-if (or end of CF info)
|
|
if (i == -1)
|
|
return false;
|
|
return controlFlowInfo[i]->IsSwitch();
|
|
}
|
|
|
|
void FunctionEmitContext::Break(bool doCoherenceCheck) {
    if (breakTarget == nullptr) {
        Error(currentPos, "\"break\" statement is illegal outside of "
                          "for/while/do loops and \"switch\" statements.");
        return;
    }
    AssertPos(currentPos, controlFlowInfo.size() > 0);

    if (bblock == nullptr)
        return;

    if (inSwitchStatement() == true && switchConditionWasUniform == true && ifsInCFAllUniform(CFInfo::Switch)) {
        // We know that all program instances are executing the break, so
        // just jump to the block immediately after the switch.
        AssertPos(currentPos, breakTarget != nullptr);
        BranchInst(breakTarget);
        bblock = nullptr;
        return;
    }

    // If all of the enclosing 'if' tests in the loop have uniform control
    // flow or if we can tell that the mask is all on, then we can just
    // jump to the break location.
    if (inSwitchStatement() == false && ifsInCFAllUniform(CFInfo::Loop)) {
        BranchInst(breakTarget);
        // Set bblock to nullptr since the jump has terminated the basic block
        bblock = nullptr;
    } else {
        // Varying switch, uniform switch where the 'break' is under
        // varying control flow, or a loop with varying 'if's above the
        // break. In these cases, we need to update the mask of the lanes
        // that have executed a 'break' statement:
        // breakLanes = breakLanes | mask
        AssertPos(currentPos, breakLanesAddressInfo != nullptr);

        llvm::Value *mask = GetInternalMask();
        llvm::Value *breakMask = LoadInst(breakLanesAddressInfo, nullptr, "break_mask");
        llvm::Value *newMask =
            BinaryOperator(llvm::Instruction::Or, mask, breakMask, WrapSemantics::None, "mask|break_mask");
        StoreInst(newMask, breakLanesAddressInfo);

        // Set the current mask to be all off, just in case there are any
        // statements in the same scope after the 'break'. Most of the time
        // this will be optimized away since we'll likely end the scope of
        // an 'if' statement and restore the mask then.
        SetInternalMask(LLVMMaskAllOff);

        if (doCoherenceCheck) {
            if (continueTarget != nullptr)
                // If the user has indicated that this is a 'coherent'
                // break statement, then check to see if the mask is all
                // off. If so, we have to conservatively jump to the
                // continueTarget, not the breakTarget, since part of the
                // reason the mask is all off may be due to 'continue'
                // statements that executed in the current loop iteration.
                jumpIfAllLoopLanesAreDone(continueTarget);
            else if (breakTarget != nullptr)
                // Similarly handle these for switch statements, where we
                // only have a break target.
                jumpIfAllLoopLanesAreDone(breakTarget);
        }
    }
}

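// Worked example (illustrative): in a 4-wide varying loop where lanes
// <0,1,0,0> execute this 'break', the accumulated break mask goes from
// <0,0,0,0> to <0,1,0,0>, and the internal mask is cleared so that any code
// remaining in the current scope has no effect; the broken lane is switched
// back on only once the loop exits and EndLoop() restores the saved mask.
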
static bool lEnclosingLoopIsForeachActive(const std::vector<CFInfo *> &controlFlowInfo) {
    for (int i = (int)controlFlowInfo.size() - 1; i >= 0; --i) {
        if (controlFlowInfo[i]->type == CFInfo::ForeachActive)
            return true;
    }
    return false;
}

void FunctionEmitContext::Continue(bool doCoherenceCheck) {
    if (!continueTarget) {
        Error(currentPos, "\"continue\" statement illegal outside of "
                          "for/while/do/foreach loops.");
        return;
    }
    AssertPos(currentPos, controlFlowInfo.size() > 0);

    if (ifsInCFAllUniform(CFInfo::Loop) || lEnclosingLoopIsForeachActive(controlFlowInfo)) {
        // Similarly to 'break' statements, we can immediately jump to the
        // continue target if we're only in 'uniform' control flow within
        // the loop or if we can tell that the mask is all on. Here, we can
        // also jump if the enclosing loop is a 'foreach_active' loop, in
        // which case we know that only a single program instance is
        // executing.
        AddInstrumentationPoint("continue: uniform CF, jumped");
        BranchInst(continueTarget);
        bblock = nullptr;
    } else {
        // Otherwise update the stored value of which lanes have 'continue'd.
        // continueLanes = continueLanes | mask
        AssertPos(currentPos, continueLanesAddressInfo);
        llvm::Value *mask = GetInternalMask();
        llvm::Value *continueMask = LoadInst(continueLanesAddressInfo, nullptr, "continue_mask");
        llvm::Value *newMask =
            BinaryOperator(llvm::Instruction::Or, mask, continueMask, WrapSemantics::None, "mask|continueMask");
        StoreInst(newMask, continueLanesAddressInfo);

        // And set the current mask to be all off in case there are any
        // statements in the same scope after the 'continue'
        SetInternalMask(LLVMMaskAllOff);

        if (doCoherenceCheck)
            // If this is a 'coherent continue' statement, then emit the
            // code to see if all of the lanes are now off due to
            // breaks/continues and jump to the continue target if so.
            jumpIfAllLoopLanesAreDone(continueTarget);
    }
}

/** This function checks to see if all of the 'if' statements (if any)
    between the current scope and the first enclosing loop/switch of given
    control flow type have 'uniform' tests.
*/
bool FunctionEmitContext::ifsInCFAllUniform(int type) const {
    AssertPos(currentPos, controlFlowInfo.size() > 0);
    // Go backwards through controlFlowInfo, since we add new nested scopes
    // to the back. Stop once we come to the first enclosing control flow
    // structure of the desired type.
    int i = controlFlowInfo.size() - 1;
    while (i >= 0 && controlFlowInfo[i]->type != type) {
        if (controlFlowInfo[i]->isUniform == false)
            // Found a scope due to an 'if' statement with a varying test
            return false;
        --i;
    }
    return true;
}

void FunctionEmitContext::jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target) {
    llvm::Value *allDone = nullptr;

    if (breakLanesAddressInfo == nullptr) {
        llvm::Value *continued = LoadInst(continueLanesAddressInfo, nullptr, "continue_lanes");
        continued =
            BinaryOperator(llvm::Instruction::And, continued, GetFunctionMask(), WrapSemantics::None, "continued&func");
        allDone = MasksAllEqual(continued, blockEntryMask);
    } else {
        // Check to see if (returned lanes | continued lanes | break lanes) is
        // equal to the value of mask at the start of the loop iteration. If
        // so, everyone is done and we can jump to the given target
        llvm::Value *returned = LoadInst(returnedLanesAddressInfo, nullptr, "returned_lanes");
        llvm::Value *breaked = LoadInst(breakLanesAddressInfo, nullptr, "break_lanes");
        llvm::Value *finishedLanes =
            BinaryOperator(llvm::Instruction::Or, returned, breaked, WrapSemantics::None, "returned|breaked");
        if (continueLanesAddressInfo != nullptr) {
            // It's nullptr for "switch" statements...
            llvm::Value *continued = LoadInst(continueLanesAddressInfo, nullptr, "continue_lanes");
            finishedLanes = BinaryOperator(llvm::Instruction::Or, finishedLanes, continued, WrapSemantics::None,
                                           "returned|breaked|continued");
        }

        finishedLanes = BinaryOperator(llvm::Instruction::And, finishedLanes, GetFunctionMask(), WrapSemantics::None,
                                       "finished&func");

        // Do we match the mask at loop or switch statement entry?
        allDone = MasksAllEqual(finishedLanes, blockEntryMask);
    }

    llvm::BasicBlock *bAll = CreateBasicBlock("all_continued_or_breaked");
    llvm::BasicBlock *bNotAll = CreateBasicBlock("not_all_continued_or_breaked");

    BranchInst(bAll, bNotAll, allDone);

    // If so, have an extra basic block along the way to add
    // instrumentation, if the user asked for it.
    bblock = bAll;
    AddInstrumentationPoint("break/continue: all dynamically went");
    BranchInst(target);

    // And set the current basic block to a new one for future instructions
    // for the path where we weren't able to jump
    bblock = bNotAll;
    AddInstrumentationPoint("break/continue: not all went");
}

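// In short (summarizing the code above), the "everyone is done" test is
//     allDone = ((returnedLanes | breakLanes | continueLanes) & functionMask)
//                   == blockEntryMask
// i.e. every lane that was live at the top of the iteration has since
// returned, broken, or continued, so it is safe to jump straight to 'target'.
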
void FunctionEmitContext::RestoreContinuedLanes() {
    if (continueLanesAddressInfo == nullptr)
        return;

    // mask = mask | continueFlags
    llvm::Value *mask = GetInternalMask();
    llvm::Value *continueMask = LoadInst(continueLanesAddressInfo, nullptr, "continue_mask");
    llvm::Value *orMask =
        BinaryOperator(llvm::Instruction::Or, mask, continueMask, WrapSemantics::None, "mask|continue_mask");
    SetInternalMask(orMask);

    // continueLanes = 0
    StoreInst(LLVMMaskAllOff, continueLanesAddressInfo);
}

void FunctionEmitContext::ClearBreakLanes() {
    if (breakLanesAddressInfo == nullptr)
        return;

    // breakLanes = 0
    StoreInst(LLVMMaskAllOff, breakLanesAddressInfo);
}

void FunctionEmitContext::StartSwitch(bool cfIsUniform, llvm::BasicBlock *bbBreak, bool isEmulatedUniform) {
    llvm::Value *oldMask = GetInternalMask();
    controlFlowInfo.push_back(CFInfo::GetSwitch(cfIsUniform, isEmulatedUniform, breakTarget, continueTarget,
                                                breakLanesAddressInfo, continueLanesAddressInfo, oldMask,
                                                blockEntryMask, switchExpr, switchFallThroughMaskAddressInfo,
                                                defaultBlock, caseBlocks, nextBlocks, switchConditionWasUniform));

    breakLanesAddressInfo = AllocaInst(LLVMTypes::MaskType, "break_lanes_memory");
    StoreInst(LLVMMaskAllOff, breakLanesAddressInfo);
    breakTarget = bbBreak;

    continueLanesAddressInfo = nullptr;
    continueTarget = nullptr;
    blockEntryMask = nullptr;

    // These will be set by the SwitchInst() method
    switchExpr = nullptr;
    switchFallThroughMaskAddressInfo = nullptr;
    defaultBlock = nullptr;
    caseBlocks = nullptr;
    nextBlocks = nullptr;
}

void FunctionEmitContext::EndSwitch() {
    AssertPos(currentPos, bblock != nullptr);

    std::unique_ptr<CFInfo> ci(popCFState());
    if (ci->IsVarying() && bblock != nullptr)
        restoreMaskGivenReturns(ci->savedMask);
}

/** Emit code to check for an "all off" mask before the code for a
    case or default label in a "switch" statement.
*/
void FunctionEmitContext::addSwitchMaskCheck(llvm::Value *mask) {
    llvm::Value *allOff = None(mask);
    llvm::BasicBlock *bbSome = CreateBasicBlock("case_default_on");

    // Find the basic block for the case or default label immediately after
    // the current one in the switch statement--that's where we want to
    // jump if the mask is all off at this label.
    AssertPos(currentPos, nextBlocks->find(bblock) != nextBlocks->end());
    llvm::BasicBlock *bbNext = nextBlocks->find(bblock)->second;

    // Jump to the next label if the mask is all off; otherwise jump to the
    // newly created block that will hold the actual code for this label.
    BranchInst(bbNext, bbSome, allOff);
    SetCurrentBasicBlock(bbSome);
}

/** Returns the execution mask at entry to the first enclosing "switch"
    statement. */
llvm::Value *FunctionEmitContext::getMaskAtSwitchEntry() {
    AssertPos(currentPos, controlFlowInfo.size() > 0);
    int i = controlFlowInfo.size() - 1;
    while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Switch)
        --i;
    AssertPos(currentPos, i != -1);
    return controlFlowInfo[i]->savedMask;
}

void FunctionEmitContext::EmitDefaultLabel(bool checkMask, SourcePos pos) {
    if (inSwitchStatement() == false) {
        Error(pos, "\"default\" label illegal outside of \"switch\" "
                   "statement.");
        return;
    }

    // If there's a default label in the switch, a basic block for it
    // should have been provided in the previous call to SwitchInst().
    AssertPos(currentPos, defaultBlock != nullptr);

#ifdef ISPC_XE_ENABLED
    llvm::BasicBlock *bbDefaultImpl = nullptr;
    if (emitXeHardwareMask()) {
        // Create basic block with actual default implementation
        bbDefaultImpl = CreateBasicBlock("default_impl", defaultBlock);
    }
#endif

    if (bblock != nullptr) {
        // The previous case in the switch fell through, or we're in a
        // varying switch; terminate the current block with a jump to the
        // block for the code for the default label.
#ifdef ISPC_XE_ENABLED
        if (emitXeHardwareMask() && !inXeSimdCF()) {
            // Skip check, branch directly to implementation
            BranchInst(bbDefaultImpl);
        } else
#endif
            BranchInst(defaultBlock);
    }
    SetCurrentBasicBlock(defaultBlock);

#ifdef ISPC_XE_ENABLED
    if (switchConditionWasUniform && emitXeHardwareMask()) {
        // Find next basic block after default
        auto iter = nextBlocks->find(defaultBlock);
        AssertPos(currentPos, iter != nextBlocks->end());
        llvm::BasicBlock *bbNext = iter->second;

        llvm::Value *testVal = llvm::isa<llvm::VectorType>(switchExpr->getType()) ? LLVMMaskAllOn : LLVMTrue;

        // We check only the cases after default: EM is turned off for the
        // previous ones (or not, in the case of fall through).
        // Find the case value for the next case.
        auto caseBlocksIt = caseBlocks->begin();
        for (auto e = caseBlocks->end(); (caseBlocksIt != e) && (caseBlocksIt->second != bbNext); ++caseBlocksIt)
            ;

        for (auto e = caseBlocks->end(); caseBlocksIt != e; ++caseBlocksIt) {
            int value = caseBlocksIt->first;
            llvm::Value *val = nullptr;
            if (llvm::isa<llvm::VectorType>(switchExpr->getType())) {
                val = (switchExpr->getType() == LLVMTypes::Int32VectorType) ? LLVMInt32Vector(value)
                                                                            : LLVMInt64Vector(value);
            } else {
                val = (switchExpr->getType() == LLVMTypes::Int32Type) ? LLVMInt32(value) : LLVMInt64(value);
            }

            // The way to get cmp is the same as under the TODO comment below.
            // However, it seems that such constructions are transformed to
            // cmp.ne in vISA anyway.
            llvm::Value *matchesCaseValue =
                CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr, val, "cmp_case_value");
            llvm::Value *notMatchesCaseValue = NotOperator(matchesCaseValue);
            testVal = BinaryOperator(llvm::Instruction::And, testVal, notMatchesCaseValue, WrapSemantics::None,
                                     "default&~case_match");
        }

        // Don't need to check the fall through mask: all lanes that
        // executed on fall through won't fail the case checks from
        // above.

        // Branch to the default/next block. It will set the Xe EM
        // for this block and restore the mask for turned-off lanes after
        // reaching the next block.
        BranchInst(bbDefaultImpl, bbNext, testVal);
        SetCurrentBasicBlock(bbDefaultImpl);
    }
#endif

    if (switchConditionWasUniform)
        // Nothing more to do for this case; return back to the caller,
        // which will then emit the code for the default case.
        return;

    // For a varying switch, we need to update the execution mask.
    //
    // First, compute the mask that corresponds to which program instances
    // should execute the "default" code; this corresponds to the set of
    // program instances that don't match any of the case statements.
    // Therefore, we generate code that compares the value of the switch
    // expression to the value associated with each of the "case"
    // statements such that the surviving lanes didn't match any of them.
    llvm::Value *matchesDefault = getMaskAtSwitchEntry();
    for (int i = 0; i < (int)caseBlocks->size(); ++i) {
        int value = (*caseBlocks)[i].first;
        llvm::Value *valueVec =
            (switchExpr->getType() == LLVMTypes::Int32VectorType) ? LLVMInt32Vector(value) : LLVMInt64Vector(value);
        // TODO: for AVX2 at least, the following generates better code
        // than doing ICMP_NE and skipping the NotOperator() below; file a
        // LLVM bug?
        llvm::Value *matchesCaseValue =
            CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr, valueVec, "cmp_case_value");
        matchesCaseValue = I1VecToBoolVec(matchesCaseValue);

        llvm::Value *notMatchesCaseValue = NotOperator(matchesCaseValue);
        matchesDefault = BinaryOperator(llvm::Instruction::And, matchesDefault, notMatchesCaseValue,
                                        WrapSemantics::None, "default&~case_match");
    }

    // The mask may have some lanes on already, corresponding to the previous
    // label falling through; compute the updated mask by ORing in the lanes
    // that match the default.
    llvm::Value *oldMask = GetInternalMask();
    llvm::Value *newMask =
        BinaryOperator(llvm::Instruction::Or, oldMask, matchesDefault, WrapSemantics::None, "old_mask|matches_default");
    SetInternalMask(newMask);

    if (checkMask)
        addSwitchMaskCheck(newMask);
}

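// Worked example (illustrative): for 'switch (x)' with cases 1 and 2 on a
// 4-wide target where x = <1,2,3,4> and the entry mask is all on, the loop
// above computes
//     matchesDefault = <1,1,1,1> & ~(x==1) & ~(x==2) = <0,0,1,1>,
// so only the lanes whose value matched no case execute the default body
// (plus any lanes that fell through from a previous label).
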
void FunctionEmitContext::EmitCaseLabel(int value, bool checkMask, SourcePos pos) {
    if (inSwitchStatement() == false) {
        Error(pos, "\"case\" label illegal outside of \"switch\" statement.");
        return;
    }

    // Find the basic block for this case statement.
    llvm::BasicBlock *bbCase = nullptr;
    AssertPos(currentPos, caseBlocks != nullptr);
    for (int i = 0; i < (int)caseBlocks->size(); ++i)
        if ((*caseBlocks)[i].first == value) {
            bbCase = (*caseBlocks)[i].second;
            break;
        }
    AssertPos(currentPos, bbCase != nullptr);

#ifdef ISPC_XE_ENABLED
    llvm::BasicBlock *bbCaseImpl = nullptr;
    if (emitXeHardwareMask()) {
        // Create basic block with actual case implementation
        bbCaseImpl = CreateBasicBlock(llvm::Twine(bbCase->getName()) + "_impl", bbCase);
    }
#endif

    if (bblock != nullptr) {
        // fall through from the previous case
#ifdef ISPC_XE_ENABLED
        if (emitXeHardwareMask() && llvm::isa<llvm::VectorType>(switchExpr->getType())) {
            // EM will be restored after this branch.
            // We need to skip the case check for lanes that are
            // turned on at this point.
            StoreInst(XeSimdCFPredicate(LLVMMaskAllOn), switchFallThroughMaskAddressInfo);
        }

        if (emitXeHardwareMask() && !inXeSimdCF()) {
            // Skip check, branch directly to implementation
            BranchInst(bbCaseImpl);
        } else
#endif
            BranchInst(bbCase);
    }
    SetCurrentBasicBlock(bbCase);

#ifdef ISPC_XE_ENABLED
    if (switchConditionWasUniform && emitXeHardwareMask()) {
        // Find the next basic block after this case
        std::map<llvm::BasicBlock *, llvm::BasicBlock *>::const_iterator iter;
        iter = nextBlocks->find(bbCase);
        AssertPos(currentPos, iter != nextBlocks->end());
        llvm::BasicBlock *bbNext = iter->second;

        // Create compare value
        llvm::Value *caseTest = nullptr;
        if (llvm::isa<llvm::VectorType>(switchExpr->getType())) {
            // Take fall through lanes to turn them on in the next block
            llvm::Value *fallThroughMask = LoadInst(switchFallThroughMaskAddressInfo, nullptr, "fall_through_mask");
            llvm::Value *val =
                (switchExpr->getType() == LLVMTypes::Int32VectorType) ? LLVMInt32Vector(value) : LLVMInt64Vector(value);
            llvm::Value *cmpVal =
                CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr, val, "cmp_case_value");
            caseTest = BinaryOperator(llvm::Instruction::Or, cmpVal, fallThroughMask, WrapSemantics::None, "case_test");
        } else {
            llvm::Value *val = (switchExpr->getType() == LLVMTypes::Int32Type) ? LLVMInt32(value) : LLVMInt64(value);
            caseTest = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr, val, "case_test");
        }

        // Branch to the current case/next block. It will set the Xe EM
        // for this block and restore the mask for turned-off lanes after
        // reaching the next block.
        BranchInst(bbCaseImpl, bbNext, caseTest);
        SetCurrentBasicBlock(bbCaseImpl);
    }
#endif

    if (switchConditionWasUniform)
        return;

    // Update the mask: first, get a mask that indicates which program
    // instances have a value for the switch expression that matches this
    // case statement.
    llvm::Value *valueVec =
        (switchExpr->getType() == LLVMTypes::Int32VectorType) ? LLVMInt32Vector(value) : LLVMInt64Vector(value);
    llvm::Value *matchesCaseValue =
        CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, switchExpr, valueVec, "cmp_case_value");
    matchesCaseValue = I1VecToBoolVec(matchesCaseValue);

    // If a lane was off going into the switch, we don't care if it has a
    // value in the switch expression that happens to match this case.
    llvm::Value *entryMask = getMaskAtSwitchEntry();
    matchesCaseValue = BinaryOperator(llvm::Instruction::And, entryMask, matchesCaseValue, WrapSemantics::None,
                                      "entry_mask&case_match");

    // Take the surviving lanes and turn on the mask for them.
    llvm::Value *oldMask = GetInternalMask();
    llvm::Value *newMask =
        BinaryOperator(llvm::Instruction::Or, oldMask, matchesCaseValue, WrapSemantics::None, "mask|case_match");
    SetInternalMask(newMask);

    if (checkMask)
        addSwitchMaskCheck(newMask);
}

void FunctionEmitContext::SwitchInst(llvm::Value *expr, llvm::BasicBlock *bbDefault,
                                     const std::vector<std::pair<int, llvm::BasicBlock *>> &bbCases,
                                     const std::map<llvm::BasicBlock *, llvm::BasicBlock *> &bbNext) {
    // The calling code should have called StartSwitch() before calling
    // SwitchInst().
    AssertPos(currentPos, controlFlowInfo.size() && controlFlowInfo.back()->IsSwitch());

    switchExpr = expr;
    defaultBlock = bbDefault;
    caseBlocks = new std::vector<std::pair<int, llvm::BasicBlock *>>(bbCases);
    nextBlocks = new std::map<llvm::BasicBlock *, llvm::BasicBlock *>(bbNext);
    switchConditionWasUniform =
        (llvm::isa<llvm::VectorType>(expr->getType()) == false) || (controlFlowInfo.back()->IsUniformEmulated());

    // Do not make an LLVM switch for Xe
    if (switchConditionWasUniform == true && !emitXeHardwareMask()) {
        // For a uniform switch condition, just wire things up to the LLVM
        // switch instruction.
        llvm::SwitchInst *s = llvm::SwitchInst::Create(expr, bbDefault, bbCases.size(), bblock);
        for (int i = 0; i < (int)bbCases.size(); ++i) {
            if (expr->getType() == LLVMTypes::Int32Type)
                s->addCase(LLVMInt32(bbCases[i].first), bbCases[i].second);
            else {
                AssertPos(currentPos, expr->getType() == LLVMTypes::Int64Type);
                s->addCase(LLVMInt64(bbCases[i].first), bbCases[i].second);
            }
        }

        AddDebugPos(s);
        // switch is a terminator
        bblock = nullptr;
    } else {
        if (emitXeHardwareMask()) {
            // Init fall through mask
            switchFallThroughMaskAddressInfo = AllocaInst(LLVMTypes::MaskType, "fall_through_mask");
            StoreInst(LLVMMaskAllOff, switchFallThroughMaskAddressInfo);
        } else {
            // For a varying switch, we first turn off all lanes of the mask
            SetInternalMask(LLVMMaskAllOff);
        }

        if (nextBlocks->size() > 0) {
            // If there are any labels inside the switch, jump to the first
            // one; any code before the first label won't be executed by
            // anyone.
            std::map<llvm::BasicBlock *, llvm::BasicBlock *>::const_iterator iter;
            iter = nextBlocks->find(nullptr);
            AssertPos(currentPos, iter != nextBlocks->end());
            llvm::BasicBlock *bbFirst = iter->second;
            BranchInst(bbFirst);
            bblock = nullptr;
        }
    }
}

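// Summary (illustrative): a uniform switch lowers to a single llvm::SwitchInst
// and ordinary branching, while a varying switch starts with an all-off
// internal mask and relies on EmitCaseLabel()/EmitDefaultLabel() to OR lanes
// back on as the emitted comparisons against each case value succeed.
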
int FunctionEmitContext::VaryingCFDepth() const {
    int sum = 0;
    for (unsigned int i = 0; i < controlFlowInfo.size(); ++i)
        if (controlFlowInfo[i]->IsVarying())
            ++sum;
    return sum;
}

bool FunctionEmitContext::InForeachLoop() const {
    for (unsigned int i = 0; i < controlFlowInfo.size(); ++i)
        if (controlFlowInfo[i]->IsForeach())
            return true;
    return false;
}

void FunctionEmitContext::DisableGatherScatterWarnings() { ++disableGSWarningCount; }

void FunctionEmitContext::EnableGatherScatterWarnings() { --disableGSWarningCount; }

bool FunctionEmitContext::initLabelBBlocks(ASTNode *node, void *data) {
    LabeledStmt *ls = llvm::dyn_cast<LabeledStmt>(node);
    if (ls == nullptr)
        return true;

    FunctionEmitContext *ctx = (FunctionEmitContext *)data;

    if (ctx->labelMap.find(ls->name) != ctx->labelMap.end())
        Error(ls->pos, "Multiple labels named \"%s\" in function.", ls->name.c_str());
    else {
        llvm::BasicBlock *bb = ctx->CreateBasicBlock(ls->name);
        ctx->labelMap[ls->name] = bb;
    }
    return true;
}

void FunctionEmitContext::InitializeLabelMap(Stmt *code) {
    labelMap.clear();
    WalkAST(code, initLabelBBlocks, nullptr, this);
}

llvm::BasicBlock *FunctionEmitContext::GetLabeledBasicBlock(const std::string &label) {
    if (labelMap.find(label) != labelMap.end())
        return labelMap[label];
    else
        return nullptr;
}

std::vector<std::string> FunctionEmitContext::GetLabels() {
    // Reserve space up front; constructing the vector with labelMap.size()
    // elements and then push_back'ing would leave empty strings at the front.
    std::vector<std::string> labels;
    labels.reserve(labelMap.size());

    // Iterate through labelMap and grab only the keys
    std::map<std::string, llvm::BasicBlock *>::iterator iter;
    for (iter = labelMap.begin(); iter != labelMap.end(); iter++)
        labels.push_back(iter->first);

    return labels;
}

void FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
    const Type *returnType = function->GetReturnType();
    if (returnType->IsVoidType()) {
        if (expr != nullptr) {
            const Type *exprType = expr->GetType();
            Assert(exprType);
            Error(expr->pos, "Can't return non-void type \"%s\" from void function.", exprType->GetString().c_str());
        }
    } else {
        if (expr == nullptr) {
            Error(funcStartPos, "Must provide return value for return "
                                "statement for non-void function.");
            return;
        }

        expr = TypeConvertExpr(expr, returnType, "return statement");
        if (expr != nullptr) {
            llvm::Value *retVal = expr->GetValue(this);
            if (retVal != nullptr) {
                if (returnType->IsUniformType() || CastType<ReferenceType>(returnType) != nullptr)
                    StoreInst(retVal, returnValueAddressInfo, returnType);
                else {
                    // Use a masked store to store the value of the expression
                    // in the return value memory; this preserves the return
                    // values from other lanes that may have executed return
                    // statements previously.
                    StoreInst(retVal, returnValueAddressInfo->getPointer(), GetInternalMask(), returnType,
                              PointerType::GetUniform(returnType));
                }
            }
        }
    }

    if (emitXeHardwareMask() || (!emitXeHardwareMask() && VaryingCFDepth() == 0)) {
        // Don't need to create mask management instructions for Xe
        // since execution is managed through Xe EM

        // If there is only uniform control flow between us and the
        // function entry, then it's guaranteed that all lanes are running,
        // so we can just emit a true return instruction
        AddInstrumentationPoint("return: uniform control flow");
        ReturnInst();
    } else {
        // Otherwise we update the returnedLanes value by ORing it with
        // the current lane mask.
        llvm::Value *oldReturnedLanes = LoadInst(returnedLanesAddressInfo, nullptr, "old_returned_lanes");
        llvm::Value *newReturnedLanes = BinaryOperator(llvm::Instruction::Or, oldReturnedLanes, GetFullMask(),
                                                       WrapSemantics::None, "old_mask|returned_lanes");

        // For 'coherent' return statements, emit code to check if all
        // lanes have returned
        if (doCoherenceCheck) {
            // if newReturnedLanes == functionMaskValue, get out of here!
            llvm::Value *cmp = MasksAllEqual(GetFunctionMask(), newReturnedLanes);
            llvm::BasicBlock *bDoReturn = CreateBasicBlock("do_return");
            llvm::BasicBlock *bNoReturn = CreateBasicBlock("no_return");
            BranchInst(bDoReturn, bNoReturn, cmp);

            bblock = bDoReturn;
            AddInstrumentationPoint("return: all lanes have returned");
            ReturnInst();

            bblock = bNoReturn;
        }
        // Otherwise update returnedLanesAddressInfo and turn off all of the lanes
        // in the current mask so that any subsequent statements in the
        // same scope after the return have no effect
        StoreInst(newReturnedLanes, returnedLanesAddressInfo);
        AddInstrumentationPoint("return: some but not all lanes have returned");
        SetInternalMask(LLVMMaskAllOff);
    }
}

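// Lifecycle note (summarizing the code above): lanes recorded in
// returnedLanesAddressInfo stay off for the rest of the function --
// restoreMaskGivenReturns() masks them out of every subsequent mask
// restore -- while the per-lane values they wrote into return_value_memory
// via the masked store are preserved until the function finally returns.
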
void FunctionEmitContext::SetFunctionFTZ_DAZFlags() {
    if (functionFTZ_DAZValue == nullptr)
        return;
    std::vector<Symbol *> mm;
    m->symbolTable->LookupFunction(builtin::__set_ftz_daz_flags, &mm);
    AssertPos(currentPos, mm.size() >= 1);
    llvm::Function *fmm = mm[0]->function;
    std::vector<llvm::Value *> args;
    llvm::Value *oldFTZ = CallInst(fmm, nullptr, args, "");
    StoreInst(oldFTZ, functionFTZ_DAZValue);
}

void FunctionEmitContext::RestoreFunctionFTZ_DAZFlags() {
    if (functionFTZ_DAZValue == nullptr)
        return;
    std::vector<Symbol *> mm;
    m->symbolTable->LookupFunction(builtin::__restore_ftz_daz_flags, &mm);
    AssertPos(currentPos, mm.size() >= 1);
    llvm::Function *fmm = mm[0]->function;
    llvm::Value *oldFTZ = LoadInst(functionFTZ_DAZValue);
    std::vector<llvm::Value *> args;
    args.push_back(oldFTZ);
    CallInst(fmm, nullptr, args, "");
}

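/** Added documentation: emits a call to the target's __any reduction over
    the given mask, yielding a scalar value that is true if any lane of the
    mask is on. All() and None() below follow the same
    lookup-builtin-and-call pattern. */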
llvm::Value *FunctionEmitContext::Any(llvm::Value *mask) {
    // Call the target-dependent any function to test that the mask is non-zero

    std::vector<Symbol *> mm;
    m->symbolTable->LookupFunction(builtin::__any, &mm);
    if (g->target->getMaskBitCount() == 1)
        AssertPos(currentPos, mm.size() == 1);
    else
        // There should be one with signed int signature, one unsigned int.
        AssertPos(currentPos, mm.size() == 2);
    // We can actually call either one, since both are i32s as far as
    // LLVM's type system is concerned...
    llvm::Function *fmm = mm[0]->function;
    return CallInst(fmm, nullptr, mask, llvm::Twine(mask->getName()) + "_any");
}

llvm::Value *FunctionEmitContext::All(llvm::Value *mask) {
    // Call the target-dependent all function to test that every lane
    // of the mask is on
    std::vector<Symbol *> mm;
    m->symbolTable->LookupFunction(builtin::__all, &mm);
    if (g->target->getMaskBitCount() == 1)
        AssertPos(currentPos, mm.size() == 1);
    else
        // There should be one with signed int signature, one unsigned int.
        AssertPos(currentPos, mm.size() == 2);
    // We can actually call either one, since both are i32s as far as
    // LLVM's type system is concerned...
    llvm::Function *fmm = mm[0]->function;
    return CallInst(fmm, nullptr, mask, llvm::Twine(mask->getName()) + "_all");
}

llvm::Value *FunctionEmitContext::None(llvm::Value *mask) {
    // Call the target-dependent none function to test that all lanes
    // of the mask are off
    std::vector<Symbol *> mm;
    m->symbolTable->LookupFunction(builtin::__none, &mm);
    if (g->target->getMaskBitCount() == 1)
        AssertPos(currentPos, mm.size() == 1);
    else
        // There should be one with signed int signature, one unsigned int.
        AssertPos(currentPos, mm.size() == 2);
    // We can actually call either one, since both are i32s as far as
    // LLVM's type system is concerned...
    llvm::Function *fmm = mm[0]->function;
    return CallInst(fmm, nullptr, mask, llvm::Twine(mask->getName()) + "_none");
}

llvm::Value *FunctionEmitContext::LaneMask(llvm::Value *v) {
    // Call the target-dependent movmsk function to turn the vector mask
    // into an i64 value
    std::vector<Symbol *> mm;
    m->symbolTable->LookupFunction(builtin::__movmsk, &mm);
    if (g->target->getMaskBitCount() == 1)
        AssertPos(currentPos, mm.size() == 1);
    else
        // There should be one with signed int signature, one unsigned int.
        AssertPos(currentPos, mm.size() == 2);
    // We can actually call either one, since both are i32s as far as
    // LLVM's type system is concerned...
    llvm::Function *fmm = mm[0]->function;
    return CallInst(fmm, nullptr, v, llvm::Twine(v->getName()) + "_movmsk");
}

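/** Added documentation: returns a scalar truth value that is true if the
    two given masks have exactly the same set of lanes on. On wasm targets
    this calls the __wasm_cmp_msk_eq builtin; elsewhere it compares the
    movmsk'ed lane masks for equality. */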
llvm::Value *FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
    if (g->target->getArch() == Arch::wasm32 || g->target->getArch() == Arch::wasm64) {
        llvm::Function *fmm = m->module->getFunction("__wasm_cmp_msk_eq");
        return CallInst(fmm, nullptr, {v1, v2},
                        ((llvm::Twine("wasm_cmp_msk_eq_") + v1->getName()) + "_") + v2->getName());
    }
    llvm::Value *mm1 = LaneMask(v1);
    llvm::Value *mm2 = LaneMask(v2);
    return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
                   ((llvm::Twine("equal_") + v1->getName()) + "_") + v2->getName());
}

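/** Added documentation: returns the constant vector <0, 1, ..., W-1> for
    the target's vector width W, i.e. the value of ispc's programIndex, with
    either 32-bit or 64-bit integer elements as requested. */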
llvm::Value *FunctionEmitContext::ProgramIndexVector(bool is32bits) {
    llvm::SmallVector<llvm::Constant *, 16> array;
    for (int i = 0; i < g->target->getVectorWidth(); ++i) {
        llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
        array.push_back(C);
    }

    llvm::Constant *index = llvm::ConstantVector::get(array);

    return index;
}

llvm::Value *FunctionEmitContext::GetStringPtr(const std::string &str) {
    llvm::Constant *lstr = llvm::ConstantDataArray::getString(*g->ctx, str);
    llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage;
    llvm::Value *lstrPtr =
        new llvm::GlobalVariable(*m->module, lstr->getType(), true /*isConst*/, linkage, lstr, "__str");
    return new llvm::BitCastInst(lstrPtr, LLVMTypes::VoidPointerType, "str_void_ptr", bblock);
}

llvm::BasicBlock *FunctionEmitContext::CreateBasicBlock(const llvm::Twine &name, llvm::BasicBlock *insertAfter) {
    llvm::BasicBlock *newBB = llvm::BasicBlock::Create(*g->ctx, name, llvmFunction);
    if (insertAfter)
        newBB->moveAfter(insertAfter);
    return newBB;
}

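/** Added documentation: converts an i1 vector value (or an array of such
    vectors, the representation of an ispc short vector of varying bool) to
    LLVMTypes::BoolVectorType, the internal mask representation; on targets
    where the two types already match this ends up as a no-op. */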
llvm::Value *FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
    if (b == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::ArrayType *at = llvm::dyn_cast<llvm::ArrayType>(b->getType());
    if (at) {
        // If we're given an array of vectors of i1s, then do the
        // conversion for each of the elements.
        // Although it is a varying short vector stored as an array, use
        // 'LLVMTypes::BoolVectorType' since it can't be exported to C/C++.
        // BoolVectorStorageType is to be used only for entities that can be
        // interfaced with C/C++.
        llvm::Type *boolArrayType = llvm::ArrayType::get(LLVMTypes::BoolVectorType, at->getNumElements());
        llvm::Value *ret = llvm::UndefValue::get(boolArrayType);

        for (unsigned int i = 0; i < at->getNumElements(); ++i) {
            llvm::Value *elt = ExtractInst(b, i);
            llvm::Value *sext =
                SwitchBoolToMaskType(elt, LLVMTypes::BoolVectorType, llvm::Twine(elt->getName()) + "_to_boolvec");
            ret = InsertInst(ret, sext, i);
        }
        return ret;
    } else {
        // For non-array types, convert to 'LLVMTypes::BoolVectorType' if
        // necessary.
        return SwitchBoolToMaskType(b, LLVMTypes::BoolVectorType, llvm::Twine(b->getName()) + "_to_boolvec");
    }
}

static llvm::Value *lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) {
    llvm::Constant *sConstant = llvm::ConstantDataArray::getString(*g->ctx, s, true);
    std::string var_name = "_";
    var_name = var_name + s;
    llvm::GlobalVariable *sPtr =
        new llvm::GlobalVariable(*m->module, sConstant->getType(), true /* const */, llvm::GlobalValue::InternalLinkage,
                                 sConstant, var_name.c_str());
    llvm::Value *indices[2] = {LLVMInt32(0), LLVMInt32(0)};
    llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
    return llvm::GetElementPtrInst::Create(sPtr->getValueType(), sPtr, arrayRef, "sptr", bblock);
}

void FunctionEmitContext::AddInstrumentationPoint(const char *note) {
    AssertPos(currentPos, note != nullptr);
    if (!g->emitInstrumentation)
        return;

    std::vector<llvm::Value *> args;
    // arg 1: filename as string
    args.push_back(lGetStringAsValue(bblock, currentPos.name));
    // arg 2: provided note
    args.push_back(lGetStringAsValue(bblock, note));
    // arg 3: line number
    args.push_back(LLVMInt32(currentPos.first_line));
    // arg 4: current mask, movmsk'ed down to an int64
    args.push_back(LaneMask(GetFullMask()));

    llvm::Function *finst = m->module->getFunction("ISPCInstrument");
    CallInst(finst, nullptr, args, "");
}

void FunctionEmitContext::SetDebugPos(SourcePos pos) { currentPos = pos; }

SourcePos FunctionEmitContext::GetDebugPos() const { return currentPos; }

void FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos, llvm::DIScope *scope) {
    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(value);
    if (inst != nullptr && m->diBuilder) {
        SourcePos p = pos ? *pos : currentPos;
        if (p.first_line != 0) {
            // If first_line == 0, then we're in the middle of setting up
            // the standard library or the like; don't add debug positions
            // for those functions
            scope = scope ? scope : GetDIScope();
            llvm::DebugLoc diLoc =
                llvm::DILocation::get(scope->getContext(), p.first_line, p.first_column, scope, nullptr, false);
            inst->setDebugLoc(diLoc);
        }
    }
}

void FunctionEmitContext::StartScope() {
    if (m->diBuilder != nullptr) {
        llvm::DIScope *parentScope;
        llvm::DILexicalBlock *lexicalBlock;
        if (debugScopes.size() > 0)
            parentScope = debugScopes.back();
        else
            parentScope = diSubprogram;
        lexicalBlock = m->diBuilder->createLexicalBlock(parentScope, diFile, currentPos.first_line,
                                                        // Revision 216239 in LLVM removes support of DWARF
                                                        // discriminator as the last argument
                                                        currentPos.first_column);
        debugScopes.push_back(llvm::cast<llvm::DILexicalBlockBase>(lexicalBlock));
    }
}

void FunctionEmitContext::EndScope() {
    if (m->diBuilder != nullptr) {
        AssertPos(currentPos, debugScopes.size() > 0);
        debugScopes.pop_back();
    }
}

llvm::DIScope *FunctionEmitContext::GetDIScope() const {
    AssertPos(currentPos, debugScopes.size() > 0);
    return debugScopes.back();
}

void FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) {
    if (m->diBuilder == nullptr)
        return;

    llvm::DIScope *scope = GetDIScope();
    llvm::DIType *diType = sym->type->GetDIType(scope);
    llvm::DILocalVariable *var = m->diBuilder->createAutoVariable(
        scope, sym->name, sym->pos.GetDIFile(), sym->pos.first_line, diType, true /* preserve through opts */);

    llvm::DebugLoc diLoc =
        llvm::DILocation::get(scope->getContext(), sym->pos.first_line, sym->pos.first_column, scope, nullptr, false);
    llvm::Instruction *declareInst =
#if ISPC_LLVM_VERSION >= ISPC_LLVM_19_0
        llvm::cast<llvm::Instruction *>(m->diBuilder->insertDeclare(sym->storageInfo->getPointer(), var,
                                                                    m->diBuilder->createExpression(), diLoc, bblock));
#else
        m->diBuilder->insertDeclare(sym->storageInfo->getPointer(), var, m->diBuilder->createExpression(), diLoc,
                                    bblock);
#endif
    AddDebugPos(declareInst, &sym->pos, scope);
}

void FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym, int argNum) {
    if (m->diBuilder == nullptr)
        return;

    llvm::DINode::DIFlags flags = llvm::DINode::FlagZero;
    llvm::DIScope *scope = diSubprogram;
    llvm::DIType *diType = sym->type->GetDIType(scope);
    llvm::DILocalVariable *var =
        m->diBuilder->createParameterVariable(scope, sym->name, argNum + 1, sym->pos.GetDIFile(), sym->pos.first_line,
                                              diType, true /* preserve through opts */, flags);

    llvm::DebugLoc diLoc =
        llvm::DILocation::get(scope->getContext(), sym->pos.first_line, sym->pos.first_column, scope, nullptr, false);
    llvm::Instruction *declareInst =
#if ISPC_LLVM_VERSION >= ISPC_LLVM_19_0
        llvm::cast<llvm::Instruction *>(m->diBuilder->insertDeclare(sym->storageInfo->getPointer(), var,
                                                                    m->diBuilder->createExpression(), diLoc, bblock));
#else
        m->diBuilder->insertDeclare(sym->storageInfo->getPointer(), var, m->diBuilder->createExpression(), diLoc,
                                    bblock);
#endif
    AddDebugPos(declareInst, &sym->pos, scope);
}

/** If the given type is an array of vector types, then it's the
    representation of an ispc VectorType with varying elements. If it is
    one of these, return the array size (i.e. the VectorType's size).
    Otherwise return zero.
*/
static int lArrayVectorWidth(llvm::Type *t) {
    llvm::ArrayType *arrayType = llvm::dyn_cast<llvm::ArrayType>(t);
    if (arrayType == nullptr) {
        return 0;
    }

    // We shouldn't be seeing arrays of anything but vectors being passed
    // to things like FunctionEmitContext::BinaryOperator() as operands.
    llvm::FixedVectorType *vectorElementType = llvm::dyn_cast<llvm::FixedVectorType>(arrayType->getElementType());
    Assert((vectorElementType != nullptr && (int)vectorElementType->getNumElements() == g->target->getVectorWidth()));

    return (int)arrayType->getNumElements();
}

llvm::Value *FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst, llvm::Value *v0, llvm::Value *v1,
                                                 WrapSemantics wrapSemantics, const llvm::Twine &name) {
    if (v0 == nullptr || v1 == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    AssertPos(currentPos, v0->getType() == v1->getType());
    llvm::Type *type = v0->getType();
    int arraySize = lArrayVectorWidth(type);

    if (arraySize == 0) {
        llvm::Instruction *bop = llvm::BinaryOperator::Create(inst, v0, v1, name, bblock);
        // We need to enable the nsw bit for signed integer arithmetic to
        // enable some optimizations (including induction variable promotion).
        // Disabled on Xe targets, or via `--wrap-signed-int`.
        if (!g->wrapSignedInt && wrapSemantics == WrapSemantics::NSW)
            bop->setHasNoSignedWrap();
        AddDebugPos(bop);
        return bop;
    } else {
        // If this is an ispc VectorType, apply the binary operator to each
        // of the elements of the array (which in turn should be either
        // scalar types or llvm::VectorTypes.)
        llvm::Value *ret = llvm::UndefValue::get(type);
        for (int i = 0; i < arraySize; ++i) {
            llvm::Value *a = ExtractInst(v0, i);
            llvm::Value *b = ExtractInst(v1, i);
            llvm::Value *op = BinaryOperator(inst, a, b, wrapSemantics);
            ret = InsertInst(ret, op, i);
        }
        return ret;
    }
}

llvm::Value *FunctionEmitContext::NotOperator(llvm::Value *v, const llvm::Twine &name) {
    if (v == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    // Similarly to BinaryOperator, do the operation on all the elements of
    // the array if we're given an array type; otherwise just do the
    // regular llvm operation.
    llvm::Type *type = v->getType();
    int arraySize = lArrayVectorWidth(type);
    if (arraySize == 0) {
        llvm::Instruction *binst = llvm::BinaryOperator::CreateNot(v, name.isTriviallyEmpty() ? "not" : name, bblock);
        AddDebugPos(binst);
        return binst;
    } else {
        llvm::Value *ret = llvm::UndefValue::get(type);
        for (int i = 0; i < arraySize; ++i) {
            llvm::Value *a = ExtractInst(v, i);
            llvm::Value *op = llvm::BinaryOperator::CreateNot(a, name.isTriviallyEmpty() ? "not" : name, bblock);
            AddDebugPos(op);
            ret = InsertInst(ret, op, i);
        }
        return ret;
    }
}

llvm::Value *FunctionEmitContext::FNegInst(llvm::Value *v, const llvm::Twine &name) {
    if (v == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    // Similarly to BinaryOperator, do the operation on all the elements of
    // the array if we're given an array type; otherwise just do the
    // regular llvm operation.
    llvm::Type *type = v->getType();
    int arraySize = lArrayVectorWidth(type);
    if (arraySize == 0) {
        llvm::Instruction *fneg = llvm::UnaryOperator::CreateFNeg(v, name.isTriviallyEmpty() ? "fneg" : name, bblock);
        AddDebugPos(fneg);
        return fneg;
    } else {
        llvm::Value *ret = llvm::UndefValue::get(type);
        for (int i = 0; i < arraySize; ++i) {
            llvm::Value *a = ExtractInst(v, i);
            llvm::Value *op = llvm::UnaryOperator::CreateFNeg(a, name.isTriviallyEmpty() ? "fneg" : name, bblock);
            AddDebugPos(op);
            ret = InsertInst(ret, op, i);
        }
        return ret;
    }
}

// Given the llvm Type that represents an ispc VectorType, return an
// equally-shaped type with boolean elements. (This is the type that will
// be returned from CmpInst with ispc VectorTypes).
static llvm::Type *lGetMatchingBoolVectorType(llvm::Type *type) {
    llvm::ArrayType *arrayType = llvm::dyn_cast<llvm::ArrayType>(type);
    Assert(arrayType != nullptr);

    llvm::FixedVectorType *vectorElementType = llvm::dyn_cast<llvm::FixedVectorType>(arrayType->getElementType());
    Assert(vectorElementType != nullptr);
    Assert((int)vectorElementType->getNumElements() == g->target->getVectorWidth());

    llvm::Type *base = LLVMVECTOR::get(LLVMTypes::BoolType, g->target->getVectorWidth());
    return llvm::ArrayType::get(base, arrayType->getNumElements());
}

llvm::Value *FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst, llvm::CmpInst::Predicate pred,
                                          llvm::Value *v0, llvm::Value *v1, const llvm::Twine &name) {
    if (v0 == nullptr || v1 == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    AssertPos(currentPos, v0->getType() == v1->getType());
    llvm::Type *type = v0->getType();
    int arraySize = lArrayVectorWidth(type);
    if (arraySize == 0) {
        llvm::Instruction *ci =
            llvm::CmpInst::Create(inst, pred, v0, v1, name.isTriviallyEmpty() ? "cmp" : name, bblock);
        AddDebugPos(ci);
        return ci;
    } else {
        llvm::Type *boolType = lGetMatchingBoolVectorType(type);
        llvm::Value *ret = llvm::UndefValue::get(boolType);
        for (int i = 0; i < arraySize; ++i) {
            llvm::Value *a = ExtractInst(v0, i);
            llvm::Value *b = ExtractInst(v1, i);
            llvm::Value *op = CmpInst(inst, pred, a, b, name.isTriviallyEmpty() ? "cmp" : name);
            ret = InsertInst(ret, op, i);
        }
        return ret;
    }
}

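/** Added documentation: broadcasts the given uniform (scalar) value out to
    a vector with one copy per program instance. Uniform pointers are first
    converted to integers, since varying pointers are represented as vectors
    of i32/i64; constant inputs are splatted directly without emitting any
    instructions. */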
llvm::Value *FunctionEmitContext::SmearUniform(llvm::Value *value, const llvm::Twine &name) {
    if (value == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::Value *ret = nullptr;
    llvm::Type *eltType = value->getType();
    llvm::Type *vecType = nullptr;

    llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(eltType);
    if (pt != nullptr) {
        // Varying pointers are represented as vectors of i32/i64s
        vecType = LLVMTypes::VoidPointerVectorType;
        value = PtrToIntInst(value);
    } else {
        // All other varying types are represented as vectors of the
        // underlying type.
        vecType = LLVMVECTOR::get(eltType, g->target->getVectorWidth());
    }

    // Check for a constant case.
    if (llvm::Constant *const_val = llvm::dyn_cast<llvm::Constant>(value)) {
        ret = llvm::ConstantVector::getSplat(
            llvm::ElementCount::get(static_cast<unsigned int>(g->target->getVectorWidth()), false), const_val);
        return ret;
    }

    ret = BroadcastValue(value, vecType, name);

    return ret;
}

llvm::Value *FunctionEmitContext::BitCastInst(llvm::Value *value, llvm::Type *type, const llvm::Twine &name) {
    if (value == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::Instruction *inst = new llvm::BitCastInst(
        value, type, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_bitcast" : name, bblock);
    AddDebugPos(inst);
    return inst;
}

llvm::Value *FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Twine &name) {
    if (value == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    if (llvm::isa<llvm::VectorType>(value->getType()))
        // no-op for varying pointers; they're already vectors of ints
        return value;

    llvm::Type *type = LLVMTypes::PointerIntType;
    llvm::Instruction *inst = new llvm::PtrToIntInst(
        value, type, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_ptr2int" : name, bblock);
    AddDebugPos(inst);
    return inst;
}

llvm::Value *FunctionEmitContext::PtrToIntInst(llvm::Value *value, llvm::Type *toType, const llvm::Twine &name) {
    if (value == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::Type *fromType = value->getType();
    if (llvm::isa<llvm::VectorType>(fromType)) {
        // varying pointer
        if (fromType == toType)
            // already the right type--done
            return value;
        else if (fromType->getScalarSizeInBits() > toType->getScalarSizeInBits())
            return TruncInst(value, toType,
                             name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_ptr2int" : name);
        else {
            AssertPos(currentPos, fromType->getScalarSizeInBits() < toType->getScalarSizeInBits());
            return ZExtInst(value, toType, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_ptr2int" : name);
        }
    }

    llvm::Instruction *inst = new llvm::PtrToIntInst(
        value, toType, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_ptr2int" : name, bblock);
    AddDebugPos(inst);
    return inst;
}

llvm::Value *FunctionEmitContext::IntToPtrInst(llvm::Value *value, llvm::Type *toType, const llvm::Twine &name) {
    if (value == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::Type *fromType = value->getType();
    if (llvm::isa<llvm::VectorType>(fromType)) {
        // varying pointer
        if (fromType == toType)
            // done
            return value;
        else if (fromType->getScalarSizeInBits() > toType->getScalarSizeInBits())
            return TruncInst(value, toType,
                             name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_int2ptr" : name);
        else {
            AssertPos(currentPos, fromType->getScalarSizeInBits() < toType->getScalarSizeInBits());
            return ZExtInst(value, toType, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_int2ptr" : name);
        }
    }

    llvm::Instruction *inst = new llvm::IntToPtrInst(
        value, toType, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_int2ptr" : name, bblock);
    AddDebugPos(inst);
    return inst;
}

llvm::Instruction *FunctionEmitContext::TruncInst(llvm::Value *value, llvm::Type *type, const llvm::Twine &name) {
    if (value == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    // TODO: we should probably handle the array case as in
    // e.g. BitCastInst(), but we don't currently need that functionality
    llvm::Instruction *inst = new llvm::TruncInst(
        value, type, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_trunc" : name, bblock);
    AddDebugPos(inst);
    return inst;
}

llvm::Instruction *FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value, llvm::Type *type,
                                                 const llvm::Twine &name) {
    if (value == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    // TODO: we should probably handle the array case as in
    // e.g. BitCastInst(), but we don't currently need that functionality
    llvm::Instruction *inst = llvm::CastInst::Create(
        op, value, type, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_cast" : name, bblock);
    AddDebugPos(inst);
    return inst;
}

llvm::Instruction *FunctionEmitContext::FPCastInst(llvm::Value *value, llvm::Type *type, const llvm::Twine &name) {
    if (value == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    // TODO: we should probably handle the array case as in
    // e.g. BitCastInst(), but we don't currently need that functionality
    llvm::Instruction *inst = llvm::CastInst::CreateFPCast(
        value, type, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_cast" : name, bblock);
    AddDebugPos(inst);
    return inst;
}

llvm::Instruction *FunctionEmitContext::SExtInst(llvm::Value *value, llvm::Type *type, const llvm::Twine &name) {
    if (value == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    // TODO: we should probably handle the array case as in
    // e.g. BitCastInst(), but we don't currently need that functionality
    llvm::Instruction *inst = new llvm::SExtInst(
        value, type, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_sext" : name, bblock);
    AddDebugPos(inst);
    return inst;
}

llvm::Instruction *FunctionEmitContext::ZExtInst(llvm::Value *value, llvm::Type *type, const llvm::Twine &name) {
    if (value == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    // TODO: we should probably handle the array case as in
    // e.g. BitCastInst(), but we don't currently need that functionality
    llvm::Instruction *inst = new llvm::ZExtInst(
        value, type, name.isTriviallyEmpty() ? llvm::Twine(value->getName()) + "_zext" : name, bblock);
    AddDebugPos(inst);
    return inst;
}

/** Utility routine used by the GetElementPtrInst() methods; given a
    pointer to some type (either uniform or varying) and an index (also
    either uniform or varying), this returns the new pointer (varying if
    appropriate) given by offsetting the base pointer by the index times
    the size of the object that the pointer points to.
*/
llvm::Value *FunctionEmitContext::applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index, const Type *ptrType) {
    // Find the scale factor for the index (i.e. the size of the object
    // that the pointer(s) point(s) to).
    const Type *scaleType = ptrType->GetBaseType();
    llvm::Type *llvmScaleType = scaleType->LLVMType(g->ctx);
    Assert(llvmScaleType != nullptr);
    llvm::Value *scale = g->target->SizeOf(llvmScaleType, bblock);

    bool indexIsVarying = llvm::isa<llvm::VectorType>(index->getType());
    llvm::Value *offset = nullptr;
    if (indexIsVarying == false) {
        // Truncate or sign extend the index as appropriate to a 32 or
        // 64-bit type.
        if ((g->target->is32Bit() || g->opt.force32BitAddressing) && index->getType() == LLVMTypes::Int64Type)
            index = TruncInst(index, LLVMTypes::Int32Type);
        else if ((!g->target->is32Bit() && !g->opt.force32BitAddressing) && index->getType() == LLVMTypes::Int32Type)
            index = SExtInst(index, LLVMTypes::Int64Type);

        // do a scalar multiply to get the offset as index * scale and then
        // smear the result out to be a vector; this is more efficient than
        // first promoting both the scale and the index to vectors and then
        // multiplying.
        offset = BinaryOperator(llvm::Instruction::Mul, scale, index, WrapSemantics::NSW);
        offset = SmearUniform(offset);
    } else {
        // Similarly, truncate or sign extend the index to be a 32 or 64
        // bit vector type
        if ((g->target->is32Bit() || g->opt.force32BitAddressing) && index->getType() == LLVMTypes::Int64VectorType)
            index = TruncInst(index, LLVMTypes::Int32VectorType);
        else if ((!g->target->is32Bit() && !g->opt.force32BitAddressing) &&
                 index->getType() == LLVMTypes::Int32VectorType)
            index = SExtInst(index, LLVMTypes::Int64VectorType);

        scale = SmearUniform(scale);
        Assert(index != nullptr);
        // offset = index * scale
        offset = BinaryOperator(llvm::Instruction::Mul, scale, index, WrapSemantics::NSW,
                                ((llvm::Twine("mul_") + scale->getName()) + "_") + index->getName());
    }

    // For 64-bit targets, if we've been doing our offset calculations in
    // 32 bits, we still have to convert to a 64-bit value before we
    // actually add the offset to the pointer.
    if (g->target->is32Bit() == false && g->opt.force32BitAddressing == true)
        offset = SExtInst(offset, LLVMTypes::Int64VectorType, llvm::Twine(offset->getName()) + "_to_64");

    // Smear out the pointer to be varying; either the base pointer or the
    // index must be varying for this method to be called.
    bool baseIsUniform = (llvm::isa<llvm::PointerType>(basePtr->getType()));
    AssertPos(currentPos, baseIsUniform == false || indexIsVarying == true);
    llvm::Value *varyingPtr = baseIsUniform ? SmearUniform(basePtr) : basePtr;

    // newPtr = ptr + offset
    return BinaryOperator(llvm::Instruction::Add, varyingPtr, offset, WrapSemantics::None,
                          llvm::Twine(basePtr->getName()) + "_offset");
}

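/** Added documentation: brings two integer values to a common LLVM type so
    they can be used together in arithmetic. If exactly one of them is a
    vector, the other is smeared out to a vector, and then the narrower
    (32-bit) value is sign-extended to 64 bits if their widths differ. */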
void FunctionEmitContext::MatchIntegerTypes(llvm::Value **v0, llvm::Value **v1) {
    llvm::Type *type0 = (*v0)->getType();
    llvm::Type *type1 = (*v1)->getType();

    // First, promote to a vector type if one of the two values is a vector
    // type
    if (llvm::isa<llvm::VectorType>(type0) && !llvm::isa<llvm::VectorType>(type1)) {
        *v1 = SmearUniform(*v1, "smear_v1");
        type1 = (*v1)->getType();
    }
    if (!llvm::isa<llvm::VectorType>(type0) && llvm::isa<llvm::VectorType>(type1)) {
        *v0 = SmearUniform(*v0, "smear_v0");
        type0 = (*v0)->getType();
    }

    // And then update to match bit widths
    if (type0 == LLVMTypes::Int32Type && type1 == LLVMTypes::Int64Type)
        *v0 = SExtInst(*v0, LLVMTypes::Int64Type);
    else if (type1 == LLVMTypes::Int32Type && type0 == LLVMTypes::Int64Type)
        *v1 = SExtInst(*v1, LLVMTypes::Int64Type);
    else if (type0 == LLVMTypes::Int32VectorType && type1 == LLVMTypes::Int64VectorType)
        *v0 = SExtInst(*v0, LLVMTypes::Int64VectorType);
    else if (type1 == LLVMTypes::Int32VectorType && type0 == LLVMTypes::Int64VectorType)
        *v1 = SExtInst(*v1, LLVMTypes::Int64VectorType);
}

/** Given an integer index in indexValue that's indexing into an array of
    soa<> structures with given soaWidth, compute the two sub-indices we
    need to do the actual indexing calculation:

    subIndices[0] = (indexValue >> log(soaWidth))
    subIndices[1] = (indexValue & (soaWidth-1))
*/
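// Worked example (illustrative values, not from the original source): with
// soaWidth = 8 and an incoming index of 19, the major index is 19 >> 3 == 2
// (which soa<8> struct to step into) and the minor index is 19 & 7 == 3
// (the slice offset within that struct).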
static llvm::Value *lComputeSliceIndex(FunctionEmitContext *ctx, int soaWidth, llvm::Value *indexValue,
                                       llvm::Value *ptrSliceOffset, llvm::Value **newSliceOffset) {
    // Compute the log2 of the soaWidth.
    Assert(soaWidth > 0);
    int logWidth = 0, sw = soaWidth;
    while (sw > 1) {
        ++logWidth;
        sw >>= 1;
    }
    Assert((1 << logWidth) == soaWidth);

    ctx->MatchIntegerTypes(&indexValue, &ptrSliceOffset);
    Assert(indexValue != nullptr);
    llvm::Type *indexType = indexValue->getType();
    llvm::Value *shift = LLVMIntAsType(logWidth, indexType);
    llvm::Value *mask = LLVMIntAsType(soaWidth - 1, indexType);

    llvm::Value *indexSum =
        ctx->BinaryOperator(llvm::Instruction::Add, indexValue, ptrSliceOffset, WrapSemantics::None, "index_sum");

    // minor index = (index & (soaWidth - 1))
    *newSliceOffset =
        ctx->BinaryOperator(llvm::Instruction::And, indexSum, mask, WrapSemantics::None, "slice_index_minor");
    // slice offsets are always 32 bits...
    if ((*newSliceOffset)->getType() == LLVMTypes::Int64Type)
        *newSliceOffset = ctx->TruncInst(*newSliceOffset, LLVMTypes::Int32Type);
    else if ((*newSliceOffset)->getType() == LLVMTypes::Int64VectorType)
        *newSliceOffset = ctx->TruncInst(*newSliceOffset, LLVMTypes::Int32VectorType);

    // major index = (index >> logWidth)
    return ctx->BinaryOperator(llvm::Instruction::AShr, indexSum, shift, WrapSemantics::None, "slice_index_major");
}

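/** Added documentation: packages a pointer and an integer slice offset
    together into the anonymous { pointer, offset } struct value that is
    used throughout this file to represent slice pointers into soa<>
    data. */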
llvm::Value *FunctionEmitContext::MakeSlicePointer(llvm::Value *ptr, llvm::Value *offset) {
    // Create a small struct where the first element is the type of the
    // given pointer and the second element is the type of the offset
    // value.
    std::vector<llvm::Type *> eltTypes;
    eltTypes.push_back(ptr->getType());
    eltTypes.push_back(offset->getType());
    llvm::StructType *st = llvm::StructType::get(*g->ctx, eltTypes);

    llvm::Value *ret = llvm::UndefValue::get(st);
    ret = InsertInst(ret, ptr, 0, llvm::Twine(ret->getName()) + "_slice_ptr");
    ret = InsertInst(ret, offset, 1, llvm::Twine(ret->getName()) + "_slice_offset");
    return ret;
}

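/** Added documentation: given a type that is either a PointerType or a
    ReferenceType, returns the corresponding pointer type; references are
    regularized to uniform pointers to their reference target. */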
const PointerType *FunctionEmitContext::RegularizePointer(const Type *ptrRefType) {
    const PointerType *ptrType;
    if (CastType<ReferenceType>(ptrRefType) != nullptr)
        ptrType = PointerType::GetUniform(ptrRefType->GetReferenceTarget());
    else {
        ptrType = CastType<PointerType>(ptrRefType);
    }
    return ptrType;
}

llvm::Value *FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index, const Type *ptrRefType,
                                                    const llvm::Twine &name) {
    if (basePtr == nullptr || index == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    // Regularize to a standard pointer type for basePtr's type
    const PointerType *ptrType = RegularizePointer(ptrRefType);

    if (ptrType->IsSlice()) {
        AssertPos(currentPos, llvm::isa<llvm::StructType>(basePtr->getType()));

        llvm::Value *ptrSliceOffset = ExtractInst(basePtr, 1);
        if (ptrType->IsFrozenSlice() == false) {
            // For slice pointers that aren't frozen, we compute a new
            // index based on the given index plus the offset in the slice
            // pointer. This gives us an updated integer slice index for
            // the resulting slice pointer and then an index to index into
            // the soa<> structs with.
            llvm::Value *newSliceOffset;
            int soaWidth = ptrType->GetBaseType()->GetSOAWidth();
            index = lComputeSliceIndex(this, soaWidth, index, ptrSliceOffset, &newSliceOffset);
            ptrSliceOffset = newSliceOffset;
        }

        // Handle the indexing into the soa<> structs with the major
        // component of the index through a recursive call
        llvm::Value *p = GetElementPtrInst(ExtractInst(basePtr, 0), index, ptrType->GetAsNonSlice(), name);
        if (p == nullptr) {
            AssertPos(currentPos, m->errorCount > 0);
            return nullptr;
        }
        // And mash the results together for the return value
        return MakeSlicePointer(p, ptrSliceOffset);
    }

    // Double-check consistency between the given pointer type and its LLVM
    // type.
    if (ptrType->IsUniformType())
        AssertPos(currentPos, llvm::isa<llvm::PointerType>(basePtr->getType()));
    else if (ptrType->IsVaryingType())
        AssertPos(currentPos, llvm::isa<llvm::VectorType>(basePtr->getType()));

    bool indexIsVaryingType = llvm::isa<llvm::VectorType>(index->getType());

    if (indexIsVaryingType == false && ptrType->IsUniformType() == true) {
        // The easy case: both the base pointer and the indices are
        // uniform, so just emit the regular LLVM GEP instruction
        llvm::Value *ind[1] = {index};
        llvm::ArrayRef<llvm::Value *> arrayRef(&ind[0], &ind[1]);
        llvm::Type *llvmPtrElType = AddressInfo::GetPointeeLLVMType(ptrType);
        llvm::Instruction *inst = llvm::GetElementPtrInst::Create(llvmPtrElType, basePtr, arrayRef,
                                                                  name.isTriviallyEmpty() ? "gep" : name, bblock);
        AddDebugPos(inst);
        return inst;
    } else
        return applyVaryingGEP(basePtr, index, ptrType);
}

llvm::Value *FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0, llvm::Value *index1,
                                                    const Type *ptrRefType, const llvm::Twine &name) {
    if (basePtr == nullptr || index0 == nullptr || index1 == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    // Regularize the pointer type for basePtr
    const PointerType *ptrType = RegularizePointer(ptrRefType);

    if (ptrType->IsSlice()) {
        // Similar to the 1D GEP implementation above, for non-frozen slice
        // pointers we do the two-step indexing calculation and then pass
        // the new major index on to a recursive GEP call.
        AssertPos(currentPos, llvm::isa<llvm::StructType>(basePtr->getType()));
        llvm::Value *ptrSliceOffset = ExtractInst(basePtr, 1);
        if (ptrType->IsFrozenSlice() == false) {
            llvm::Value *newSliceOffset;
            int soaWidth = ptrType->GetBaseType()->GetSOAWidth();
            index1 = lComputeSliceIndex(this, soaWidth, index1, ptrSliceOffset, &newSliceOffset);
            ptrSliceOffset = newSliceOffset;
        }

        llvm::Value *p = GetElementPtrInst(ExtractInst(basePtr, 0), index0, index1, ptrType->GetAsNonSlice(), name);
        if (p == nullptr) {
            AssertPos(currentPos, m->errorCount > 0);
            return nullptr;
        }
        return MakeSlicePointer(p, ptrSliceOffset);
    }

    bool index0IsVaryingType = llvm::isa<llvm::VectorType>(index0->getType());
    bool index1IsVaryingType = llvm::isa<llvm::VectorType>(index1->getType());

    if (index0IsVaryingType == false && index1IsVaryingType == false && ptrType->IsUniformType() == true) {
        // The easy case: both the base pointer and the indices are
        // uniform, so just emit the regular LLVM GEP instruction
        llvm::Value *indices[2] = {index0, index1};
        llvm::ArrayRef<llvm::Value *> arrayRef(&indices[0], &indices[2]);
        llvm::Type *llvmPtrElType = AddressInfo::GetPointeeLLVMType(ptrType);
        llvm::Instruction *inst = llvm::GetElementPtrInst::Create(llvmPtrElType, basePtr, arrayRef,
                                                                  name.isTriviallyEmpty() ? "gep" : name, bblock);
        AddDebugPos(inst);
        return inst;
    } else {
        // Handle the first dimension with index0
        llvm::Value *ptr0 = GetElementPtrInst(basePtr, index0, ptrType);

        // Now index into the second dimension with index1. First figure
        // out the type of ptr0.
        const Type *baseType = ptrType->GetBaseType();
        const SequentialType *st = CastType<SequentialType>(baseType);
        AssertPos(currentPos, st != nullptr);

        bool ptr0IsUniform = llvm::isa<llvm::PointerType>(ptr0->getType());
        const Type *ptr0BaseType = st->GetElementType();
        const Type *ptr0Type =
            ptr0IsUniform ? PointerType::GetUniform(ptr0BaseType) : PointerType::GetVarying(ptr0BaseType);

        return applyVaryingGEP(ptr0, index1, ptr0Type);
    }
}

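/** Added documentation: returns a pointer to the given element of the
    struct, array, or short vector pointed to by fullBasePtrInfo. For
    uniform pointers this is a plain LLVM GEP; for varying and slice
    pointers the byte offset of the element is computed explicitly and
    added to the lane pointers. If resultPtrType is non-nullptr, it receives
    the ispc type of the resulting pointer. */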
llvm::Value *FunctionEmitContext::AddElementOffset(AddressInfo *fullBasePtrInfo, int elementNum,
                                                   const llvm::Twine &name, const PointerType **resultPtrType) {
    llvm::Value *fullBasePtr = fullBasePtrInfo->getPointer();
    if (resultPtrType != nullptr)
        AssertPos(currentPos, fullBasePtrInfo->getISPCType() != nullptr);

    llvm::StructType *llvmStructType = llvm::dyn_cast<llvm::StructType>(fullBasePtrInfo->getElementType());
    if (llvmStructType != nullptr && llvmStructType->isSized() == false) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    // (Unfortunately) it's not required to have a non-nullptr ispcType in fullBasePtrInfo, but
    // if we have one, regularize into a pointer type.
    const PointerType *ptrType = nullptr;
    if (fullBasePtrInfo->getISPCType() != nullptr) {
        ptrType = RegularizePointer(fullBasePtrInfo->getISPCType());
    }

    // Similarly, we have to see if the pointer type is a struct to see if
    // we have a slice pointer instead of looking at ptrType; this is also
    // unfortunate...
    llvm::Value *basePtr = fullBasePtr;
    bool baseIsSlicePtr = llvm::isa<llvm::StructType>(fullBasePtr->getType());
    const PointerType *rpt;
    if (baseIsSlicePtr) {
        AssertPos(currentPos, ptrType != nullptr);
        // Update basePtr to just be the part that actually points to the
        // start of an soa<> struct for now; the element offset computation
        // doesn't change the slice offset, so we'll incorporate that into
        // the final value right before this method returns.
        basePtr = ExtractInst(fullBasePtr, 0);
        if (resultPtrType == nullptr)
            resultPtrType = &rpt;
    }

    // Return the pointer type of the result of this call, for callers that
    // want it.
    if (resultPtrType != nullptr) {
        AssertPos(currentPos, ptrType != nullptr);
        const CollectionType *ct = CastType<CollectionType>(ptrType->GetBaseType());
        AssertPos(currentPos, ct != nullptr);
        *resultPtrType = new PointerType(ct->GetElementType(elementNum), ptrType->GetVariability(),
                                         ptrType->IsConstType(), ptrType->IsSlice());
    }

    llvm::Value *resultPtr = nullptr;
    if (ptrType == nullptr || ptrType->IsUniformType()) {
        // If the pointer is uniform, we can use the regular LLVM GEP.
        llvm::Value *offsets[2] = {LLVMInt32(0), LLVMInt32(elementNum)};
        llvm::ArrayRef<llvm::Value *> arrayRef(&offsets[0], &offsets[2]);
        resultPtr = llvm::GetElementPtrInst::Create(fullBasePtrInfo->getElementType(), basePtr, arrayRef,
                                                    name.isTriviallyEmpty() ? "struct_offset" : name, bblock);
    } else {
        // Otherwise do the math to find the offset and add it to the given
        // varying pointers
        const StructType *st = CastType<StructType>(ptrType->GetBaseType());
        llvm::Value *offset = nullptr;
        if (st != nullptr)
            // If the pointer is to a structure, Target::StructOffset() gives
            // us the offset in bytes to the given element of the structure
            offset = g->target->StructOffset(st->LLVMType(g->ctx), elementNum, bblock);
        else {
            // Otherwise we should have a vector or array here and the offset
            // is given by the element number times the size of the element
            // type of the vector.
            const SequentialType *st = CastType<SequentialType>(ptrType->GetBaseType());
            AssertPos(currentPos, st != nullptr);
            llvm::Type *elemLLVMType = st->GetElementType()->LLVMType(g->ctx);
            Assert(elemLLVMType);
            llvm::Value *size = g->target->SizeOf(elemLLVMType, bblock);
            llvm::Value *scale =
                (g->target->is32Bit() || g->opt.force32BitAddressing) ? LLVMInt32(elementNum) : LLVMInt64(elementNum);
            offset = BinaryOperator(llvm::Instruction::Mul, size, scale, WrapSemantics::NSW);
        }

        offset = SmearUniform(offset, "offset_smear");

        if (g->target->is32Bit() == false && g->opt.force32BitAddressing == true)
            // If we're doing 32 bit addressing with a 64 bit target, although
            // we did the math above in 32 bit, we need to go to 64 bit before
            // we add the offset to the varying pointers.
            offset = SExtInst(offset, LLVMTypes::Int64VectorType, "offset_to_64");

        resultPtr = BinaryOperator(llvm::Instruction::Add, basePtr, offset, WrapSemantics::None, "struct_ptr_offset");
    }

    // Finally, if we had a slice pointer going in, mash it back together
    // with the original (unchanged) slice offset.
    if (baseIsSlicePtr)
        return MakeSlicePointer(resultPtr, ExtractInst(fullBasePtr, 1));
    else
        return resultPtr;
}

llvm::Value *FunctionEmitContext::lSwitchBoolSize_2(llvm::Value *value, llvm::Type *toType, bool toStorageType,
                                                    const llvm::Twine &name) {
    if ((value == nullptr) || (toType == nullptr)) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::Type *fromType = value->getType();

    if (fromType == toType) {
        // No cast is needed
        return value;
    }

    // The internal representation of bool matches mask behaviour, which
    // requires the true value to be -1. This is an implication of some
    // instruction requirements, e.g., blendvps (the most significant bit
    // should be 1). The storage representation, in contrast, matches the
    // SystemV ABI: it requires bool to be presented as a byte with the least
    // significant bit equal to 0 or 1 and the other 7 bits equal to 0, i.e.,
    // true is 1 when we read or write values to/from C/C++.
    // Both fromType and toType can be vector or scalar, and storage or
    // internal bool representation. E.g., we need to support here conversion
    // from an internal vector to a storage vector type; it may be conversion
    // from
    // 1) <4 x i32> to <4 x i8> or 2) <4 x i8> to <4 x i32>.
    // 1) <-1, 0,...> to <1, 0,...>
    // 2) <1, 0,...> to <-1, 0,...>
    // To support all cases with less code, do bool casting in two stages:
    // 1) truncate to the LLVM IR native bool types i1 or <N x i1>
    // 2) zero or sign extend from the native bool types to the storage or
    //    internal representation, correspondingly.
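    // Added worked example (illustrative values): converting the mask value
    // <4 x i32> <-1, 0, -1, 0> to its <4 x i8> storage form first truncates
    // to <4 x i1> <1, 0, 1, 0> and then zero extends to <4 x i8> <1, 0, 1, 0>;
    // going the other way, the final step instead sign extends, recovering
    // <4 x i32> <-1, 0, -1, 0>.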
    llvm::Value *i1Bool = value;
    llvm::Twine newName = name.isTriviallyEmpty() ? (llvm::Twine(value->getName()) + "_toi1") : name;
    if (llvm::dyn_cast<llvm::FixedVectorType>(fromType)) {
        llvm::VectorType *i1VecType = LLVMVECTOR::get(
            llvm::Type::getInt1Ty(*g->ctx), llvm::dyn_cast<llvm::FixedVectorType>(fromType)->getNumElements());
        // trunc only if needed
        if (fromType != i1VecType) {
            i1Bool = TruncInst(value, i1VecType, newName);
        }
        // return if we already have the requested type
        if (toType == i1VecType) {
            return i1Bool;
        }
    } else {
        llvm::Type *i1Type = llvm::Type::getInt1Ty(*g->ctx);
        if (fromType != i1Type) {
            i1Bool = TruncInst(value, i1Type, newName);
        }
        if (toType == i1Type) {
            return i1Bool;
        }
    }

    llvm::Value *newBool = nullptr;
    // We can't distinguish the bool storage and mask types by comparing type
    // sizes alone, because the widths of the storage and mask types can be
    // the same, e.g., for i8xN targets. Although these types have the same
    // width, they have to be extended in different ways to match the ABI or
    // ISPC internals. Use the toStorageType argument to distinguish these
    // cases.
    if (toStorageType) {
        // zero extend to storage types to comply with the ABI
        llvm::Twine toStorageBool = name.isTriviallyEmpty() ? (llvm::Twine(value->getName()) + "_toStorageBool") : name;
        newBool = ZExtInst(i1Bool, toType, toStorageBool);
    } else {
        // sign extend to the internal representation
        llvm::Twine toMaskBool = name.isTriviallyEmpty() ? (llvm::Twine(value->getName()) + "_toMaskBool") : name;
        newBool = SExtInst(i1Bool, toType, toMaskBool);
    }

    return newBool;
}

llvm::Value *FunctionEmitContext::lSwitchBoolSize_1(llvm::Value *value, llvm::Type *toType, bool toStorageType,
                                                    const llvm::Twine &name) {
    llvm::ArrayType *at = llvm::dyn_cast<llvm::ArrayType>(toType);
    if (at) {
        // We're given an array of vectors (short vector).
        llvm::Type *eltType = at->getElementType();
        llvm::Value *ret = llvm::UndefValue::get(toType);
        for (unsigned int i = 0; i < at->getNumElements(); ++i) {
            llvm::Value *elt = ExtractInst(value, i);
            llvm::Value *x = lSwitchBoolSize_2(elt, eltType, toStorageType, llvm::Twine(elt->getName()) + "_bv");
            ret = InsertInst(ret, x, i);
        }
        return ret;
    } else {
        return lSwitchBoolSize_2(value, toType, toStorageType, name);
    }
}

llvm::Value *FunctionEmitContext::SwitchBoolToMaskType(llvm::Value *value, llvm::Type *toType,
                                                       const llvm::Twine &name) {
    return lSwitchBoolSize_1(value, toType, false, name);
}

llvm::Value *FunctionEmitContext::SwitchBoolToStorageType(llvm::Value *value, llvm::Type *toType,
                                                          const llvm::Twine &name) {
    return lSwitchBoolSize_1(value, toType, true, name);
}

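/** Added documentation: emits a load through the uniform pointer held in
    ptrInfo. If the optional ispc type is provided and is a bool (or a short
    vector of bool), the i8 storage value that was loaded is converted back
    to the internal mask representation. */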
llvm::Value *FunctionEmitContext::LoadInst(AddressInfo *ptrInfo, const Type *type, const llvm::Twine &name) {
    if (ptrInfo == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }
    llvm::Value *ptr = ptrInfo->getPointer();
    llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(ptr->getType());
    AssertPos(currentPos, pt != nullptr);

    llvm::LoadInst *inst =
        new llvm::LoadInst(ptrInfo->getElementType(), ptr,
                           name.isTriviallyEmpty() ? (llvm::Twine(ptr->getName()) + "_load") : name, bblock);

    if (g->opt.forceAlignedMemory && llvm::dyn_cast<llvm::VectorType>(ptrInfo->getElementType())) {
        inst->setAlignment(llvm::MaybeAlign(g->target->getNativeVectorAlignment()).valueOrOne());
    }

    AddDebugPos(inst);

    llvm::Value *loadVal = inst;
    // The bool type is stored as i8, so it requires some processing.
    if ((type != nullptr) && (type->IsBoolType())) {
        if (CastType<AtomicType>(type) != nullptr) {
            loadVal = SwitchBoolToMaskType(loadVal, type->LLVMType(g->ctx));
        } else if ((CastType<VectorType>(type) != nullptr)) {
            const VectorType *vType = CastType<VectorType>(type);
            if (CastType<AtomicType>(vType->GetElementType()) != nullptr) {
                loadVal = SwitchBoolToMaskType(loadVal, type->LLVMType(g->ctx));
            }
        }
    }
    return loadVal;
}

/** Given a slice pointer to soa'd data that is a basic type (atomic,
    pointer, or enum type), use the slice offset to compute pointer(s) to
    the appropriate individual data element(s).
*/
static llvm::Value *lFinalSliceOffset(FunctionEmitContext *ctx, llvm::Value *ptr, const PointerType **ptrType) {
    Assert(CastType<PointerType>(*ptrType) != nullptr);

    llvm::Value *slicePtr = ctx->ExtractInst(ptr, 0, llvm::Twine(ptr->getName()) + "_ptr");
    llvm::Value *sliceOffset = ctx->ExtractInst(ptr, 1, llvm::Twine(ptr->getName()) + "_offset");

    // slicePtr should be a pointer to an soa-width wide array of the
    // final atomic/enum/pointer type
    const Type *unifBaseType = (*ptrType)->GetBaseType()->GetAsUniformType();
    Assert(Type::IsBasicType(unifBaseType));

    // The final pointer type is a uniform or varying pointer to the
    // underlying uniform type, depending on whether the given pointer is
    // uniform or varying.
    *ptrType =
        (*ptrType)->IsUniformType() ? PointerType::GetUniform(unifBaseType) : PointerType::GetVarying(unifBaseType);

    // For uniform pointers, bitcast to a pointer to the uniform element
    // type, so that the GEP below does the desired indexing
    if ((*ptrType)->IsUniformType())
        slicePtr = ctx->BitCastInst(slicePtr, (*ptrType)->LLVMType(g->ctx));

    // And finally index based on the slice offset
    return ctx->GetElementPtrInst(slicePtr, sliceOffset, *ptrType, llvm::Twine(slicePtr->getName()) + "_final_gep");
}

/** Utility routine that loads from a uniform pointer to soa<> data,
    returning a regular uniform (non-SOA) result.
*/
llvm::Value *FunctionEmitContext::loadUniformFromSOA(llvm::Value *ptr, llvm::Value *mask, const PointerType *ptrType,
                                                     const llvm::Twine &name) {
    const Type *unifType = ptrType->GetBaseType()->GetAsUniformType();

    const CollectionType *ct = CastType<CollectionType>(ptrType->GetBaseType());
    if (ct != nullptr) {
        // If we have a struct/array, we need to decompose it into
        // individual element loads to fill in the result structure since
        // the SOA slice of values we need isn't contiguous in memory...
        llvm::Type *llvmReturnType = unifType->LLVMType(g->ctx);
        llvm::Value *retValue = llvm::UndefValue::get(llvmReturnType);

        for (int i = 0; i < ct->GetElementCount(); ++i) {
            const PointerType *eltPtrType;
            llvm::Value *eltPtr = AddElementOffset(new AddressInfo(ptr, ptrType), i, "elt_offset", &eltPtrType);
            llvm::Value *eltValue = LoadInst(eltPtr, mask, eltPtrType, name);
            retValue = InsertInst(retValue, eltValue, i, "set_value");
        }

        return retValue;
    } else {
        // Otherwise we've made our way to a slice pointer to a basic type;
        // we need to apply the slice offset into this terminal SOA array
        // and then perform the final load
        ptr = lFinalSliceOffset(this, ptr, &ptrType);
        return LoadInst(ptr, mask, ptrType, name);
    }
}

llvm::Value *FunctionEmitContext::LoadInst(llvm::Value *ptr, llvm::Value *mask, const Type *ptrRefType,
                                           const llvm::Twine &name, bool one_elem) {
    if (ptr == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    AssertPos(currentPos, ptrRefType != nullptr && mask != nullptr);

    const PointerType *ptrType = RegularizePointer(ptrRefType);
    const Type *elType;
    if (CastType<ReferenceType>(ptrRefType) != nullptr) {
        elType = ptrRefType->GetReferenceTarget();
    } else {
        elType = ptrType->GetBaseType();
    }

    if (CastType<UndefinedStructType>(ptrType->GetBaseType())) {
        Error(currentPos, "Unable to load to undefined struct type \"%s\".",
              ptrType->GetBaseType()->GetString().c_str());
        return nullptr;
    }

    if (ptrType->IsUniformType()) {
        if (ptrType->IsSlice()) {
            return loadUniformFromSOA(ptr, mask, ptrType,
                                      name.isTriviallyEmpty() ? (llvm::Twine(ptr->getName()) + "_load") : name);
        } else {
            // FIXME: same issue as above load inst regarding alignment...
            //
            // If the ptr is a straight up regular pointer, then just issue
            // a regular load. First figure out the alignment; in general we
            // can just assume the natural alignment (0 here), but for varying
            // atomic types, we need to make sure that the compiler emits
            // unaligned vector loads, so we specify a reduced alignment here.
            const AtomicType *atomicType = CastType<AtomicType>(ptrType->GetBaseType());

            llvm::Type *llvmPtrType = AddressInfo::GetPointeeLLVMType(ptrType);
            llvm::LoadInst *inst = new llvm::LoadInst(
                llvmPtrType, ptr, name.isTriviallyEmpty() ? (llvm::Twine(ptr->getName()) + "_load") : name,
                false /* not volatile */, bblock);

            if (atomicType != nullptr && atomicType->IsVaryingType()) {
                // We actually just want to align to the vector element
                // alignment, but can't easily get that here, so just tell LLVM
                // it's totally unaligned. (This shouldn't make any difference
                // vs the proper alignment in practice.)
                int align = 1;

                inst->setAlignment(llvm::MaybeAlign(align).valueOrOne());
            }

            AddDebugPos(inst);
            llvm::Value *loadVal = inst;
            // The bool type is stored as i8, so it requires some processing.
            if (elType->IsBoolType() && (CastType<AtomicType>(elType) != nullptr)) {
                loadVal = SwitchBoolToMaskType(loadVal, elType->LLVMType(g->ctx));
            }
            return loadVal;
        }
    } else {
        // Otherwise we should have a varying ptr and it's time for a
        // gather.
        llvm::Value *gather_result = gather(ptr, ptrType, GetFullMask(),
                                            name.isTriviallyEmpty() ? (llvm::Twine(ptr->getName()) + "_load") : name);
        if (!one_elem)
            return gather_result;

        // This is a kludge: when we dereference a varying pointer to a uniform
        // struct with a "bound uniform" member, we should return the first
        // unmasked member.
        Warning(currentPos, "Dereferencing varying pointer to uniform struct with 'bound uniform' member,\n"
                            " only one value will survive. Possible loss of data.");
        // Call the target-dependent movmsk function to turn the vector mask
        // into an i64 value
        std::vector<Symbol *> mm;
        m->symbolTable->LookupFunction(builtin::__movmsk, &mm);
        if (g->target->getMaskBitCount() == 1)
            AssertPos(currentPos, mm.size() == 1);
        else
            // There should be one with signed int signature, one unsigned int.
            AssertPos(currentPos, mm.size() == 2);
        // We can actually call either one, since both are i32s as far as
        // LLVM's type system is concerned...
        llvm::Function *fmm = mm[0]->function;
        llvm::Value *int_mask = CallInst(fmm, nullptr, mask, llvm::Twine(mask->getName()) + "_movmsk");
        std::vector<Symbol *> lz;
        m->symbolTable->LookupFunction(builtin::__count_trailing_zeros_i64, &lz);
        llvm::Function *flz = lz[0]->function;
        llvm::Value *elem_idx = CallInst(flz, nullptr, int_mask, llvm::Twine(mask->getName()) + "_clz");
        llvm::Value *elem = llvm::ExtractElementInst::Create(
            gather_result, elem_idx, llvm::Twine(gather_result->getName()) + "_umasked_elem", bblock);
        return elem;
    }
}

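/** Added documentation: emits a masked gather through a varying pointer.
    Collection types are gathered recursively, element by element; slice
    pointers first have their final slice offset applied; and basic types
    are handled by calling the __pseudo_gather* builtin that matches the
    element size (the "__pseudo" prefix suggests these are placeholders
    that later compilation stages replace with target-specific code). */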
llvm::Value *FunctionEmitContext::gather(llvm::Value *ptr, const PointerType *ptrType, llvm::Value *mask,
                                         const llvm::Twine &name) {
    // We should have a varying pointer if we get here...
    AssertPos(currentPos, ptrType->IsVaryingType());

    const Type *returnType = ptrType->GetBaseType()->GetAsVaryingType();
    llvm::Type *llvmReturnType = returnType->LLVMType(g->ctx);
    const CollectionType *collectionType = CastType<CollectionType>(ptrType->GetBaseType());
    if (collectionType != nullptr) {
        // For collections, recursively gather element wise to find the
        // result.
        llvm::Value *retValue = llvm::UndefValue::get(llvmReturnType);

        const CollectionType *returnCollectionType = CastType<CollectionType>(returnType->GetBaseType());

        for (int i = 0; i < collectionType->GetElementCount(); ++i) {
            const PointerType *eltPtrType;
            llvm::Value *eltPtr = AddElementOffset(new AddressInfo(ptr, ptrType), i, "gather_elt_ptr", &eltPtrType);

            eltPtr = addVaryingOffsetsIfNeeded(eltPtr, eltPtrType);
            const Type *eltType = nullptr;
            if (returnCollectionType) {
                eltType = returnCollectionType->GetElementType(i);
            }

            // This is a kludge: when we dereference a varying pointer to a
            // uniform struct with a "bound uniform" member, we should return
            // the first unmasked member.
            int need_one_elem = CastType<StructType>(ptrType->GetBaseType()) && eltType && eltType->IsUniformType();
            // This in turn will be another gather
            llvm::Value *eltValues = LoadInst(eltPtr, mask, eltPtrType, name, need_one_elem);
            if (eltType && eltType->IsBoolType()) {
                eltValues =
                    SwitchBoolToStorageType(eltValues, eltType->LLVMStorageType(g->ctx), "bool_storage_convert");
            }

            retValue = InsertInst(retValue, eltValues, i, "set_value");
        }
        return retValue;
    } else if (ptrType->IsSlice()) {
        // If we have a slice pointer, we need to add the final slice
        // offset here right before issuing the actual gather
        //
        // FIXME: would it be better to do the corresponding same thing for
        // all of the varying offsets stuff here (and in scatter)?
        ptr = lFinalSliceOffset(this, ptr, &ptrType);
    }

    // Otherwise we should just have a basic scalar or pointer type and we
    // can go and do the actual gather
    AddInstrumentationPoint("gather");
    // Figure out which gather function to call based on the size of
    // the elements.
    const PointerType *pt = CastType<PointerType>(returnType);
    const char *funcName = nullptr;
    if (pt != nullptr)
        funcName = g->target->is32Bit() ? builtin::__pseudo_gather32_i32 : builtin::__pseudo_gather64_i64;
    // The bool type is stored as i8.
    else if (returnType->IsBoolType())
        funcName = g->target->is32Bit() ? builtin::__pseudo_gather32_i8 : builtin::__pseudo_gather64_i8;
    else if (llvmReturnType == LLVMTypes::DoubleVectorType)
        funcName = g->target->is32Bit() ? builtin::__pseudo_gather32_double : builtin::__pseudo_gather64_double;
    else if (llvmReturnType == LLVMTypes::Int64VectorType)
        funcName = g->target->is32Bit() ? builtin::__pseudo_gather32_i64 : builtin::__pseudo_gather64_i64;
    else if (llvmReturnType == LLVMTypes::FloatVectorType)
        funcName = g->target->is32Bit() ? builtin::__pseudo_gather32_float : builtin::__pseudo_gather64_float;
    else if (llvmReturnType == LLVMTypes::Float16VectorType)
        funcName = g->target->is32Bit() ? builtin::__pseudo_gather32_half : builtin::__pseudo_gather64_half;
    else if (llvmReturnType == LLVMTypes::Int32VectorType)
        funcName = g->target->is32Bit() ? builtin::__pseudo_gather32_i32 : builtin::__pseudo_gather64_i32;
    else if (llvmReturnType == LLVMTypes::Int16VectorType)
        funcName = g->target->is32Bit() ? builtin::__pseudo_gather32_i16 : builtin::__pseudo_gather64_i16;
    else {
        AssertPos(currentPos, llvmReturnType == LLVMTypes::Int8VectorType);
        funcName = g->target->is32Bit() ? builtin::__pseudo_gather32_i8 : builtin::__pseudo_gather64_i8;
    }

    llvm::Function *gatherFunc = m->module->getFunction(funcName);
    AssertPos(currentPos, gatherFunc != nullptr);
#ifdef ISPC_XE_ENABLED
    if (emitXeHardwareMask()) {
        // Predicate ISPC mask with Xe execution mask so
        // after CMSimdCFLoweringPass pseudo_gather will have correct masked value.
        mask = XeSimdCFPredicate(mask);
    }
#endif

    llvm::Value *gatherCall = CallInst(gatherFunc, nullptr, ptr, mask, name);

    // Add metadata about the source file location so that the
    // optimization passes can print useful performance warnings if we
    // can't optimize out this gather
    if (disableGSWarningCount == 0)
        addGSMetadata(gatherCall, currentPos);

    // The bool type is stored as i8, so it requires some processing.
    if (returnType->IsBoolType()) {
        if (g->target->getDataLayout()->getTypeSizeInBits(returnType->LLVMStorageType(g->ctx)) <
            g->target->getDataLayout()->getTypeSizeInBits(llvmReturnType)) {
            // This is needed when an array of bool is passed in from the C++
            // side; TRUE in clang is '1', which is zero extended to i8.
            // In ispc, this is uniform * varying, which after the gather
            // becomes varying bool. Varying bool in ispc is '-1'. The most
            // significant bit being set to 1 is important for blendv
            // operations to work as expected.
            if (ptrType->GetBaseType()->IsUniformType()) {
                gatherCall = TruncInst(gatherCall, LLVMTypes::Int1VectorType);
                gatherCall = SExtInst(gatherCall, llvmReturnType);
            } else {
                gatherCall = SExtInst(gatherCall, llvmReturnType);
            }
        } else if (g->target->getDataLayout()->getTypeSizeInBits(returnType->LLVMStorageType(g->ctx)) >
                   g->target->getDataLayout()->getTypeSizeInBits(llvmReturnType)) {
|
|
gatherCall = TruncInst(gatherCall, llvmReturnType);
|
|
}
|
|
}
|
|
return gatherCall;
|
|
}
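
// For illustration, a hypothetical sketch (not emitted verbatim): gathering a
// varying float through a varying pointer on a 64-bit target selects
// __pseudo_gather64_float, so ISPC code along the lines of
//     uniform float a[...]; varying int i = ...; varying float x = a[i];
// becomes roughly
//     %x = call <WIDTH x float> @__pseudo_gather64_float(<WIDTH x i64> %ptrs,
//                                                        <WIDTH x MASK> %mask)
// which later optimization passes try to turn back into cheaper loads.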

/** Add metadata to the given instruction to encode the current source file
    position.  This data is used in the lGetSourcePosFromMetadata()
    function in opt.cpp.
*/
void FunctionEmitContext::addGSMetadata(llvm::Value *v, SourcePos pos) {
    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(v);
    if (inst == nullptr)
        return;
    llvm::MDString *str = llvm::MDString::get(*g->ctx, pos.name);
    llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str);
    inst->setMetadata("filename", md);

    llvm::Metadata *first_line = llvm::ConstantAsMetadata::get(LLVMInt32(pos.first_line));
    md = llvm::MDNode::get(*g->ctx, first_line);
    inst->setMetadata("first_line", md);

    llvm::Metadata *first_column = llvm::ConstantAsMetadata::get(LLVMInt32(pos.first_column));
    md = llvm::MDNode::get(*g->ctx, first_column);
    inst->setMetadata("first_column", md);

    llvm::Metadata *last_line = llvm::ConstantAsMetadata::get(LLVMInt32(pos.last_line));
    md = llvm::MDNode::get(*g->ctx, last_line);
    inst->setMetadata("last_line", md);

    llvm::Metadata *last_column = llvm::ConstantAsMetadata::get(LLVMInt32(pos.last_column));
    md = llvm::MDNode::get(*g->ctx, last_column);
    inst->setMetadata("last_column", md);
}
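
// As a sketch, for a gather at foo.ispc line 10, columns 5..17, the call ends
// up carrying custom metadata of roughly this shape:
//     !{!"foo.ispc"} on "filename", !{i32 10} on "first_line", etc.,
// which lGetSourcePosFromMetadata() in opt.cpp reassembles into a SourcePos
// for the performance-warning output.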

llvm::Value *FunctionEmitContext::AddrSpaceCastInst(llvm::Value *val, AddressSpace as, bool atEntryBlock) {
    Assert(llvm::isa<llvm::PointerType>(val->getType()));
    llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(val->getType());
    Assert(pt);
    if (pt->getAddressSpace() == (unsigned)as) {
        return val;
    }
#ifdef ISPC_OPAQUE_PTR_MODE
    llvm::PointerType *newType = llvm::PointerType::get(*g->ctx, (unsigned)as);
#else
    llvm::PointerType *newType = llvm::PointerType::getWithSamePointeeType(pt, (unsigned)as);
#endif
    llvm::AddrSpaceCastInst *inst;
    if (atEntryBlock) {
        inst = new llvm::AddrSpaceCastInst(val, newType, val->getName() + "__cast", allocaBlock->getTerminator());
    } else {
        inst = new llvm::AddrSpaceCastInst(val, newType, val->getName() + "__cast", bblock);
    }

    return inst;
}

AddressInfo *FunctionEmitContext::AllocaInst(llvm::Type *llvmType, llvm::Value *size, const llvm::Twine &name,
                                             int align, bool atEntryBlock) {
    if ((llvmType == nullptr) || (size == nullptr)) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::AllocaInst *inst = nullptr;
    unsigned AS = llvmFunction->getParent()->getDataLayout().getAllocaAddrSpace();
    if (atEntryBlock) {
        // We usually insert it right before the jump instruction at the
        // end of allocaBlock
        llvm::Instruction *retInst = allocaBlock->getTerminator();
        AssertPos(currentPos, retInst);
        inst = new llvm::AllocaInst(llvmType, AS, size, name, retInst);
    } else {
        // Unless the caller overrode the default and wants it in the
        // current basic block
        inst = new llvm::AllocaInst(llvmType, AS, size, name, bblock);
    }

    // If no alignment was specified but we have an array of a uniform
    // type, then align it to the native vector alignment; it's not
    // unlikely that this array will be loaded into varying variables with
    // what will be aligned accesses if the uniform -> varying load is done
    // in regular chunks.
    llvm::ArrayType *arrayType = llvm::dyn_cast<llvm::ArrayType>(llvmType);
    if (align == 0 && arrayType != nullptr && !llvm::isa<llvm::VectorType>(arrayType->getElementType()))
        align = g->target->getNativeVectorAlignment();

    if (align != 0) {
        inst->setAlignment(llvm::MaybeAlign(align).valueOrOne());
    }
    return new AddressInfo(inst, llvmType);
}

AddressInfo *FunctionEmitContext::AllocaInst(llvm::Type *llvmType, const llvm::Twine &name, int align,
                                             bool atEntryBlock) {
    if (llvmType == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::AllocaInst *inst = nullptr;
    unsigned AS = llvmFunction->getParent()->getDataLayout().getAllocaAddrSpace();
    if (atEntryBlock) {
        // We usually insert it right before the jump instruction at the
        // end of allocaBlock
        llvm::Instruction *retInst = allocaBlock->getTerminator();
        AssertPos(currentPos, retInst);
        inst = new llvm::AllocaInst(llvmType, AS, name, retInst);
    } else {
        // Unless the caller overrode the default and wants it in the
        // current basic block
        inst = new llvm::AllocaInst(llvmType, AS, name, bblock);
    }

    // If no alignment was specified but we have an array of a uniform
    // type, then align it to the native vector alignment; it's not
    // unlikely that this array will be loaded into varying variables with
    // what will be aligned accesses if the uniform -> varying load is done
    // in regular chunks.
    llvm::ArrayType *arrayType = llvm::dyn_cast<llvm::ArrayType>(llvmType);
    if (align == 0 && arrayType != nullptr && !llvm::isa<llvm::VectorType>(arrayType->getElementType()))
        align = g->target->getNativeVectorAlignment();

    if (align != 0) {
        inst->setAlignment(llvm::MaybeAlign(align).valueOrOne());
    }
    // Don't add debugging info to alloca instructions
    return new AddressInfo(inst, llvmType);
}

AddressInfo *FunctionEmitContext::AllocaInst(const Type *ptrType, const llvm::Twine &name, int align,
                                             bool atEntryBlock) {
    if (ptrType == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::Type *llvmStorageType = ptrType->LLVMType(g->ctx);
    if ((((CastType<AtomicType>(ptrType) != nullptr) || (CastType<VectorType>(ptrType) != nullptr)) &&
         (ptrType->IsBoolType())) ||
        ((CastType<ArrayType>(ptrType) != nullptr) && (ptrType->GetBaseType()->IsBoolType()))) {
        llvmStorageType = ptrType->LLVMStorageType(g->ctx);
    }

    return AllocaInst(llvmStorageType, name, align, atEntryBlock);
}
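
// A sketch of the alignment heuristic above: on a target where
// getNativeVectorAlignment() == 32, "uniform float a[16]" allocated with
// align == 0 gets roughly
//     %a = alloca [16 x float], align 32
// so later uniform->varying chunked loads from it can use aligned vector ops.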

/** Code to store the given varying value to the given location, only
    storing the elements that correspond to active program instances as
    given by the provided mask value.  Note that the lvalue is only a
    single pointer, not a varying lvalue of one pointer per program
    instance (that case is handled by scatters).
*/
void FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType, llvm::Value *mask) {
    if (value == nullptr || ptr == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return;
    }

    AssertPos(currentPos, CastType<PointerType>(ptrType) != nullptr);
    AssertPos(currentPos, ptrType->IsUniformType());

    const Type *valueType = ptrType->GetBaseType();
    const CollectionType *collectionType = CastType<CollectionType>(valueType);
    if (collectionType != nullptr) {
        // Assigning a structure / array / vector.  Handle each element
        // individually with what turns into a recursive call to
        // maskedStore()
        for (int i = 0; i < collectionType->GetElementCount(); ++i) {
            const Type *eltType = collectionType->GetElementType(i);
            if (eltType == nullptr) {
                Assert(m->errorCount > 0);
                continue;
            }
            llvm::Value *eltValue = ExtractInst(value, i, "value_member");
            llvm::Value *eltPtr = AddElementOffset(new AddressInfo(ptr, ptrType), i, "struct_ptr_ptr");
            const Type *eltPtrType = PointerType::GetUniform(eltType);
            StoreInst(eltValue, eltPtr, mask, eltType, eltPtrType);
        }
        return;
    }

    // We must have a regular atomic, enumerator, or pointer type at this
    // point.
    AssertPos(currentPos, Type::IsBasicType(valueType));
    valueType = valueType->GetAsNonConstType();

    // Figure out if we need an 8, 16, 32 or 64-bit masked store.
    llvm::Function *maskedStoreFunc = nullptr;
    llvm::Type *llvmValueType = value->getType();
    llvm::Type *llvmValueStorageType = llvmValueType;

    const PointerType *pt = CastType<PointerType>(valueType);
    // bool type is stored as i8. So, it requires some processing.
    if ((pt == nullptr) && (valueType->IsBoolType())) {
        llvmValueStorageType = LLVMTypes::BoolVectorStorageType;
    }
    if (pt != nullptr) {
        if (pt->IsSlice()) {
            // Masked store of (varying) slice pointer.
            AssertPos(currentPos, pt->IsVaryingType());

            // First, extract the pointer from the slice struct and masked
            // store that.
            AddressInfo *ptrInfo = new AddressInfo(ptr, ptrType);
            llvm::Value *v0 = ExtractInst(value, 0);
            llvm::Value *p0 = AddElementOffset(ptrInfo, 0);
            maskedStore(v0, p0, PointerType::GetUniform(pt->GetAsNonSlice()), mask);

            // And then do the same for the integer offset
            llvm::Value *v1 = ExtractInst(value, 1);
            llvm::Value *p1 = AddElementOffset(ptrInfo, 1);
            const Type *offsetType = AtomicType::VaryingInt32;
            maskedStore(v1, p1, PointerType::GetUniform(offsetType), mask);

            return;
        }

        if (g->target->is32Bit())
            maskedStoreFunc = m->module->getFunction(builtin::__pseudo_masked_store_i32);
        else
            maskedStoreFunc = m->module->getFunction(builtin::__pseudo_masked_store_i64);
    } else if (llvmValueType == LLVMTypes::Int1VectorType) {
        llvm::Value *notMask =
            BinaryOperator(llvm::Instruction::Xor, mask, LLVMMaskAllOn, WrapSemantics::None, "~mask");
        AddressInfo *ptrInfo = new AddressInfo(ptr, llvmValueStorageType);
        llvm::Value *old = LoadInst(ptrInfo, valueType);
        llvm::Value *maskedOld = BinaryOperator(llvm::Instruction::And, old, notMask, WrapSemantics::None, "old&~mask");
        llvm::Value *maskedNew = BinaryOperator(llvm::Instruction::And, value, mask, WrapSemantics::None, "new&mask");
        llvm::Value *final =
            BinaryOperator(llvm::Instruction::Or, maskedOld, maskedNew, WrapSemantics::None, "old_new_result");
        StoreInst(final, ptrInfo, valueType);
        return;
    } else if (llvmValueStorageType == LLVMTypes::DoubleVectorType) {
        maskedStoreFunc = m->module->getFunction(builtin::__pseudo_masked_store_double);
    } else if (llvmValueStorageType == LLVMTypes::Int64VectorType) {
        maskedStoreFunc = m->module->getFunction(builtin::__pseudo_masked_store_i64);
    } else if (llvmValueStorageType == LLVMTypes::FloatVectorType) {
        maskedStoreFunc = m->module->getFunction(builtin::__pseudo_masked_store_float);
    } else if (llvmValueStorageType == LLVMTypes::Float16VectorType) {
        maskedStoreFunc = m->module->getFunction(builtin::__pseudo_masked_store_half);
    } else if (llvmValueStorageType == LLVMTypes::Int32VectorType) {
        maskedStoreFunc = m->module->getFunction(builtin::__pseudo_masked_store_i32);
    } else if (llvmValueStorageType == LLVMTypes::Int16VectorType) {
        maskedStoreFunc = m->module->getFunction(builtin::__pseudo_masked_store_i16);
    } else if (llvmValueStorageType == LLVMTypes::Int8VectorType) {
        maskedStoreFunc = m->module->getFunction(builtin::__pseudo_masked_store_i8);
        value = SwitchBoolToStorageType(value, llvmValueStorageType);
    }
    AssertPos(currentPos, maskedStoreFunc != nullptr);

#ifdef ISPC_XE_ENABLED
    if (emitXeHardwareMask()) {
        mask = XeSimdCFPredicate(mask);
    }
#endif
    std::vector<llvm::Value *> args;
    args.push_back(ptr);
    args.push_back(value);
    args.push_back(mask);

    CallInst(maskedStoreFunc, nullptr, args);
}
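
// A sketch of the Int1VectorType fallback above: lacking a masked-store
// builtin for i1 vectors, it emits an explicit read-modify-write blend,
//     %old     = load <WIDTH x i1>, ptr %p
//     %keep    = and <WIDTH x i1> %old, %notMask
//     %update  = and <WIDTH x i1> %new, %mask
//     %blended = or  <WIDTH x i1> %keep, %update
//     store <WIDTH x i1> %blended, ptr %p
// which is only safe because the lvalue is a single uniform location.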

/** Scatter the given varying value to the locations given by the varying
    lvalue (which should be an array of pointers with size equal to the
    target's vector width).  We want to store each rvalue element at the
    corresponding pointer's location, *if* the mask for the corresponding
    program instance is on.  If it's off, don't do anything.
*/
void FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr, const Type *valueType, const Type *origPt,
                                  llvm::Value *mask) {
    const PointerType *ptrType = CastType<PointerType>(origPt);
    AssertPos(currentPos, ptrType != nullptr);
    AssertPos(currentPos, ptrType->IsVaryingType());
    const CollectionType *srcCollectionType = CastType<CollectionType>(valueType);
    if (srcCollectionType != nullptr) {
        // We're scattering a collection type--we need to keep track of the
        // source type (the type of the data values to be stored) and the
        // destination type (the type of objects in memory that will be
        // stored into) separately.  This is necessary so that we can get
        // all of the addressing calculations right if we're scattering
        // from a varying struct to an array of uniform instances of the
        // same struct type, versus scattering into an array of varying
        // instances of the struct type, etc.
        const CollectionType *dstCollectionType = CastType<CollectionType>(ptrType->GetBaseType());
        AssertPos(currentPos, dstCollectionType != nullptr);

        // Scatter the collection elements individually
        for (int i = 0; i < srcCollectionType->GetElementCount(); ++i) {
            // First, get the values for the current element out of the
            // source.
            llvm::Value *eltValue = ExtractInst(value, i);
            const Type *srcEltType = srcCollectionType->GetElementType(i);

            // We may be scattering a uniform atomic element; in this case
            // we'll smear it out to be varying before making the recursive
            // scatter() call below.
            if (srcEltType->IsUniformType() && Type::IsBasicType(srcEltType)) {
                eltValue = SmearUniform(eltValue, "to_varying");
                srcEltType = srcEltType->GetAsVaryingType();
            }

            // Get the (varying) pointer to the i'th element of the target
            // collection
            llvm::Value *eltPtr = AddElementOffset(new AddressInfo(ptr, ptrType), i);

            // The destination element type may be uniform (e.g. if we're
            // scattering to an array of uniform structs).  Thus, we need
            // to be careful about passing the correct type to
            // addVaryingOffsetsIfNeeded() here.
            const Type *dstEltType = dstCollectionType->GetElementType(i);
            const PointerType *dstEltPtrType = PointerType::GetVarying(dstEltType);
            if (ptrType->IsSlice())
                dstEltPtrType = dstEltPtrType->GetAsSlice();

            eltPtr = addVaryingOffsetsIfNeeded(eltPtr, dstEltPtrType);

            // And recursively scatter() until we hit a basic type, at
            // which point the actual memory operations can be performed...
            scatter(eltValue, eltPtr, srcEltType, dstEltPtrType, mask);
        }
        return;
    } else if (ptrType->IsSlice()) {
        // As with gather, we need to add the final slice offset once we
        // get to a terminal SOA array of basic types.
        ptr = lFinalSliceOffset(this, ptr, &ptrType);
    }

    const PointerType *pt = CastType<PointerType>(valueType);

    // And everything should be a pointer or atomic (or enum) from here on out...
    AssertPos(currentPos,
              pt != nullptr || CastType<AtomicType>(valueType) != nullptr || CastType<EnumType>(valueType) != nullptr);

    llvm::Type *llvmStorageType = value->getType();
    // bool type is stored as i8. So, it requires some processing.
    if ((pt == nullptr) && (valueType->IsBoolType())) {
        llvmStorageType = LLVMTypes::BoolVectorStorageType;
        value = SwitchBoolToStorageType(value, llvmStorageType);
    }
    const char *funcName = nullptr;
    if (pt != nullptr) {
        funcName = g->target->is32Bit() ? builtin::__pseudo_scatter32_i32 : builtin::__pseudo_scatter64_i64;
    } else if (llvmStorageType == LLVMTypes::DoubleVectorType) {
        funcName = g->target->is32Bit() ? builtin::__pseudo_scatter32_double : builtin::__pseudo_scatter64_double;
    } else if (llvmStorageType == LLVMTypes::Int64VectorType) {
        funcName = g->target->is32Bit() ? builtin::__pseudo_scatter32_i64 : builtin::__pseudo_scatter64_i64;
    } else if (llvmStorageType == LLVMTypes::FloatVectorType) {
        funcName = g->target->is32Bit() ? builtin::__pseudo_scatter32_float : builtin::__pseudo_scatter64_float;
    } else if (llvmStorageType == LLVMTypes::Float16VectorType) {
        funcName = g->target->is32Bit() ? builtin::__pseudo_scatter32_half : builtin::__pseudo_scatter64_half;
    } else if (llvmStorageType == LLVMTypes::Int32VectorType) {
        funcName = g->target->is32Bit() ? builtin::__pseudo_scatter32_i32 : builtin::__pseudo_scatter64_i32;
    } else if (llvmStorageType == LLVMTypes::Int16VectorType) {
        funcName = g->target->is32Bit() ? builtin::__pseudo_scatter32_i16 : builtin::__pseudo_scatter64_i16;
    } else if (llvmStorageType == LLVMTypes::Int8VectorType) {
        funcName = g->target->is32Bit() ? builtin::__pseudo_scatter32_i8 : builtin::__pseudo_scatter64_i8;
    }

    llvm::Function *scatterFunc = m->module->getFunction(funcName);
    AssertPos(currentPos, scatterFunc != nullptr);

    AddInstrumentationPoint("scatter");
#ifdef ISPC_XE_ENABLED
    if (emitXeHardwareMask()) {
        // Predicate the ISPC mask with the Xe execution mask so that
        // after CMSimdCFLoweringPass pseudo_scatter will have the correct masked value.
        mask = XeSimdCFPredicate(mask);
    }
#endif
    std::vector<llvm::Value *> args;
    args.push_back(ptr);
    args.push_back(value);
    args.push_back(mask);
    llvm::Value *inst = CallInst(scatterFunc, nullptr, args);

    if (disableGSWarningCount == 0)
        addGSMetadata(inst, currentPos);
}
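
// A sketch of the leaf case: "a[index] = x;" with varying index and varying
// float x lowers on a 64-bit target to roughly
//     call void @__pseudo_scatter64_float(<WIDTH x i64> %ptrs,
//                                         <WIDTH x float> %x,
//                                         <WIDTH x MASK> %mask)
// and, like gathers, keeps source-position metadata so the optimizer can
// warn when the scatter can't be turned into cheaper stores.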

void FunctionEmitContext::StoreInst(llvm::Value *value, AddressInfo *ptrInfo, const Type *ptrType) {
    if (value == nullptr || ptrInfo == nullptr) {
        // may happen due to error elsewhere
        AssertPos(currentPos, m->errorCount > 0);
        return;
    }
    llvm::Value *ptr = ptrInfo->getPointer();

    llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(ptr->getType());
    AssertPos(currentPos, pt != nullptr);
    if ((ptrType != nullptr) && (ptrType->IsBoolType())) {
        if ((CastType<AtomicType>(ptrType) != nullptr)) {
            value = SwitchBoolToStorageType(value, ptrType->LLVMStorageType(g->ctx));
        } else if (CastType<VectorType>(ptrType) != nullptr) {
            const VectorType *vType = CastType<VectorType>(ptrType);
            if (CastType<AtomicType>(vType->GetElementType()) != nullptr) {
                value = SwitchBoolToStorageType(value, ptrType->LLVMStorageType(g->ctx));
            }
        }
    }

    llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock);

    if (g->opt.forceAlignedMemory && llvm::dyn_cast<llvm::VectorType>(ptrInfo->getElementType())) {
        inst->setAlignment(llvm::MaybeAlign(g->target->getNativeVectorAlignment()).valueOrOne());
    }

    AddDebugPos(inst);
}

void FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr, llvm::Value *mask, const Type *valueType,
                                    const Type *ptrRefType) {
    if (value == nullptr || ptr == nullptr) {
        // may happen due to error elsewhere
        AssertPos(currentPos, m->errorCount > 0);
        return;
    }

    const PointerType *ptrType = RegularizePointer(ptrRefType);
    AddressInfo *ptrInfo = new AddressInfo(ptr, ptrType);

    if (CastType<UndefinedStructType>(ptrType->GetBaseType())) {
        Error(currentPos, "Unable to store to undefined struct type \"%s\".",
              ptrType->GetBaseType()->GetString().c_str());
        return;
    }

    // Figure out what kind of store we're doing here
    if (ptrType->IsUniformType()) {
        if (ptrType->IsSlice())
            // storing a uniform value to a single slice of a SOA type
            storeUniformToSOA(value, ptr, mask, valueType, ptrType);
        else if (ptrType->GetBaseType()->IsUniformType())
            // the easy case
            StoreInst(value, ptrInfo, valueType);
        else if (mask == LLVMMaskAllOn && !g->opt.disableMaskAllOnOptimizations)
            // Otherwise it is a masked store unless we can determine that the
            // mask is all on...  (Unclear if this check is actually useful.)
            StoreInst(value, ptrInfo, valueType);
        else {
            maskedStore(value, ptr, ptrType, mask);
        }
    } else {
        AssertPos(currentPos, ptrType->IsVaryingType());
        // We have a varying ptr (an array of pointers), so it's time to
        // scatter
        scatter(value, ptr, valueType, ptrType, GetFullMask());
    }
}
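
// In summary, a sketch of the dispatch above for StoreInst(v, p, mask, t, pt):
//     pt varying                 -> scatter (one pointer per lane)
//     pt uniform, slice          -> storeUniformToSOA
//     pt uniform, base uniform   -> plain store
//     mask all-on (opt enabled)  -> plain store
//     otherwise                  -> maskedStore via __pseudo_masked_store_*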

/** Store a uniform type to SOA-laid-out memory.
*/
void FunctionEmitContext::storeUniformToSOA(llvm::Value *value, llvm::Value *ptr, llvm::Value *mask,
                                            const Type *valueType, const PointerType *ptrType) {
    AssertPos(currentPos, Type::EqualIgnoringConst(ptrType->GetBaseType()->GetAsUniformType(), valueType));

    const CollectionType *ct = CastType<CollectionType>(valueType);
    if (ct != nullptr) {
        // Handle collections element wise...
        for (int i = 0; i < ct->GetElementCount(); ++i) {
            llvm::Value *eltValue = ExtractInst(value, i);
            const Type *eltType = ct->GetElementType(i);
            const PointerType *dstEltPtrType;
            llvm::Value *dstEltPtr = AddElementOffset(new AddressInfo(ptr, ptrType), i, "slice_offset", &dstEltPtrType);
            StoreInst(eltValue, dstEltPtr, mask, eltType, dstEltPtrType);
        }
    } else {
        // We're finally at a leaf SOA array; apply the slice offset and
        // then we can do a final regular store
        AssertPos(currentPos, Type::IsBasicType(valueType));
        ptr = lFinalSliceOffset(this, ptr, &ptrType);
        StoreInst(value, new AddressInfo(ptr, ptrType), valueType);
    }
}

void FunctionEmitContext::MemcpyInst(llvm::Value *dest, llvm::Value *src, llvm::Value *count, llvm::Value *align) {
    dest = BitCastInst(dest, LLVMTypes::VoidPointerType);
    src = BitCastInst(src, LLVMTypes::VoidPointerType);
    if (count->getType() != LLVMTypes::Int64Type) {
        AssertPos(currentPos, count->getType() == LLVMTypes::Int32Type);
        count = ZExtInst(count, LLVMTypes::Int64Type, "count_to_64");
    }
    if (align == nullptr)
        align = LLVMInt32(1);
    llvm::FunctionCallee mcFuncCallee =
#ifdef ISPC_OPAQUE_PTR_MODE
        m->module->getOrInsertFunction("llvm.memcpy.p0.p0.i64", LLVMTypes::VoidType, LLVMTypes::VoidPointerType,
                                       LLVMTypes::VoidPointerType, LLVMTypes::Int64Type, LLVMTypes::BoolType);
#else
        m->module->getOrInsertFunction("llvm.memcpy.p0i8.p0i8.i64", LLVMTypes::VoidType, LLVMTypes::VoidPointerType,
                                       LLVMTypes::VoidPointerType, LLVMTypes::Int64Type, LLVMTypes::BoolType);
#endif
    llvm::Constant *mcFunc = llvm::cast<llvm::Constant>(mcFuncCallee.getCallee());
    AssertPos(currentPos, mcFunc != nullptr);
    AssertPos(currentPos, llvm::isa<llvm::Function>(mcFunc));

    std::vector<llvm::Value *> args;
    args.push_back(dest);
    args.push_back(src);
    args.push_back(count);
    args.push_back(LLVMFalse); /* not volatile */
#ifdef ISPC_XE_ENABLED
    llvm::Value *callinst = CallInst(mcFunc, nullptr, args, "");
    if (emitXeHardwareMask()) {
        XeUniformMetadata(callinst);
    }
#else
    CallInst(mcFunc, nullptr, args, "");
#endif
}
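
// A sketch of the result, assuming the opaque-pointer path: copying 16 bytes
// emits roughly
//     call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 16, i1 false)
// with the count zero-extended to i64 first when the caller supplied an i32.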

void FunctionEmitContext::setLoopUnrollMetadata(llvm::Instruction *inst,
                                                std::pair<Globals::pragmaUnrollType, int> loopAttribute,
                                                SourcePos pos) {
    if (inst == nullptr) {
        return;
    }

    if (loopAttribute.first == Globals::pragmaUnrollType::none) {
        return;
    }

    llvm::SmallVector<llvm::Metadata *, 4> Args;
    llvm::TempMDTuple TempNode = llvm::MDNode::getTemporary(*g->ctx,
#if ISPC_LLVM_VERSION >= ISPC_LLVM_16_0
                                                            std::nullopt
#else
                                                            llvm::None
#endif
    );
    Args.push_back(TempNode.get());
    if (loopAttribute.first == Globals::pragmaUnrollType::count) {
        llvm::Metadata *Vals[] = {llvm::MDString::get(*g->ctx, "llvm.loop.unroll.count"),
                                  llvm::ConstantAsMetadata::get(LLVMInt32(loopAttribute.second))};
        Args.push_back(llvm::MDNode::get(*g->ctx, Vals));
    } else if (loopAttribute.first == Globals::pragmaUnrollType::unroll) {
        llvm::Metadata *Vals[] = {llvm::MDString::get(*g->ctx, "llvm.loop.unroll.enable")};
        Args.push_back(llvm::MDNode::get(*g->ctx, Vals));
    } else if (loopAttribute.first == Globals::pragmaUnrollType::nounroll) {
        llvm::Metadata *Vals[] = {llvm::MDString::get(*g->ctx, "llvm.loop.unroll.disable")};
        Args.push_back(llvm::MDNode::get(*g->ctx, Vals));
    }
    llvm::MDNode *LoopID = llvm::MDNode::getDistinct(*g->ctx, Args);
    LoopID->replaceOperandWith(0, LoopID);
    inst->setMetadata("llvm.loop", LoopID);
}
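
// For "#pragma unroll(4)" the loop's back-edge branch ends up with
// self-referential metadata along these lines (a sketch, names abbreviated):
//     br i1 %cond, ..., !llvm.loop !0
//     !0 = distinct !{!0, !1}
//     !1 = !{!"llvm.loop.unroll.count", i32 4}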

llvm::Instruction *FunctionEmitContext::BranchInst(llvm::BasicBlock *dest) {
    llvm::Instruction *b = llvm::BranchInst::Create(dest, bblock);
    AddDebugPos(b);
    return b;
}

llvm::Instruction *FunctionEmitContext::BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
                                                   llvm::Value *test) {
    llvm::Instruction *b = nullptr;
    if (test == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return b;
    }
#ifdef ISPC_XE_ENABLED
    if (emitXeHardwareMask())
        test = XePrepareVectorBranch(test);
#endif
    b = llvm::BranchInst::Create(trueBlock, falseBlock, test, bblock);
    AddDebugPos(b);
    return b;
}

llvm::Value *FunctionEmitContext::ExtractInst(llvm::Value *v, int elt, const llvm::Twine &name) {
    if (v == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::Instruction *ei = nullptr;
    if (llvm::isa<llvm::VectorType>(v->getType()))
        ei = llvm::ExtractElementInst::Create(
            v, LLVMInt32(elt),
            name.isTriviallyEmpty() ? ((llvm::Twine(v->getName()) + "_extract_") + llvm::Twine(elt)) : name, bblock);
    else
        ei = llvm::ExtractValueInst::Create(
            v, elt, name.isTriviallyEmpty() ? ((llvm::Twine(v->getName()) + "_extract_") + llvm::Twine(elt)) : name,
            bblock);
    AddDebugPos(ei);
    return ei;
}

llvm::Value *FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, const llvm::Twine &name) {
    if (v == nullptr || eltVal == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::Instruction *ii = nullptr;
    if (llvm::isa<llvm::VectorType>(v->getType()))
        ii = llvm::InsertElementInst::Create(
            v, eltVal, LLVMInt32(elt),
            name.isTriviallyEmpty() ? ((llvm::Twine(v->getName()) + "_insert_") + llvm::Twine(elt)) : name, bblock);
    else
        ii = llvm::InsertValueInst::Create(
            v, eltVal, elt,
            name.isTriviallyEmpty() ? ((llvm::Twine(v->getName()) + "_insert_") + llvm::Twine(elt)) : name, bblock);
    AddDebugPos(ii);
    return ii;
}

llvm::Value *FunctionEmitContext::ShuffleInst(llvm::Value *v1, llvm::Value *v2, llvm::Value *mask,
                                              const llvm::Twine &name) {
    if (v1 == nullptr || v2 == nullptr || mask == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::Instruction *ii = new llvm::ShuffleVectorInst(
        v1, v2, mask, name.isTriviallyEmpty() ? (llvm::Twine(v1->getName()) + "_shuffle") : name, bblock);

    AddDebugPos(ii);
    return ii;
}

llvm::Value *FunctionEmitContext::BroadcastValue(llvm::Value *v, llvm::Type *vecType, const llvm::Twine &name) {
    if (v == nullptr || vecType == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::FixedVectorType *ty = llvm::dyn_cast<llvm::FixedVectorType>(vecType);
    Assert(ty && ty->getElementType() == v->getType());

    // Generate the following sequence:
    //   %name_init.i = insertelement <4 x i32> undef, i32 %val, i32 0
    //   %name.i = shufflevector <4 x i32> %name_init.i, <4 x i32> undef,
    //                           <4 x i32> zeroinitializer

    llvm::Value *undef1 = llvm::UndefValue::get(vecType);
    llvm::Value *undef2 = llvm::UndefValue::get(vecType);

    // InsertElement
    llvm::Value *insert =
        InsertInst(undef1, v, 0, name.isTriviallyEmpty() ? (llvm::Twine(v->getName()) + "_broadcast") : name + "_init");

    // ShuffleVector
    llvm::Constant *zeroVec =
        llvm::ConstantVector::getSplat(llvm::ElementCount::get(static_cast<unsigned int>(ty->getNumElements()), false),
                                       llvm::Constant::getNullValue(llvm::Type::getInt32Ty(*g->ctx)));
    llvm::Value *ret = ShuffleInst(insert, undef2, zeroVec,
                                   name.isTriviallyEmpty() ? (llvm::Twine(v->getName()) + "_broadcast") : name);

    return ret;
}

llvm::PHINode *FunctionEmitContext::PhiNode(llvm::Type *type, int count, const llvm::Twine &name) {
    llvm::PHINode *pn = llvm::PHINode::Create(type, count, name.isTriviallyEmpty() ? "phi" : name, bblock);
    AddDebugPos(pn);
    return pn;
}

llvm::Instruction *FunctionEmitContext::SelectInst(llvm::Value *test, llvm::Value *val0, llvm::Value *val1,
                                                   const llvm::Twine &name) {
    if (test == nullptr || val0 == nullptr || val1 == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    llvm::Instruction *inst = llvm::SelectInst::Create(
        test, val0, val1, name.isTriviallyEmpty() ? (llvm::Twine(test->getName()) + "_select") : name, bblock);
    AddDebugPos(inst);
    return inst;
}

/** Given a value representing a function to be called or a possibly-varying
    pointer to a function to be called, figure out how many arguments the
    function has. */
static unsigned int lCalleeArgCount(llvm::Value *callee, const FunctionType *funcType) {
    llvm::Function *calleeFunc = llvm::dyn_cast<llvm::Function>(callee);
    // Easy case: a plain function callee
    if (calleeFunc) {
        return calleeFunc->getFunctionType()->getNumParams();
    } else {
        // A uniform or varying function pointer must have funcType != nullptr
        Assert(funcType != nullptr);
        // These calls are always unmasked; the others take a mask
        if (funcType->isExternC || funcType->isExternSYCL || funcType->isUnmasked)
            return funcType->GetNumParameters();
        // It cannot be a task on Xe targets
        else {
            if (g->target->isXeTarget()) {
                Assert(funcType->isTask == false);
            }

            return funcType->GetNumParameters() + 1;
        }
    }
}
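
// A sketch of the rule above: for a masked ispc function "float f(float x)"
// the callee takes two LLVM parameters (x plus the trailing execution mask),
// while an extern "C" or unmasked function with one declared parameter takes
// exactly one, so lCalleeArgCount() returns 2 and 1 respectively.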

llvm::Value *FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType,
                                           const std::vector<llvm::Value *> &args, const llvm::Twine &name) {
    if (func == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }
    std::vector<llvm::Value *> argVals;
    // Most of the time, the mask is passed as the last argument.  This
    // isn't the case for things like intrinsics, builtins, and extern "C"
    // functions from the application.  Add the mask if it's needed.
    // There may be more arguments than function parameters for the vararg case.
    unsigned int calleeArgCount = lCalleeArgCount(func, funcType);
    bool disableMask = args.size() == calleeArgCount;

    // For Xe targets check the LLVM function signature and cast the address
    // space of passed arguments if needed
    if (g->target->isXeTarget() && funcType) {
#ifdef ISPC_XE_ENABLED
        llvm::FunctionType *llvmFuncType = funcType->LLVMFunctionType(g->ctx, disableMask);
        Assert(args.size() <= llvmFuncType->getFunctionNumParams());
        for (size_t i = 0; i < args.size(); i++) {
            llvm::Value *adrCast = args[i];
            // Update the addrspace of the passed argument if needed for the Xe target
            adrCast = XeUpdateAddrSpaceForParam(adrCast, llvmFuncType, i);
            argVals.push_back(adrCast);
        }
#endif
    } else {
        argVals = args;
    }

    AssertPos(currentPos, (llvm::isa<llvm::Function>(func) && llvm::cast<llvm::Function>(func)->isVarArg()) ||
                              argVals.size() + 1 == calleeArgCount || argVals.size() == calleeArgCount);
    if (argVals.size() + 1 == calleeArgCount) {
        llvm::Value *mask = nullptr;

#ifdef ISPC_XE_ENABLED
        if (emitXeHardwareMask())
            // This will create the mask according to the current EM on SIMD CF Lowering.
            // The result will be like mask = select (EM, AllOn, AllFalse)
            mask = XeSimdCFPredicate(LLVMMaskAllOn);
        else
#endif
            mask = GetFullMask();
        argVals.push_back(mask);
    }

    if (llvm::isa<llvm::VectorType>(func->getType()) == false) {
        // Regular 'uniform' function call--just one function or function
        // pointer, so just emit the IR directly.
        llvm::FunctionType *func_type = nullptr;

        // Easy case: a plain function callee
        if (llvm::Function *f = llvm::dyn_cast<llvm::Function>(func)) {
            func_type = f->getFunctionType();
        } else {
            // In the case of a uniform function pointer, get the signature from funcType
            Assert(funcType != nullptr);
            func_type = funcType->LLVMFunctionType(g->ctx, disableMask);
        }
        llvm::CallInst *callinst = llvm::CallInst::Create(func_type, func, argVals, name, bblock);

        // We could be dealing with a function pointer, in which case this will not be an 'llvm::Function'.
        // If it is an 'llvm::Function', use the same calling convention as the actual function definition.
        // It's important we do this since prebuilt stdlib functions do not use vectorcall on any OS.
        // If it's a function pointer, it's safe to assume that we use the cached calling convention
        // since this has to be a user-defined function.
        llvm::Function *funcForConv = llvm::dyn_cast<llvm::Function>(func);
        if (funcForConv) {
            callinst->setCallingConv(funcForConv->getCallingConv());
        } else {
            if (g->calling_conv == CallingConv::x86_vectorcall) {
                callinst->setCallingConv(llvm::CallingConv::X86_VectorCall);
            } else {
                if (funcType != nullptr)
                    callinst->setCallingConv(funcType->GetCallingConv());
            }
        }

        llvm::Instruction *ci = callinst;

        // Copy the noalias attribute to the call instruction, to enable
        // better alias analysis.
        // TODO: what other attributes need to be copied?
        // TODO: do the same for the varying path.
        llvm::CallInst *cc = llvm::dyn_cast<llvm::CallInst>(ci);
        if (cc && cc->getCalledFunction()) {
            if (cc->getCalledFunction()->returnDoesNotAlias()) {
#if ISPC_LLVM_VERSION >= ISPC_LLVM_14_0
                cc->addRetAttr(llvm::Attribute::NoAlias);
#else
                cc->addAttribute(llvm::AttributeList::ReturnIndex, llvm::Attribute::NoAlias);
#endif
            }
            // TODO: Add x86 changes as a separate commit
            /* unsigned int argSize = cc->arg_size();
               llvm::Function *calledFunc = cc->getCalledFunction();
               for (int argNum = 0; argNum < argSize; argNum++) {
                   if (calledFunc->getArg(argNum)->hasAttribute(llvm::Attribute::InReg))
                       cc->addParamAttr(argNum, llvm::Attribute::InReg);
               } */
        }

        AddDebugPos(ci);
        return ci;
    } else {
        // Emit the code for a varying function call, where we have a
        // vector of function pointers, one for each program instance.  The
        // basic strategy is that we go through the function pointers, and
        // for the executing program instances, for each unique function
        // pointer that's in the vector, call that function with a mask
        // equal to the set of active program instances that also have that
        // function pointer.  When all unique function pointers have been
        // called, we're done.
        llvm::BasicBlock *bbTest = CreateBasicBlock("varying_funcall_test", GetCurrentBasicBlock());
        llvm::BasicBlock *bbCall = CreateBasicBlock("varying_funcall_call", bbTest);
        llvm::BasicBlock *bbDone = CreateBasicBlock("varying_funcall_done", bbCall);

        llvm::BasicBlock *bbSIMDCall = nullptr;
        llvm::BasicBlock *bbSIMDCallJoin = nullptr;
        if (emitXeHardwareMask()) {
            bbSIMDCall = CreateBasicBlock("varying_funcall_simd_call", bbCall);
            bbSIMDCallJoin = CreateBasicBlock("varying_funcall_simd_call_join", bbSIMDCall);
        }

        // Get the current mask value so we can restore it later
        llvm::Value *origMask = GetInternalMask();

        // First allocate memory to accumulate the various program
        // instances' return values...
        Assert(funcType != nullptr);
        const Type *returnType = funcType->GetReturnType();
        llvm::Type *llvmReturnType = returnType->LLVMType(g->ctx);
        AddressInfo *resultPtrInfo = nullptr;
        Assert(llvmReturnType);
        if (llvmReturnType->isVoidTy() == false)
            resultPtrInfo = AllocaInst(returnType);

        // The memory pointed to by maskPtrInfo tracks the set of program
        // instances for which we still need to call the function they are
        // pointing to.  It starts out initialized with the mask of
        // currently running program instances.
        llvm::Value *oldFullMask = nullptr;
        AddressInfo *maskPtrInfo = AllocaInst(LLVMTypes::MaskType);
        if (emitXeHardwareMask()) {
#ifdef ISPC_XE_ENABLED
            // The current mask will be calculated according to the EM mask
            oldFullMask = XeSimdCFPredicate(LLVMMaskAllOn);
#endif
        } else {
            oldFullMask = GetFullMask();
        }
        Assert(oldFullMask != nullptr && "Mask is not initialized");
        StoreInst(oldFullMask, maskPtrInfo);

        // And now we branch to the test to see if there's more work to be
        // done.
        BranchInst(bbTest);

        // bbTest: are any lanes of the mask still on?  If so, jump to
        // bbCall
        SetCurrentBasicBlock(bbTest);
        {
            llvm::Value *maskLoad = LoadInst(maskPtrInfo);
            llvm::Value *any = Any(maskLoad);
            BranchInst(bbCall, bbDone, any);
        }

        // bbCall: this is the body of the loop that calls out to one of
        // the active function pointer values.
        SetCurrentBasicBlock(bbCall);
        {
            // Figure out the first lane that still needs its function
            // pointer to be called.
            llvm::Value *currentMask = LoadInst(maskPtrInfo);
            llvm::Function *cttz = m->module->getFunction(builtin::__count_trailing_zeros_i64);
            AssertPos(currentPos, cttz != nullptr);
            llvm::Value *firstLane64 = CallInst(cttz, nullptr, LaneMask(currentMask), "first_lane64");
            llvm::Value *firstLane = TruncInst(firstLane64, LLVMTypes::Int32Type, "first_lane32");

            // Get the pointer to the function we're going to call this
            // time through: fptr = func[firstLane]
            llvm::Value *fptr = llvm::ExtractElementInst::Create(func, firstLane, "extract_fptr", bblock);

            // Smear it out into an array of function pointers
            llvm::Value *fptrSmear = SmearUniform(fptr, "func_ptr");

            // fpOverlap = (fpSmearAsVec == fpOrigAsVec).  This gives us a
            // mask for the set of program instances that have the same
            // value for their function pointer.
            llvm::Value *fpOverlap = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, fptrSmear, func);
            fpOverlap = I1VecToBoolVec(fpOverlap);

            // Figure out the mask to use when calling the function
            // pointer: we need to AND the current execution mask to handle
            // the case of any non-running program instances that happen to
            // have this function pointer value.
            // callMask = (currentMask & fpOverlap)
            llvm::Value *callMask =
                BinaryOperator(llvm::Instruction::And, currentMask, fpOverlap, WrapSemantics::None, "call_mask");

            if (emitXeHardwareMask()) {
                // TODO: It seems like it is possible to move the code
                // from the bbSIMDCallJoin block here.  Decide whether
                // it should be done.

                // Execution is performed according to EM for Xe.
                // Branch to the BB where EM is applied for the call.
                BranchInst(bbSIMDCall, bbSIMDCallJoin, callMask);
                // Emit code for the call
                SetCurrentBasicBlock(bbSIMDCall);
            } else {
                // Set the mask
                SetInternalMask(callMask);
            }

            // Convert the i32/i64 function pointer value to the actual
            // function pointer type.
            llvm::Type *llvmFuncType = funcType->LLVMFunctionType(g->ctx);
            llvm::Type *llvmFPtrType = llvm::PointerType::get(llvmFuncType, 0);
            llvm::Value *fptrCast = IntToPtrInst(fptr, llvmFPtrType);

            // Call the function: callResult = call fptr(args..., callMask)
            llvm::Value *callResult = CallInst(fptrCast, funcType, args, name);

            // Now, do a masked store into the memory allocated to
            // accumulate the result using the call mask.
            if (callResult != nullptr && callResult->getType() != LLVMTypes::VoidType) {
                AssertPos(currentPos, resultPtrInfo != nullptr);
                if (emitXeHardwareMask()) {
                    // This store will be predicated during SIMD CF Lowering
                    StoreInst(callResult, resultPtrInfo);
                } else {
                    StoreInst(callResult, resultPtrInfo->getPointer(), callMask, returnType,
                              PointerType::GetUniform(returnType));
                }
            } else
                AssertPos(currentPos, resultPtrInfo == nullptr);

            if (emitXeHardwareMask()) {
                // Finish the SIMDCall BB
                BranchInst(bbSIMDCallJoin);
                SetCurrentBasicBlock(bbSIMDCallJoin);
            }

            // Update the mask to turn off the program instances for which
            // we just called the function.
            // currentMask = currentMask & ~callMask
            llvm::Value *notCallMask =
                BinaryOperator(llvm::Instruction::Xor, callMask, LLVMMaskAllOn, WrapSemantics::None, "~callMask");
            currentMask = BinaryOperator(llvm::Instruction::And, currentMask, notCallMask, WrapSemantics::None,
                                         "currentMask&~callMask");
            StoreInst(currentMask, maskPtrInfo);

            // And go back to the test to see if we need to do another
            // call.
            BranchInst(bbTest);
        }

        // bbDone: We're all done; clean up and return the result we've
        // accumulated in the result memory.
        SetCurrentBasicBlock(bbDone);
        SetInternalMask(origMask);
        return resultPtrInfo ? LoadInst(resultPtrInfo, funcType->GetReturnType()) : nullptr;
    }
}
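
// A sketch of the varying-call loop above, in pseudo-ISPC terms:
//     remaining = current_mask;
//     while (any(remaining)) {
//         f        = fptrs[first_active_lane(remaining)];
//         callMask = remaining & (fptrs == f);
//         result[callMask] = f(args..., callMask);
//         remaining &= ~callMask;
//     }
// so each unique pointer value is called exactly once, with the set of
// lanes that share it.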

llvm::Value *FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, llvm::Value *arg,
                                           const llvm::Twine &name) {
    std::vector<llvm::Value *> args;
    args.push_back(arg);
    return CallInst(func, funcType, args, name);
}

llvm::Value *FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, llvm::Value *arg0,
                                           llvm::Value *arg1, const llvm::Twine &name) {
    std::vector<llvm::Value *> args;
    args.push_back(arg0);
    args.push_back(arg1);
    return CallInst(func, funcType, args, name);
}

llvm::Instruction *FunctionEmitContext::ReturnInst() {
    if (launchedTasks)
        // Add a sync call at the end of any function that launched tasks
        SyncInst();

#ifdef ISPC_XE_ENABLED
    if (emitXeHardwareMask()) {
        // Branch to the return point.  It will turn off lanes
        // in varying CF.  For uniform CF it will be treated
        // as a usual jmp.
        // TODO: this is a temporary workaround and will be
        // changed with the SPIR-V emitting solution
        BranchInst(returnPoint);
        bblock = nullptr;
        // We don't actually create a return instruction here
        return nullptr;
    }
#endif
    // Restore the DAZ/FTZ flags if they were set before the return statement
    if (functionFTZ_DAZValue != nullptr) {
        RestoreFunctionFTZ_DAZFlags();
    }
    llvm::Instruction *rinst = nullptr;
    if (returnValueAddressInfo != nullptr) {
        // We have value(s) to return; load them from their storage
        // location
        llvm::Value *retVal = LoadInst(returnValueAddressInfo, function->GetReturnType(), "return_value");
        rinst = llvm::ReturnInst::Create(*g->ctx, retVal, bblock);
    } else {
        AssertPos(currentPos, function->GetReturnType()->IsVoidType());
        rinst = llvm::ReturnInst::Create(*g->ctx, bblock);
    }

    AddDebugPos(rinst);
    bblock = nullptr;
    return rinst;
}

llvm::Value *FunctionEmitContext::LaunchInst(llvm::Value *callee, std::vector<llvm::Value *> &argVals,
                                             llvm::Value *launchCount[3], const FunctionType *funcType) {
    if (g->target->isXeTarget()) {
        Error(currentPos, "\"launch\" keyword is not supported for Xe targets");
        return nullptr;
    }

    if (callee == nullptr) {
        AssertPos(currentPos, m->errorCount > 0);
        return nullptr;
    }

    if (!(llvm::isa<llvm::Function>(callee) || llvm::isa<llvm::PointerType>(callee->getType()))) {
        Error(currentPos, "Must provide function name or uniform function pointer to \"task\"-qualified function for "
                          "\"launch\" expression");
        return nullptr;
    }

    launchedTasks = true;

    AssertPos(currentPos, funcType != nullptr);
    llvm::Type *llvmFuncType = funcType->LLVMFunctionType(g->ctx);
    AssertPos(currentPos, funcType->LLVMFunctionType(g->ctx)->getFunctionNumParams() > 0);
    llvm::Type *argType = llvmFuncType->getFunctionParamType(0);
    AssertPos(currentPos, llvm::PointerType::classof(argType));

    llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(argType);
    AssertPos(currentPos, pt);
    std::vector<llvm::Type *> llvmArgTypes = funcType->LLVMFunctionArgTypes(g->ctx);
    llvm::StructType *argStructType = llvm::StructType::get(*g->ctx, llvmArgTypes);
    AssertPos(currentPos, argStructType != nullptr);

    llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
    AssertPos(currentPos, falloc != nullptr);
    llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
    if (structSize->getType() != LLVMTypes::Int64Type)
        // ISPCAlloc expects the size as a uint64_t, but on 32-bit
        // targets, SizeOf returns a 32-bit value
        structSize = ZExtInst(structSize, LLVMTypes::Int64Type, "struct_size_to_64");
    int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth());

    std::vector<llvm::Value *> allocArgs;
    allocArgs.push_back(launchGroupHandleAddressInfo->getPointer());
    allocArgs.push_back(structSize);
    allocArgs.push_back(LLVMInt32(align));
    llvm::Value *voidmem = CallInst(falloc, nullptr, allocArgs, "args_ptr");
    llvm::Value *argmem = BitCastInst(voidmem, pt);

    // Copy the values of the parameters into the appropriate place in
    // the argument block
    AddressInfo *argmemInfo = new AddressInfo(argmem, argStructType);
    for (unsigned int i = 0; i < argVals.size(); ++i) {
        llvm::Value *ptr = AddElementOffset(argmemInfo, i, "funarg");
        // don't need to do a masked store here, I think
        StoreInst(argVals[i], new AddressInfo(ptr, llvmArgTypes[i]));
    }

    if (argStructType->getNumElements() == argVals.size() + 1) {
        // copy in the mask
        llvm::Value *mask = GetFullMask();
        llvm::Value *ptr = AddElementOffset(argmemInfo, argVals.size(), "funarg_mask");
        StoreInst(mask, new AddressInfo(ptr, LLVMTypes::MaskType));
    }

    // And emit the call to the user-supplied task launch function, passing
    // a pointer to the task function being called and a pointer to the
    // argument block we just filled in
    llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
    llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
    AssertPos(currentPos, flaunch != nullptr);
    std::vector<llvm::Value *> args;
    args.push_back(launchGroupHandleAddressInfo->getPointer());
    args.push_back(fptr);
    args.push_back(voidmem);
    args.push_back(launchCount[0]);
    args.push_back(launchCount[1]);
    args.push_back(launchCount[2]);
    return CallInst(flaunch, nullptr, args, "");
}
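
// A sketch of the emitted sequence for "launch[n] foo(a, b);" (argument
// types abbreviated):
//     %mem = call @ISPCAlloc(%handle_ptr, i64 %size, i32 %align)
//     ... store a, b, and the current mask into the argument struct ...
//     call @ISPCLaunch(%handle_ptr, %foo, %mem, %n0, %n1, %n2)
// where the application-supplied ISPCAlloc/ISPCLaunch manage the task queue.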

void FunctionEmitContext::SyncInst() {
    if (g->target->isXeTarget()) {
        Error(currentPos, "\"sync\" keyword is not supported for Xe targets");
        return;
    }

    llvm::Value *launchGroupHandle = LoadInst(launchGroupHandleAddressInfo);
    llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
    llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, launchGroupHandle, nullPtrValue);
    llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
    llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
    BranchInst(bSync, bPostSync, nonNull);

    SetCurrentBasicBlock(bSync);
    llvm::Function *fsync = m->module->getFunction("ISPCSync");
    if (fsync == nullptr)
        FATAL("Couldn't find ISPCSync declaration?!");
    CallInst(fsync, nullptr, launchGroupHandle, "");

    // zero out the handle so that if ISPCLaunch is called again in this
    // function, it knows it's starting out from scratch
    StoreInst(nullPtrValue, launchGroupHandleAddressInfo);

    BranchInst(bPostSync);

    SetCurrentBasicBlock(bPostSync);
}

/** When we're gathering from or scattering to a varying atomic type, we
    need to add an appropriate offset to the final address for each lane
    right before we use it.  Given a varying pointer we're about to use and
    its type, this function determines whether these offsets are needed and
    returns an updated pointer that incorporates these offsets if needed.
*/
llvm::Value *FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType) {
    // This should only be called for varying pointers
    const PointerType *pt = CastType<PointerType>(ptrType);
    AssertPos(currentPos, pt && pt->IsVaryingType());

    const Type *baseType = ptrType->GetBaseType();
    if (Type::IsBasicType(baseType) == false)
        return ptr;

    if (baseType->IsVaryingType() == false)
        return ptr;

    // Find the size of a uniform element of the varying type
    llvm::Type *llvmBaseUniformType = baseType->GetAsUniformType()->LLVMType(g->ctx);
    Assert(llvmBaseUniformType);
    llvm::Value *unifSize = g->target->SizeOf(llvmBaseUniformType, bblock);
    unifSize = SmearUniform(unifSize);

    // Compute offset = <0, 1, .. > * unifSize
    bool is32bits = g->target->is32Bit() || g->opt.force32BitAddressing;
    llvm::Value *varyingOffsets = ProgramIndexVector(is32bits);

    llvm::Value *offset = BinaryOperator(llvm::Instruction::Mul, unifSize, varyingOffsets, WrapSemantics::None);

    if (g->opt.force32BitAddressing == true && g->target->is32Bit() == false)
        // On 64-bit targets where we're doing 32-bit addressing
        // calculations, we need to convert to an i64 vector before adding
        // to the pointer
        offset = SExtInst(offset, LLVMTypes::Int64VectorType, "offset_to_64");

    return BinaryOperator(llvm::Instruction::Add, ptr, offset, WrapSemantics::None);
}
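
// A worked sketch: gathering a varying float on an 8-wide target computes
//     offset = <0,1,2,3,4,5,6,7> * sizeof(uniform float)   // <0,4,8,...,28>
// and adds it to each lane's pointer, so lane i reads element i of the
// varying value laid out at the base address.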

CFInfo *FunctionEmitContext::popCFState() {
    AssertPos(currentPos, controlFlowInfo.size() > 0);
    CFInfo *ci = controlFlowInfo.back();
    controlFlowInfo.pop_back();

    if (ci->IsSwitch()) {
        breakTarget = ci->savedBreakTarget;
        continueTarget = ci->savedContinueTarget;
        breakLanesAddressInfo = ci->savedBreakLanesAddressInfo;
        continueLanesAddressInfo = ci->savedContinueLanesAddressInfo;
        blockEntryMask = ci->savedBlockEntryMask;
        switchExpr = ci->savedSwitchExpr;
        switchFallThroughMaskAddressInfo = ci->savedSwitchFallThroughMaskAddressInfo;
        defaultBlock = ci->savedDefaultBlock;
        if (caseBlocks) {
            // Allocated in FunctionEmitContext::SwitchInst
            delete caseBlocks;
        }
        if (nextBlocks) {
            // Allocated in FunctionEmitContext::SwitchInst
            delete nextBlocks;
        }
        caseBlocks = ci->savedCaseBlocks;
        nextBlocks = ci->savedNextBlocks;
        switchConditionWasUniform = ci->savedSwitchConditionWasUniform;
    } else if (ci->IsLoop() || ci->IsForeach()) {
        breakTarget = ci->savedBreakTarget;
        continueTarget = ci->savedContinueTarget;
        breakLanesAddressInfo = ci->savedBreakLanesAddressInfo;
        continueLanesAddressInfo = ci->savedContinueLanesAddressInfo;
        blockEntryMask = ci->savedBlockEntryMask;
    } else {
        AssertPos(currentPos, ci->IsIf());
        // nothing to do
    }

    return ci;
}

#ifdef ISPC_XE_ENABLED
bool FunctionEmitContext::inXeSimdCF() const {
    // Go backwards through controlFlowInfo, since we add new nested scopes
    // to the back.
    if (controlFlowInfo.size() > 0) {
        int i = controlFlowInfo.size() - 1;
        while (i >= 0) {
            if (controlFlowInfo[i]->isUniformEmulated == true)
                // Found a scope due to an 'if' statement with an emulated uniform test
                return true;
            --i;
        }
    }
    return false;
}

llvm::Value *FunctionEmitContext::XeSimdCFAny(llvm::Value *value) {
    AssertPos(currentPos, llvm::isa<llvm::VectorType>(value->getType()));
    llvm::Value *mask = GetInternalMask();
    value = BinaryOperator(llvm::BinaryOperator::And, mask, value, WrapSemantics::None);
    auto Fn = llvm::GenXIntrinsic::getGenXDeclaration(m->module, llvm::GenXIntrinsic::genx_simdcf_any,
                                                      LLVMTypes::Int1VectorType);
    return llvm::CallInst::Create(Fn, value, "", bblock);
}

llvm::Value *FunctionEmitContext::XeSimdCFPredicate(llvm::Value *value, llvm::Value *defaults) {
    AssertPos(currentPos, llvm::isa<llvm::FixedVectorType>(value->getType()));
    llvm::FixedVectorType *vt = llvm::dyn_cast<llvm::FixedVectorType>(value->getType());
    if (defaults == nullptr) {
        defaults = llvm::ConstantVector::getSplat(
            llvm::ElementCount::get(static_cast<unsigned int>(vt->getNumElements()), false),
            llvm::Constant::getNullValue(vt->getElementType()));
    }

    auto Fn = llvm::GenXIntrinsic::getGenXDeclaration(m->module, llvm::GenXIntrinsic::genx_simdcf_predicate,
                                                      value->getType());
    std::vector<llvm::Value *> args;
    args.push_back(value);
    args.push_back(defaults);
    return llvm::CallInst::Create(Fn, args, "", bblock);
}

llvm::Value *FunctionEmitContext::XePrepareVectorBranch(llvm::Value *value) {
    llvm::Value *ret = value;
    // If the condition is varying we just insert the simdcf.any intrinsic.
    // If the condition is a scalar we should change it to a vector, but only
    // if we had a varying condition that was emulated as uniform in outer
    // scopes.
    if (!llvm::isa<llvm::VectorType>(value->getType())) {
        if (!inXeSimdCF())
            return ret;
        ret = BroadcastValue(value, LLVMTypes::Int1VectorType);
    }
    Assert(ret != nullptr);
    return XeSimdCFAny(ret);
}
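// Opens an unmasked region by emitting the GenX unmask.begin intrinsic. The
// returned execution mask is stashed in an alloca so that the matching
// XeEndUnmaskedRegion() call can restore it. Typical pairing (illustrative):
//     llvm::Value *savedMask = XeStartUnmaskedRegion();
//     ... emit unmasked code ...
//     XeEndUnmaskedRegion(savedMask);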
llvm::Value *FunctionEmitContext::XeStartUnmaskedRegion() {
    auto Fn = llvm::GenXIntrinsic::getGenXDeclaration(m->module, llvm::GenXIntrinsic::genx_unmask_begin);
    std::vector<llvm::Value *> args;
    AddressInfo *maskAlloca = AllocaInst(LLVMTypes::Int32Type);
    llvm::Value *execMask = llvm::CallInst::Create(Fn, args, "", bblock);
    StoreInst(execMask, maskAlloca);
    return maskAlloca->getPointer();
}
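// Closes an unmasked region: reloads the execution mask saved by
// XeStartUnmaskedRegion() and passes it to the GenX unmask.end intrinsic.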
void FunctionEmitContext::XeEndUnmaskedRegion(llvm::Value *execMask) {
    llvm::Value *restoredMask = LoadInst(new AddressInfo(execMask, LLVMTypes::MaskType));
    auto Fn = llvm::GenXIntrinsic::getGenXDeclaration(m->module, llvm::GenXIntrinsic::genx_unmask_end);
    llvm::CallInst::Create(Fn, restoredMask, "", bblock);
}
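// Marks an instruction with "ISPC-Uniform" metadata so that the
// CMSIMDCFLowering pass excludes it from predication. Values that are not
// instructions (e.g. constants) are left untouched.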
void FunctionEmitContext::XeUniformMetadata(llvm::Value *v) {
    llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(v);
    // Set ISPC-Uniform to exclude instruction from predication in CMSIMDCFLowering.
    if (inst != nullptr) {
        llvm::MDNode *N = llvm::MDNode::get(*g->ctx, llvm::MDString::get(*g->ctx, "ISPC-Uniform"));
        inst->setMetadata("ISPC-Uniform", N);
    }
}
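// Creates an internal, null-terminated constant string global in the
// constant address space (addrspace 2) and returns a GEP constant pointing
// at its first character.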
llvm::Constant *FunctionEmitContext::XeCreateConstantString(llvm::StringRef str, llvm::StringRef name) {
    auto *initializer = llvm::ConstantDataArray::getString(*g->ctx, str, /* AddNull */ true);
    auto *GV = new llvm::GlobalVariable(*m->module, initializer->getType(),
                                        /* const */ true, llvm::GlobalValue::InternalLinkage, initializer, name,
                                        nullptr, llvm::GlobalVariable::NotThreadLocal,
                                        /* Constant Addrspace */ 2);
#if ISPC_LLVM_VERSION >= ISPC_LLVM_16_0
    GV->setAlignment(llvm::MaybeAlign(g->target->getDataLayout()->getABITypeAlign(initializer->getType())));
#else
    GV->setAlignment(llvm::MaybeAlign(g->target->getDataLayout()->getABITypeAlignment(initializer->getType())));
#endif
    GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);

    return llvm::ConstantExpr::getInBoundsGetElementPtr(GV->getValueType(), GV,
                                                        llvm::ArrayRef<llvm::Constant *>{LLVMInt32(0), LLVMInt32(0)});
}
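// Memoizing wrapper around XeCreateConstantString(): if a global with the
// given name already exists in the module, returns a GEP to it instead of
// creating a duplicate.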
llvm::Constant *FunctionEmitContext::XeGetOrCreateConstantString(llvm::StringRef str, llvm::StringRef name) {
    auto *GV = m->module->getGlobalVariable(name, /* AllowInternal */ true);
    if (GV)
        return llvm::ConstantExpr::getInBoundsGetElementPtr(
            GV->getValueType(), GV, llvm::ArrayRef<llvm::Constant *>{LLVMInt32(0), LLVMInt32(0)});
    return XeCreateConstantString(str, name);
}
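// If both the value and the callee's parameter at paramIndex are pointers
// that live in different address spaces, emits an addrspacecast of the value
// to the parameter's address space; otherwise returns the value unchanged.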
llvm::Value *FunctionEmitContext::XeUpdateAddrSpaceForParam(llvm::Value *val, const llvm::FunctionType *fType,
                                                            const unsigned int paramIndex, bool atEntryBlock) {
    Assert(val != nullptr);
    llvm::Value *adrCast = val;
    if (fType->getFunctionNumParams() > paramIndex) {
        // We need to check the address space for arguments with pointer type only
        llvm::PointerType *valType = llvm::dyn_cast<llvm::PointerType>(val->getType());
        llvm::PointerType *fArgType = llvm::dyn_cast<llvm::PointerType>(fType->getFunctionParamType(paramIndex));
        if (valType && fArgType) {
            // Compare address spaces and make a cast if needed
            const unsigned int paramAddrSpace = fArgType->getPointerAddressSpace();
            const unsigned int argAddrSpace = valType->getPointerAddressSpace();
            if (argAddrSpace != paramAddrSpace) {
                adrCast = AddrSpaceCastInst(val, ispc::AddressSpace(paramAddrSpace), atEntryBlock);
            }
        }
    }
    return adrCast;
}

#endif

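// Returns true when code generation should rely on the Xe hardware execution
// mask: only on Xe targets, and (when ISPC is built with Xe support) only if
// the emitXeHardwareMask optimization option is enabled.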
bool FunctionEmitContext::emitXeHardwareMask() {
    bool emitXeHardwareMask = g->target->isXeTarget();
#ifdef ISPC_XE_ENABLED
    emitXeHardwareMask &= g->opt.emitXeHardwareMask;
#endif
    return emitXeHardwareMask;
}
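// Emits a call to an extern "SYCL" function. On Xe targets, uniform
// arguments and a uniform return value are broadcast to varying so that the
// ISPC-side signature matches what IGC expects at the vISA level; the scalar
// result is then recovered by extracting lane 0 of the broadcast return
// value.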
llvm::Value *FunctionEmitContext::InvokeSyclInst(llvm::Value *func, const FunctionType *funcType,
                                                 const std::vector<llvm::Value *> &args) {
    Assert(funcType != nullptr);
    const Type *returnType = funcType->GetReturnType();
    // Broadcast a uniform return value to varying to match the IGC signature
    // at the vISA level for extern "SYCL" functions on Xe targets
    bool broadcastReturnVal = g->target->isXeTarget() && !returnType->IsVoidType() && returnType->IsUniformType();
    if (broadcastReturnVal) {
        returnType = returnType->GetAsVaryingType();
    }
    // Setting the HW mask before invoking an external function does not work
    // currently and requires unification between the VC and scalar backends,
    // so invoke_sycl is supported in convergent CF only.
    // TODO: enable setting the HW mask when it is supported in the backend
#if 0
    llvm::BasicBlock *bbExternalCall = nullptr;
    llvm::BasicBlock *bbExternalCallJoin = nullptr;
#endif
    AddressInfo *resultPtrInfo = nullptr;
    if (returnType->IsVoidType() == false)
        resultPtrInfo = AllocaInst(returnType);
#if 0
    // Prototype set of HW mask before invoke_sycl call
    if (g->target->isXeTarget()) {
        bbExternalCall = CreateBasicBlock("external_func_call", GetCurrentBasicBlock());
        bbExternalCallJoin = CreateBasicBlock("external_func_join", bbExternalCall);
        llvm::Value *simdcf = XeSimdCFAny(GetInternalMask());
        BranchInst(bbExternalCall, bbExternalCallJoin, simdcf);
        SetCurrentBasicBlock(bbExternalCall);
    }
#endif
    // Broadcast uniform function arguments to varying to match the IGC
    // signature at the vISA level for extern "SYCL" functions on Xe targets
    std::vector<llvm::Value *> argsFinal;
    for (int i = 0; i < args.size(); i++) {
        llvm::Value *argCast = args[i];
        if (g->target->isXeTarget() && funcType->isExternSYCL && funcType->GetParameterType(i)->IsUniformType()) {
            if (!llvm::isa<llvm::VectorType>(argCast->getType())) {
                if (argCast->getType()->isPointerTy()) {
                    argCast = PtrToIntInst(argCast, LLVMTypes::Int64Type, argCast->getName() + "_ptrtoint");
                }
                llvm::VectorType *typecast = llvm::dyn_cast<llvm::VectorType>(
                    funcType->GetParameterType(i)->GetAsVaryingType()->LLVMType(g->ctx));

                argCast = BroadcastValue(argCast, typecast, argCast->getName() + "_broadcast");
            }
        }
        argsFinal.push_back(argCast);
    }

    llvm::Value *callResult = CallInst(func, funcType, argsFinal, returnType->IsVoidType() ? "" : "calltmp");
    if (returnType->IsVoidType() == false) {
        StoreInst(callResult, resultPtrInfo);
    }
#if 0
    // Finish SIMDCall BB
    if (g->target->isXeTarget()) {
        BranchInst(bbExternalCallJoin);
        SetCurrentBasicBlock(bbExternalCallJoin);
    }
#endif
    if (resultPtrInfo) {
        llvm::Value *res = LoadInst(resultPtrInfo, returnType);
        if (broadcastReturnVal) {
            // If the return value was broadcast, extract the first element of
            // the broadcast result and use it as the final result.
            res = ExtractInst(res, 0);
        }
        return res;
    }
    return nullptr;
}

} // namespace ispc