# ---- Toolchain ---------------------------------------------------------------
# Host C++ compiler (used for every CPU object and for both final links) and
# the CUDA compiler driver (used for *.cu.cpp sources in the GPU build).
CC := g++-10
NVCC := nvcc

# Host compile flags. -MMD makes the compiler write a .d dependency file next
# to each object it produces.
CFLAGS := -Wall -std=c++17 -fopenmp -MMD
# NOTE(review): -Xcompiler normally takes an argument. As the last word here it
# will pick up whatever token follows $(NVCC_FLAGS) on the nvcc command line
# (currently -DCUDA) -- confirm that this is intended.
NVCC_FLAGS := -MMD -w -Xcompiler

# ---- Project layout ----------------------------------------------------------
# Name stem of the executables; the rules append CPU / GPU to it.
TARGET := arrayFunctions
# Where sources are searched for, and where objects are written.
SRC_DIR := .
BUILD_DIR := build

# ---- Search paths and libraries ----------------------------------------------
# Space-separated lists; each entry is turned into -I / -L / -l further down.
INCLUDE := ../../
LIBS_DIR :=
LIBS :=
# Extra library directory and libraries used only by the GPU link.
LIBS_DIR_GPU := /usr/local/cuda/lib64
LIBS_GPU := cuda cudart cublas

# Should not need to modify below.
// Walks through basic CudaTools::Array usage: reshaping, element access,
// slicing, copying into a slice, deep copying and iteration. Each stage prints
// the affected array to stdout. Returns 0.
int main() {
    // NOTE(review): the array is reshaped to 4*5*5 = 100 elements right below;
    // confirm that constant(0) produces an array reshape() accepts for that shape.
    CudaTools::Array<int> arr = CudaTools::Array<int>::constant(0);
    arr.reshape({4, 5, 5}); // Creates a three dimensional array.

    arr[0][0][0] = 1; // Axis by axis indexing.
    arr[{1, 0, 0}] = 100; // Specific 'coordinate' indexing.
    std::cout << arr << "\n";

    CudaTools::Array<int> arrRange = CudaTools::Array<int>::range(18);
    // NOTE(review): if slice bounds are half-open, {1, 2} x {1, 4} x {1, 4}
    // selects 1*3*3 = 9 elements while arrRange presumably holds 18 -- confirm
    // the intended bounds against the CudaTools API.
    auto arrSlice = arr.slice({{1, 2}, {1, 4}, {1, 4}}); // Takes a slice of the center.
    std::cout << "Before Copy:\n" << arrSlice << "\n";
    arrSlice = arrRange; // Copies arrRange into arrSlice. (Does NOT replace!)
    std::cout << "After Copy:\n" << arrSlice << "\n";

    std::cout << "Modified: \n" << arr << "\n"; // The original array is modified, since a slice does not copy.

    CudaTools::Array<int> newArr = arr.copy(); // Copies the original Array.
    for (auto it = newArr.begin(); it != newArr.end(); ++it) { // Iterate through the array.
        *it = 1; // Overwrite every element of the copy.
    }
    std::cout << "Modified New Array:\n" << newArr << "\n";
    std::cout << "Old Array:\n" << arr << "\n"; // The original array was not modified after a copy.
    return 0;
}
# Per-flavour object directories underneath $(BUILD_DIR).
CPU_BUILD_DIR = $(BUILD_DIR)/cpu
GPU_BUILD_DIR = $(BUILD_DIR)/gpu

# Every .cpp file directly in $(SRC_DIR) or exactly one directory below it.
# Expanded once (:=) so the filesystem is not re-globbed on every reference.
SRC := $(wildcard $(SRC_DIR)/*/*.cpp) $(wildcard $(SRC_DIR)/*.cpp)

# Get source files and object files.
# Sources named *.cu.cpp go to NVCC in the GPU build; everything else goes to
# the host compiler. Object names are relative to the build directory.
GCC_SRC = $(filter-out %.cu.cpp ,$(SRC))
NVCC_SRC = $(filter %.cu.cpp, $(SRC))
GCC_OBJ = $(GCC_SRC:$(SRC_DIR)/%.cpp=%.o)
NVCC_OBJ = $(NVCC_SRC:$(SRC_DIR)/%.cpp=%.o)

# If compiling for CPU, all go to GCC. Otherwise, they are split.
CPU_OBJ = $(addprefix $(CPU_BUILD_DIR)/,$(GCC_OBJ)) $(addprefix $(CPU_BUILD_DIR)/,$(NVCC_OBJ))
GPU_GCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(GCC_OBJ))
GPU_NVCC_OBJ = $(addprefix $(GPU_BUILD_DIR)/,$(NVCC_OBJ))

# Uncomment to print the computed lists while debugging the Makefile.
# $(info $$GCC_SRC is [${GCC_SRC}])
# $(info $$NVCC_SRC is [${NVCC_SRC}])
# $(info $$GCC_OBJ is [${GCC_OBJ}])
# $(info $$NVCC_OBJ is [${NVCC_OBJ}])

# $(info $$CPU_OBJ is [${CPU_OBJ}])
# $(info $$GPU_GCC_OBJ is [${GPU_GCC_OBJ}])
# $(info $$GPU_NVCC_OBJ is [${GPU_NVCC_OBJ}])

HEADER := $(wildcard $(SRC_DIR)/*/*.h) $(wildcard $(SRC_DIR)/*.h)

# Dependency files written by -MMD sit next to their objects. Derive the list
# from the object lists rather than globbing $(…_BUILD_DIR)/*.d, so that
# objects built from sources in subdirectories are covered as well.
CPU_DEPS = $(CPU_OBJ:.o=.d)
GPU_DEPS = $(GPU_GCC_OBJ:.o=.d) $(GPU_NVCC_OBJ:.o=.d)

# Turn the plain lists from the configuration section into compiler/linker
# options: -I<dir>, -L<dir> and -l<lib>.
INC := $(INCLUDE:%=-I%)
LIB := $(LIBS_DIR:%=-L%)
LIB_GPU := $(LIBS_DIR_GPU:%=-L%)
LD := $(LIBS:%=-l%)
LD_GPU := $(LIBS_GPU:%=-l%)

# Reminder:
# $< = first prerequisite
# $@ = the target which matched the rule
# $^ = all prerequisites
# $(@D) = directory part of the target

# all, cpu and gpu are command names, not files; declare them phony so a stray
# file with one of these names cannot mask the target.
.PHONY: all clean cpu gpu

all : cpu gpu

cpu: $(TARGET)CPU
gpu: $(TARGET)GPU

# CPU executable: every source (including *.cu.cpp) is compiled by $(CC).
# $(LD) carries the -l options generated from LIBS; $(LDFLAGS) is kept so that
# flags supplied from the environment or command line are still honoured.
$(TARGET)CPU: $(CPU_OBJ)
	$(CC) $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LD) $(LDFLAGS)

# A single %.o pattern also covers foo.cu.o (stem "foo.cu" -> foo.cu.cpp).
# Listing a second target pattern would declare that one recipe run produces
# both files, which it does not. The object's own directory is created first
# so sources found one level below $(SRC_DIR) have somewhere to go.
$(CPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(CPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -c -o $@ $< $(INC)

# For GPU, we need to build the NVCC objects, the NVCC linked object, and the
# regular ones. Then, we link them all together.
# The NVCC objects are listed as prerequisites so that $^ expands to all three
# groups in the order: NVCC objects, device-link object, host objects.
$(TARGET)GPU: $(GPU_NVCC_OBJ) $(GPU_BUILD_DIR)/link.o $(GPU_GCC_OBJ) | $(GPU_BUILD_DIR)
	$(CC) -g -DCUDA $(CFLAGS) $^ -o $@ $(INC) $(LIB) $(LIB_GPU) $(LD) $(LD_GPU)

# Device-link step over all NVCC-compiled objects.
$(GPU_BUILD_DIR)/link.o: $(GPU_NVCC_OBJ) | $(GPU_BUILD_DIR)
	$(NVCC) --device-link $^ -o $@

# *.cu.cpp sources are compiled as CUDA (-x cu) with CUDA defined. This rule
# must stay ahead of the generic %.o rule below, which also matches *.cu.o.
# The object's own directory is created first to support sources that live one
# level below $(SRC_DIR).
$(GPU_BUILD_DIR)/%.cu.o: $(SRC_DIR)/%.cu.cpp | $(GPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(NVCC) $(NVCC_FLAGS) -DCUDA -x cu --device-c -o $@ $< $(INC)

# Remaining sources are compiled by the host compiler, also with CUDA defined.
$(GPU_BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp | $(GPU_BUILD_DIR)
	@mkdir -p $(@D)
	$(CC) $(CFLAGS) -g -DCUDA -c -o $@ $< $(INC)

# Pull in the dependency files produced by -MMD for both build flavours.
# The leading '-' keeps make quiet when they do not exist yet.
-include $(CPU_DEPS) $(GPU_DEPS)

# Object directories; referenced as order-only prerequisites by the compile
# rules so that they exist before the first object is written.
$(CPU_BUILD_DIR) $(GPU_BUILD_DIR):
	mkdir -p $@

# Remove the whole build tree and both executables.
clean:
	rm -Rf $(BUILD_DIR) $(TARGET)CPU $(TARGET)GPU
