[BugFix][Codegen, CUDA] Fix faulty codegen for FP8

AntonMoberg · AntonMoberg · commit d1495eeb14a2 · 2025-02-24T17:01:37.000+01:00
Fix a bug where the CUDA codegen produces faulty code when a vectorizable
BufferLoadNode has a Float8 element type.

Codegen generated the invalid expression
"make___nv_fp8x2_e5m2(param_0[v_.x], param_0[v_.y])", where "param_0" is
of type "__nv_fp8_e5m2* __restrict__". No "make___nv_fp8..." function
exists, so the generated code does not compile.

This commit adds a missing "is_float8()" check to
CodeGenCUDA::PrintVecElemLoadExpr, which is called for
vectorizable BufferLoadNodes. With the check in place, codegen instead
correctly generates
"__nv_fp8x2_e5m2(make_float2(static_cast<float>(param_0[v_.x]),
static_cast<float>(param_0[v_.y])))".

Additionally, this commit removes the "make_" prefix for float8 in
CodeGenCUDA::PrintVecConstructor, as the correct way to instantiate an
__nv_fp8x2_[e5m2/e4m3] is through the "__nv_fp8x2_[e5m2/e4m3]"
constructor itself.
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
@@ -473,7 +473,9 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) {  // NOLINT(*)
 }
 
 void CodeGenCUDA::PrintVecConstructor(DataType t, std::ostream& os) {
-  os << "make_";
+  if (!t.is_float8()) {
+    os << "make_";  // There is no make___nv_fp8 (/usr/local/cuda/include/vector_functions.hpp)
+  }
   PrintType(t, os);
 }
 
@@ -1554,6 +1556,19 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val
     return;
   }
 
+  if (t.is_float8()) {
+    if (i == 0) {
+      PrintVecConstructor(t, os);
+      os << "(make_float" << t.lanes() << "(";
+    }
+    if (i != 0) os << ", ";
+    os << "static_cast<float>(" << value << ")";
+    if (i == t.lanes() - 1) {
+      os << "))";
+    }
+    return;
+  }
+
   if (i == 0) {
     PrintVecConstructor(t, os);
     os << "(";