Commit 6f3f13c

Fix faulty codegen for unary fp8 operators
Add missing support for unary operators in fp8. FP8 requires casting to fp16 to perform mathematical operations, so this commit handles the casting to and from __half and adds the missing checks for the TIR intrinsics to generate the correct operator signatures.
1 parent 5434141 commit 6f3f13c
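
For context, the pattern this commit makes the CUDA backend emit can be written by hand roughly as follows. This is an illustrative sketch rather than actual TVM output: the kernel, its name, and the choice of hsin are made up for the example; the fp8 handling mirrors the cuda_fp8.h conversions used in codegen_cuda.cc below (widen the e4m3/e5m2 value to __half, apply the half-precision intrinsic, narrow back to fp8).

#include <cuda_fp16.h>
#include <cuda_fp8.h>

// Illustrative hand-written kernel (not generated by TVM): fp8 has no native
// math intrinsics, so each element is widened to __half, the half-precision
// intrinsic is applied, and the result is narrowed back to fp8.
__global__ void unary_sin_e4m3(__nv_fp8_e4m3* out, const __nv_fp8_e4m3* in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    __half x = __half(__nv_cvt_fp8_to_halfraw(in[i].__x, __NV_E4M3));  // fp8 -> half
    out[i] = __nv_fp8_e4m3(hsin(x));                                   // half op, then half -> fp8
  }
}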

File tree

7 files changed: +35 -21 lines

  python/tvm/relax/transform/legalize_ops/nn.py
  src/relax/op/distributed/nn.cc
  src/relax/op/distributed/unary.h
  src/relax/op/nn/nn.cc
  src/relax/op/op_common.h
  src/target/source/codegen_cuda.cc
  src/target/source/intrin_rule_cuda.cc

python/tvm/relax/transform/legalize_ops/nn.py (+2 -2)

@@ -475,8 +475,8 @@ def te_gelu(x: te.Tensor):
         dtype = x.dtype
         erf_inp = x * tir.const(0.5**0.5, dtype)
 
-        if dtype == "float16":
-            erf = topi.math.cast(topi.erf(topi.math.cast(erf_inp, "float32")), "float16")
+        if dtype == "float16" or dtype == "e5m2_float8" or dtype == "e4m3_float8":
+            erf = topi.math.cast(topi.erf(topi.math.cast(erf_inp, "float32")), dtype)
         else:
             erf = topi.erf(erf_inp)
 
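Put differently, when the input dtype is one of the fp8 formats, the erf term is now computed in float32 and only the result is cast back to the input dtype. A rough CUDA sketch of that numeric path for the e4m3 case (illustrative only; the helper name is made up and this is not what TVM emits):

#include <cuda_fp16.h>
#include <cuda_fp8.h>

// Illustrative only: the erf itself runs in float32; the result is narrowed
// back to the original fp8 dtype, matching the cast-around-erf in the diff.
__device__ __nv_fp8_e4m3 erf_term_e4m3(__nv_fp8_e4m3 erf_inp) {
  float wide = float(__half(__nv_cvt_fp8_to_halfraw(erf_inp.__x, __NV_E4M3)));
  return __nv_fp8_e4m3(erff(wide));
}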

src/relax/op/distributed/nn.cc (+2 -1)

@@ -32,7 +32,8 @@ StructInfo InferDistStructInfoSoftmax(const Call& call, const BlockBuilder& ctx)
     ctx->ReportFatal(Diagnostic::Error(call)
                      << "Input of distributed operator must have known ndim");
   }
-  if (!input_tensor_sinfo->IsUnknownDtype() && !input_tensor_sinfo->dtype.is_float()) {
+  if (!input_tensor_sinfo->IsUnknownDtype() && !input_tensor_sinfo->dtype.is_float() &&
+      !input_tensor_sinfo->dtype.is_float16() && !input_tensor_sinfo->dtype.is_float8()) {
     ctx->ReportFatal(Diagnostic::Error(call) << "Softmax requires the input tensor to have float "
                                                 "dtype. However, the given input dtype is "
                                              << input_tensor_sinfo->dtype);

src/relax/op/distributed/unary.h (+2 -1)

@@ -40,7 +40,8 @@ StructInfo InferDistStructInfoUnary(const Call& call, const BlockBuilder& ctx,
   TensorStructInfo input_tensor_sinfo = input_dtensor_sinfo->tensor_sinfo;
 
   if (require_float_dtype && !input_tensor_sinfo->IsUnknownDtype() &&
-      !input_tensor_sinfo->dtype.is_float()) {
+      !input_tensor_sinfo->dtype.is_float() && !input_tensor_sinfo->dtype.is_float16() &&
+      !input_tensor_sinfo->dtype.is_float8()) {
     ctx->ReportFatal(
         Diagnostic::Error(call)
         << call->op

src/relax/op/nn/nn.cc (+2 -1)

@@ -74,7 +74,8 @@ StructInfo InferStructInfoSoftmax(const Call& call, const BlockBuilder& ctx) {
   if (data_sinfo->IsUnknownNdim()) {
     return data_sinfo;
   }
-  if (!data_sinfo->IsUnknownDtype() && !data_sinfo->dtype.is_float()) {
+  if (!data_sinfo->IsUnknownDtype() && !data_sinfo->dtype.is_float() &&
+      !data_sinfo->dtype.is_float16() && !data_sinfo->dtype.is_float8()) {
     ctx->ReportFatal(Diagnostic::Error(call) << "Softmax requires the input tensor to have float "
                                                 "dtype. However, the given input dtype is "
                                              << data_sinfo->dtype);

src/relax/op/op_common.h (+3 -2)

@@ -199,8 +199,9 @@ template <bool require_float_dtype, typename FType>
 inline StructInfo InferStructInfoUnary(const Call& call, const BlockBuilder& ctx,
                                        FType f_compute_out_dtype) {
   TensorStructInfo input_sinfo = GetUnaryInputTensorStructInfo(call, ctx);
-  if (require_float_dtype && !input_sinfo->IsUnknownDtype() &&
-      (!input_sinfo->dtype.is_float() && !input_sinfo->dtype.is_bfloat())) {
+  if (require_float_dtype && !input_sinfo->IsUnknownDtype() && !input_sinfo->dtype.is_float() &&
+      !input_sinfo->dtype.is_bfloat() && !input_sinfo->dtype.is_float16() &&
+      !input_sinfo->dtype.is_float8()) {
     ctx->ReportFatal(
         Diagnostic::Error(call)
         << call->op

src/target/source/codegen_cuda.cc (+23 -13)

@@ -149,8 +149,7 @@ std::string CodeGenCUDA::Finish() {
   if (enable_fp16_) {
     decl_stream << "#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)\n";
     decl_stream << "#include <cuda_fp16.h>\n";
-    decl_stream << "__device__ half max"
-                << "(half a, half b)\n"
+    decl_stream << "__device__ half max" << "(half a, half b)\n"
                 << "{\n return __hgt(__half(a), __half(b)) ? a : b;\n}\n";
     decl_stream << "__device__ half min(half a, half b)\n"
                 << "{\n return __hlt(__half(a), __half(b)) ? a : b;\n}\n";
@@ -165,8 +164,7 @@ std::string CodeGenCUDA::Finish() {
   if (enable_bf16_) {
     decl_stream << "#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)\n";
     decl_stream << "#include <cuda_bf16.h>\n";
-    decl_stream << "__device__ nv_bfloat16 max"
-                << "(nv_bfloat16 a, nv_bfloat16 b)\n"
+    decl_stream << "__device__ nv_bfloat16 max" << "(nv_bfloat16 a, nv_bfloat16 b)\n"
                 << "{\n return __hgt(a, b) ? a : b;\n}\n";
     decl_stream << "__device__ nv_bfloat16 min(nv_bfloat16 a, nv_bfloat16 b)\n"
                 << "{\n return __hlt(a, b) ? a : b;\n}\n";
@@ -542,8 +540,7 @@ void CodeGenCUDA::PrintVecBinaryOp(const std::string& op, DataType t, PrimExpr l
   }
   for (int i = 0, lanes = t.lanes() / 2; i < lanes; ++i) {
     if (isalpha(op[0]) || op[0] == '_') {
-      value_temp << op << "2"
-                 << "(__half2(";
+      value_temp << op << "2" << "(__half2(";
       PrintVecElemLoad(vlhs, lhs.dtype(), i * lanes, value_temp);
       value_temp << "), __half2(";
       PrintVecElemLoad(vrhs, rhs.dtype(), i * lanes, value_temp);
@@ -653,8 +650,7 @@ void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DataType t, int i,
   ICHECK(i >= 0 && i < (t.bits() == 8 ? 16 : (t.bits() == 16 || t.bits() == 32) ? 8 : 4));
   if (t.bits() == 8 && (t.is_int() || t.is_uint())) {
     if (t.lanes() == 2 || t.lanes() == 3) {
-      stream << vec << '.' << access[i % t.lanes()] << "="
-             << "(" << value << ");\n";
+      stream << vec << '.' << access[i % t.lanes()] << "=" << "(" << value << ");\n";
     } else {
       std::string ac = t.lanes() == 4 ? vec : (vec + "." + access[i / 4]);
       stream << ac << "=";
@@ -861,7 +857,23 @@ void CodeGenCUDA::PrintCallExtern(Type ret_type, String global_symbol, const Arr
     }
     os << sret;
   } else {
-    CodeGenC::PrintCallExtern(ret_type, global_symbol, args, skip_first_arg, os);
+    if (ret_dtype.is_float8()) {
+      std::string fp8_type = (ret_dtype.is_e5m2_float8() ? "__NV_E5M2" : "__NV_E4M3");
+      os << "__nv_fp8_" << (ret_dtype.is_e5m2_float8() ? "e5m2" : "e4m3") << "(";
+
+      LOG_INFO << global_symbol;
+      os << global_symbol << "(__half(__nv_cvt_fp8_to_halfraw(";
+      for (size_t i = static_cast<size_t>(skip_first_arg); i < args.size(); ++i) {
+        this->PrintExpr(args[i], os);
+        os << ".__x, " << fp8_type << "))";
+        if (i < args.size() - 1) {
+          os << ", " << "__half(__nv_cvt_fp8_to_halfraw(";
+        }
+      }
+      os << "))";
+    } else {
+      CodeGenC::PrintCallExtern(ret_type, global_symbol, args, skip_first_arg, os);
+    }
   }
 }
 
@@ -1198,8 +1210,7 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
     this->stream << "\" @!p mov.b32 %0, 0;\\n\"\n";
     this->stream << "\" @p ld.global.nc.f32 %0, [%1];}\\n\"\n";
     // stream << "\" @p ld.global.nc.L2::128B.f32 %0, [%1];}\\n\"\n" ;
-    stream << ": \"=f\"(" << reg << "[" << local_addr << "]"
-           << ")\n";
+    stream << ": \"=f\"(" << reg << "[" << local_addr << "]" << ")\n";
     stream << ": \"l\"((void*)(" << global_buffer << "+" << global_addr << ")), \"r\"((int)"
            << guard << ")\n";
     stream << ");\n";
@@ -1385,8 +1396,7 @@ void CodeGenCUDA::VisitExpr_(const RampNode* op, std::ostream& os) {
   PrintVecConstructor(op->dtype, os);
   os << "(";
   for (int i = 0; i < lanes; i++) {
-    os << "(" << PrintExpr(op->base) << ")"
-       << "+(" << PrintExpr(op->stride) << "*" << i << ")";
+    os << "(" << PrintExpr(op->base) << ")" << "+(" << PrintExpr(op->stride) << "*" << i << ")";
     if (i != lanes - 1) os << ", ";
   }
   os << ")";
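
With the fp8 branch in PrintCallExtern above, a unary extern call whose result type is fp8 is printed with its argument widened to __half and the result wrapped back into the fp8 constructor. For a single-argument call returning e5m2 the emitted expression has roughly this shape (a sketch, not verbatim output; x and y are placeholder names, and __habs is the intrinsic name resolved for fabs in intrin_rule_cuda.cc below):

// Approximate shape of the generated expression for a one-argument fp8 call
// (placeholder variable names; __habs is the resolved half-precision intrinsic).
__nv_fp8_e5m2 y = __nv_fp8_e5m2(__habs(__half(__nv_cvt_fp8_to_halfraw(x.__x, __NV_E5M2))));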

src/target/source/intrin_rule_cuda.cc (+1 -1)

@@ -52,7 +52,7 @@ struct CUDAMath {
         default:
           return "";
       }
-    } else if (t.is_bfloat16()) {
+    } else if (t.is_bfloat16() || t.is_float8()) {
       if (name == "fabs") {
         return "__habs";
       } else if (name == "round") {
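
Grouping fp8 with bfloat16 here makes the unary TIR intrinsics resolve to the half-style device functions (for example fabs becomes __habs). That is consistent with the codegen change above: the fp8 operand has already been widened to __half, so a half-precision intrinsic is exactly what the generated call needs. A minimal sketch of the resolved call (the wrapper name is made up):

// __habs is the cuda_fp16.h intrinsic the table returns for fabs on fp8/bf16 inputs;
// it operates on __half, hence the widen/narrow wrapper added in PrintCallExtern.
__device__ __half abs_via_half(__half v) { return __habs(v); }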
