C#中利用正则表达式去除HTML中的格式

最新推荐文章于 2026-04-30 13:39:26 发布

原创最新推荐文章于 2026-04-30 13:39:26 发布 · 4.4k 阅读

3 ·

本内容遵循CC 4.0 BY-SA版权协议

收录于

ASP.NET学习资料

本文介绍了一种使用C#编程语言去除HTML标签的方法，该方法通过正则表达式来清除HTML标签和特殊字符，保留纯文本内容。具体包括删除脚本、注释、特殊实体等，并替换了一些HTML实体为对应的字符。

想去掉除了段落标记之外的所有html标记，只要页面的文字，好比是我把代码贴到记事本里面的效果，去掉了链接等代码。可以试试。

public static string DelHTML(string Htmlstring)//将HTML去除

         {
             #region
             //删除脚本

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"<script[^>]*?>.*?</script>","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             //删除HTML

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"<(.[^>]*)>","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"([/r/n])[/s]+","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"-->","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"<!--.*","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             //Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"<A>.*</A>","");

             //Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"<[a-zA-Z]*=/.[a-zA-Z]*/?[a-zA-Z]+=/d&/w=%[a-zA-Z]*|[A-Z0-9]","");



             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(quot|#34);","/"",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(amp|#38);","&",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(lt|#60);","<",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(gt|#62);",">",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(nbsp|#160);"," ",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(iexcl|#161);","/xa1",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring = System.Text.RegularExpressions.Regex.Replace(Htmlstring,@"&(cent|#162);","/xa2",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(pound|#163);","/xa3",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring,@"&(copy|#169);","/xa9",System.Text.RegularExpressions.RegexOptions.IgnoreCase);

             Htmlstring =System.Text.RegularExpressions. Regex.Replace(Htmlstring, @"&#(/d+);","",System.Text.RegularExpressions.RegexOptions.IgnoreCase);


             Htmlstring.Replace("<","");

             Htmlstring.Replace(">","");

             Htmlstring.Replace("/r/n","");

             //Htmlstring=HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();
             #endregion

             return Htmlstring;

         }

调用时传入原数据．返回的就是除去后的．

标签

#html #正则表达式 #c# #string #脚本