C#实现将HTML转换成纯文本的方法
本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下:
使用方法:
HtmlToText convert = new HtmlToText(); textBox2.Text = convert.Convert(textBox1.Text);
C#代码如下:
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter is a small hand-written scanner: it walks the input one
/// character at a time, maps selected tags to line breaks or tabs, skips the
/// content of non-textual elements, collapses runs of whitespace (except
/// inside &lt;pre&gt;), and finally decodes HTML entities.
/// Parsing state is kept in instance fields, so a single instance must not
/// be used by several threads at the same time.
/// </remarks>
class HtmlToText
{
    // Static data tables
    // Maps a lower-case tag name (closing tags carry a leading '/') to the
    // text that is written to the output when that tag is encountered.
    protected static Dictionary<string, string> _tags;
    // Lower-case names of elements whose entire inner content is discarded.
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;   // Output accumulator
    protected string _html;        // Input being converted
    protected int _pos;            // Current read position within _html

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated as an empty string.</param>
    /// <returns>Resulting plain text with HTML entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            char current = Peek();

            if (current == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(current))
            {
                // Whitespace (treat all as a single space unless preformatted)
                _text.Write(_text.Preformatted ? current : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(current);
                MoveAhead();
            }
        }

        // Entities are decoded last so that an encoded '<' (&lt;) in the
        // text is never mistaken for the start of a tag by the scanner.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the tag at the current position
    /// and returns information about that tag.
    /// </summary>
    /// <param name="selfClosing">
    /// Set to true when the last non-whitespace character before the closing
    /// '&gt;' is '/', as in &lt;br /&gt;.
    /// </param>
    /// <returns>
    /// The lower-case tag name (with a leading '/' for closing tags), "!--"
    /// for a comment, or an empty string if the current character is not '&lt;'.
    /// </returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            // Comments may legally contain '>' and quote characters, so they
            // cannot go through the generic tag scanner below; skip straight
            // to the terminating "-->" (or to the end of input if missing).
            if (String.CompareOrdinal(_html, _pos, "<!--", 0, 4) == 0)
            {
                int commentEnd = _html.IndexOf("-->", _pos + 4, StringComparison.Ordinal);
                _pos = (commentEnd < 0) ? _html.Length : commentEnd + 3;
                return "!--";
            }

            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }
            // Invariant lower-casing: the culture-sensitive ToLower() maps
            // 'I' to a dotless 'ı' under Turkish cultures, which would stop
            // "SCRIPT" or "DIV" from matching the lookup tables.
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                char current = Peek();
                if (current == '"' || current == '\'')
                {
                    selfClosing = false;
                    EatQuotedValue();
                }
                else
                {
                    // Only a '/' that directly precedes '>' marks a
                    // self-closing tag; a '/' inside an unquoted attribute
                    // value (e.g. href=http://x) must not.
                    if (current == '/')
                    {
                        selfClosing = true;
                    }
                    else if (!Char.IsWhiteSpace(current))
                    {
                        selfClosing = false;
                    }
                    MoveAhead();
                }
            }
            MoveAhead();    // Closing '>'
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the element whose opening tag has just
    /// been parsed, up to and including its matching closing tag. If no
    /// closing tag is found, the remainder of the input is consumed.
    /// </summary>
    /// <param name="tag">Lower-case name of the element being skipped.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        if (IsRawTextElement(tag))
        {
            // The content of <script>/<style> is not markup: a '<' inside it
            // (e.g. "a<b") must not be parsed as a tag. Jump directly to the
            // literal closing tag instead.
            int close = _html.IndexOf("<" + endTag, _pos, StringComparison.OrdinalIgnoreCase);
            if (close < 0)
            {
                _pos = _html.Length;
                return;
            }
            _pos = close;
            bool ignored;
            ParseTag(out ignored);  // Consume the closing tag itself
            return;
        }

        // Count only nested elements of the same name. Other nested tags
        // (including void elements such as <img> that never get a closing
        // tag) must not affect where the skipped region ends.
        int depth = 1;
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string name = ParseTag(out selfClosing);

                if (name == endTag)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return;
                    }
                }
                else if (name == tag)
                {
                    if (!selfClosing)
                    {
                        depth++;
                    }
                }
                else if (IsRawTextElement(name))
                {
                    // Nested script/style: skip its raw body so its content
                    // cannot be misread as tags.
                    EatInnerContent(name);
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    // Returns true for elements whose content is raw text rather than markup
    private static bool IsRawTextElement(string tag)
    {
        return tag == "script" || tag == "style";
    }

    /// <summary>
    /// Returns true if the current position is at the end of the input.
    /// </summary>
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    /// <summary>
    /// Safely returns the character at the current position, or the null
    /// character when the position is at the end of the input.
    /// </summary>
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    /// <summary>
    /// Safely advances the current position to the next character; the
    /// position never moves past the end of the input.
    /// </summary>
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character.
    /// </summary>
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character or
    /// the start of the next line, whichever comes first.
    /// </summary>
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    /// <summary>
    /// Moves the current position past a quoted value. Does nothing if the
    /// current character is not a quote.
    /// </summary>
    protected void EatQuotedValue()
    {
        char quote = Peek();
        if (quote == '"' || quote == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value. A line break also ends the value so that an
            // unterminated quote cannot swallow the rest of the document.
            _pos = _html.IndexOfAny(new char[] { quote, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead();    // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;        // Completed output
        private StringBuilder _currLine;    // Line currently being assembled
        private int _emptyLines;            // Consecutive empty lines seen so far
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    FlushCurrLine();
                }
                else if (Char.IsWhiteSpace(c))
                {
                    // Write single space character
                    int len = _currLine.Length;
                    if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                    {
                        _currLine.Append(' ');
                    }
                }
                else
                {
                    // Add character to current line
                    _currLine.Append(c);
                }
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line; after Trim() a line holding only whitespace
            // is empty, so the length alone tells whether it has content.
            string line = _currLine.ToString().Trim();

            if (line.Length == 0)
            {
                // An empty line: keep at most one in a row, and none at
                // the very start of the output
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}
希望本文所述对大家的C#程序设计有所帮助。