C#实现将HTML转换成纯文本的方法
本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下:
使用方法:
HtmlToText convert = new HtmlToText(); textBox2.Text = convert.Convert(textBox1.Text);
C#代码如下:
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// This is a small hand-written scanner, not a full HTML parser. Block-level
/// tags are turned into line breaks or tabs, the content of a few elements
/// (script, noscript, style, object) is dropped, runs of whitespace are
/// collapsed outside of &lt;pre&gt;, and character entities are decoded at
/// the very end. An instance keeps per-conversion state in its fields, so a
/// single instance must not run two conversions at the same time.
/// </remarks>
class HtmlToText
{
    // Static data tables

    // Maps a lower-case tag name (closing tags include the leading '/') to
    // the text that is written to the output when that tag is encountered.
    protected static Dictionary<string, string> _tags;

    // Lower-case names of elements whose entire inner content is discarded.
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;   // Output accumulator
    protected string _html;        // Input currently being converted
    protected int _pos;            // Current read position within _html

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated as
    /// an empty document.</param>
    /// <returns>Resulting plain text, with character entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded last so that decoded '<' or '&' characters
        // are never re-interpreted as markup by the loop above.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the tag at the current position
    /// and returns information about that tag.
    /// </summary>
    /// <param name="selfClosing">Set to true when the last unquoted,
    /// non-whitespace character before the closing '&gt;' is '/'.</param>
    /// <returns>The lower-case tag name (closing tags keep their leading
    /// '/'), "!--" for an HTML comment, or an empty string when the current
    /// character is not '&lt;'.</returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // A comment ends at "-->", not at the first '>', so it is
            // skipped as a unit; otherwise text such as "a > b" inside a
            // comment would leak into the output.
            if (String.CompareOrdinal(_html, _pos, "!--", 0, 3) == 0)
            {
                int commentEnd = _html.IndexOf("-->", _pos + 3, StringComparison.Ordinal);
                _pos = (commentEnd < 0) ? _html.Length : commentEnd + 3;
                return "!--";
            }

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }

            // Invariant lower-casing keeps the lookup keys stable under
            // cultures with special casing rules (e.g. Turkish 'I').
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                {
                    EatQuotedValue();
                    selfClosing = false;
                }
                else
                {
                    // Only a '/' that is the last significant character
                    // marks a self-closing tag; a '/' inside an unquoted
                    // attribute value (href=/x) must not.
                    if (!Char.IsWhiteSpace(Peek()))
                    {
                        selfClosing = (Peek() == '/');
                    }
                    MoveAhead();
                }
            }
            MoveAhead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the element whose start tag has just
    /// been parsed, up to and including its matching end tag. If no end tag
    /// is found, the rest of the input is consumed.
    /// </summary>
    /// <param name="tag">Lower-case name of the element being skipped.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        if (IsRawTextTag(tag))
        {
            // Script and style bodies are not markup: a '<' in them (as in
            // "a < b") does not start a tag. Search for the end tag
            // directly instead of tokenizing the content.
            string marker = "<" + endTag;
            int search = _pos;
            while (true)
            {
                int close = _html.IndexOf(marker, search, StringComparison.OrdinalIgnoreCase);
                if (close < 0)
                {
                    _pos = _html.Length;
                    return;
                }

                // Reject look-alikes such as "</scripts": the name must be
                // followed by whitespace, '/', '>' or the end of input.
                int after = close + marker.Length;
                if (after >= _html.Length || _html[after] == '>' ||
                    _html[after] == '/' || Char.IsWhiteSpace(_html[after]))
                {
                    _pos = close;
                    bool ignored;
                    ParseTag(out ignored);
                    return;
                }
                search = after;
            }
        }

        // Other ignored elements may nest within themselves, so count the
        // open elements of the same name and stop when the count hits zero.
        int depth = 1;
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string inner = ParseTag(out selfClosing);
                if (inner == endTag)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return;
                    }
                }
                else if (inner == tag && !selfClosing)
                {
                    depth++;
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    // Returns true for elements whose content is raw text rather than markup
    private static bool IsRawTextTag(string tag)
    {
        return tag == "script" || tag == "style";
    }

    // Returns true if the current position is at the end of the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position, or (char)0 when
    // the position is at or past the end of the input
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character; the
    // position never moves past the end of the input
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace character
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    // Moves the current position to the next non-whitespace character or
    // the start of the next line, whichever comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    // Moves the current position past a quoted value. The search also stops
    // at a line break, so an unterminated quote cannot consume more than the
    // rest of the current line.
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead(); // Closing quote (or line break)
            }
        }
    }

    /// <summary>
    /// A StringBuilder-like class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;      // Completed output
        private StringBuilder _currLine;  // Line currently being assembled
        private int _emptyLines;          // Consecutive empty lines seen so far
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Flush any pending line before switching to
                    // preformatted mode, because preformatted characters
                    // are appended straight to the output
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">Text to write; null is ignored.</param>
        public void Write(string s)
        {
            if (s == null)
            {
                return;
            }
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    FlushCurrLine();
                }
                else if (Char.IsWhiteSpace(c))
                {
                    // Write a single space, collapsing runs of whitespace
                    int len = _currLine.Length;
                    if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                    {
                        _currLine.Append(' ');
                    }
                }
                else
                {
                    // Add character to current line
                    _currLine.Append(c);
                }
            }
        }

        // Appends the current line to the output buffer. At most one empty
        // line is kept in a row, and none at the very start of the output.
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            if (line.Length == 0)
            {
                // An empty line
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string, flushing any pending
        /// line first.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}
希望本文所述对大家的C#程序设计有所帮助。