有时候,在抓取网站的时候,各个网页的编码方式可能不统一。这种情况下,htmlparser可能无法用正确的编码解码某些网页,于是报错,不能正常地读取。抛出来的异常为:org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23
为了不管网页用的是何种编码方式都能够正常读取数据,我在htmlparser的Page类中加了一个静态字段QICHAODEFAULT_CHARSET。之所以要加在Page类中,是因为htmlparser在遍历所有标签的过程中,会捕获meta标签的content属性,并把从中解析出的编码传给Page的setEncoding(String character_set)方法。
代码如下:
-
-
-
-
-
- package org.htmlparser.lexer;
-
- import java.io.*;
- import java.lang.reflect.InvocationTargetException;
- import java.lang.reflect.Method;
- import java.net.*;
- import java.util.zip.*;
- import org.htmlparser.http.ConnectionManager;
- import org.htmlparser.util.ParserException;
-
-
-
-
-
- public class Page
- implements Serializable
- {
-
- public Page()
- {
- this("");
- }
-
- public Page(URLConnection connection)
- throws ParserException
- {
- if(null == connection)
- {
- throw new IllegalArgumentException("connection cannot be null");
- } else
- {
- setConnection(connection);
- mBaseUrl = null;
- return;
- }
- }
-
- public Page(InputStream stream, String charset)
- throws UnsupportedEncodingException
- {
- if(null == stream)
- throw new IllegalArgumentException("stream cannot be null");
- if(null == charset)
- charset = "ISO-8859-1";
- mSource = new InputStreamSource(stream, charset);
- mIndex = new PageIndex(this);
- mConnection = null;
- mUrl = null;
- mBaseUrl = null;
- }
-
- public Page(String text, String charset)
- {
- if(null == text)
- throw new IllegalArgumentException("text cannot be null");
- if(null == charset)
- charset = "ISO-8859-1";
- mSource = new StringSource(text, charset);
- mIndex = new PageIndex(this);
- mConnection = null;
- mUrl = null;
- mBaseUrl = null;
- }
-
- public Page(String text)
- {
- this(text, null);
- }
-
- public Page(Source source)
- {
- if(null == source)
- {
- throw new IllegalArgumentException("source cannot be null");
- } else
- {
- mSource = source;
- mIndex = new PageIndex(this);
- mConnection = null;
- mUrl = null;
- mBaseUrl = null;
- return;
- }
- }
-
- public static ConnectionManager getConnectionManager()
- {
- return mConnectionManager;
- }
-
- public static void setConnectionManager(ConnectionManager manager)
- {
- mConnectionManager = manager;
- }
-
- public String getCharset(String content)
- {
- String CHARSET_STRING = "charset";
- String ret;
- if(null == mSource)
- ret = "ISO-8859-1";
- else
- ret = mSource.getEncoding();
- if(null != content)
- {
- int index = content.indexOf("charset");
- if(index != -1)
- {
- content = content.substring(index + "charset".length()).trim();
- if(content.startsWith("="))
- {
- content = content.substring(1).trim();
- index = content.indexOf(";");
- if(index != -1)
- content = content.substring(0, index);
- if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length())
- content = content.substring(1, content.length() - 1);
- if(content.startsWith("'") && content.endsWith("'") && 1 < content.length())
- content = content.substring(1, content.length() - 1);
- ret = findCharset(content, ret);
- }
- }
- }
- return ret;
- }
-
- public static String findCharset(String name, String fallback)
- {
- String ret;
- try
- {
- Class cls = Class.forName("java.nio.charset.Charset");
- Method method = cls.getMethod("forName", new Class[] {
- java.lang.String.class
- });
- Object object = method.invoke(null, new Object[] {
- name
- });
- method = cls.getMethod("name", new Class[0]);
- object = method.invoke(object, new Object[0]);
- ret = (String)object;
- }
- catch(ClassNotFoundException cnfe)
- {
- ret = name;
- }
- catch(NoSuchMethodException nsme)
- {
- ret = name;
- }
- catch(IllegalAccessException ia)
- {
- ret = name;
- }
- catch(InvocationTargetException ita)
- {
- ret = fallback;
- System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);
- }
- return ret;
- }
-
- private void writeObject(ObjectOutputStream out)
- throws IOException
- {
- if(null != getConnection())
- {
- out.writeBoolean(true);
- out.writeInt(mSource.offset());
- String href = getUrl();
- out.writeObject(href);
- setUrl(getConnection().getURL().toExternalForm());
- Source source = getSource();
- mSource = null;
- PageIndex index = mIndex;
- mIndex = null;
- out.defaultWriteObject();
- mSource = source;
- mIndex = index;
- } else
- {
- out.writeBoolean(false);
- String href = getUrl();
- out.writeObject(href);
- setUrl(null);
- out.defaultWriteObject();
- setUrl(href);
- }
- }
-
- private void readObject(ObjectInputStream in)
- throws IOException, ClassNotFoundException
- {
- boolean fromurl = in.readBoolean();
- if(fromurl)
- {
- int offset = in.readInt();
- String href = (String)in.readObject();
- in.defaultReadObject();
- if(null != getUrl())
- {
- URL url = new URL(getUrl());
- try
- {
- setConnection(url.openConnection());
- }
- catch(ParserException pe)
- {
- throw new IOException(pe.getMessage());
- }
- }
- Cursor cursor = new Cursor(this, 0);
- for(int i = 0; i < offset; i++)
- try
- {
- getCharacter(cursor);
- }
- catch(ParserException pe)
- {
- throw new IOException(pe.getMessage());
- }
-
- setUrl(href);
- } else
- {
- String href = (String)in.readObject();
- in.defaultReadObject();
- setUrl(href);
- }
- }
-
- public void reset()
- {
- getSource().reset();
- mIndex = new PageIndex(this);
- }
-
- public void close()
- throws IOException
- {
- if(null != getSource())
- getSource().destroy();
- }
-
- protected void finalize()
- throws Throwable
- {
- close();
- }
-
- public URLConnection getConnection()
- {
- return mConnection;
- }
-
- public void setConnection(URLConnection connection)
- throws ParserException
- {
- mConnection = connection;
- mConnection.setConnectTimeout(6000);
- mConnection.setReadTimeout(6000);
- try
- {
- getConnection().connect();
- }
- catch(UnknownHostException uhe)
- {
- throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);
- }
- catch(IOException ioe)
- {
- throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
- }
- String type = getContentType();
- String charset = getCharset(type);
- try
- {
- String contentEncoding = connection.getContentEncoding();
- System.out.println("contentEncoding="+contentEncoding);
- Stream stream;
- if(null != contentEncoding && -1 != contentEncoding.indexOf("gzip"))
- stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));
- else
- if(null != contentEncoding && -1 != contentEncoding.indexOf("deflate"))
- stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));
- else{
- stream = new Stream(getConnection().getInputStream());
- }
-
- try
- {
-
-
-
-
- if(charset.indexOf("ISO-8859-1")!=-1){
- charset = getQICHAODEFAULT_CHARSET();
- }
- mSource = new InputStreamSource(stream, charset);
- }
- catch(UnsupportedEncodingException uee)
- {
- charset = "ISO-8859-1";
- mSource = new InputStreamSource(stream, charset);
- }
- }
- catch(IOException ioe)
- {
- throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);
- }
- mUrl = connection.getURL().toExternalForm();
- mIndex = new PageIndex(this);
- }
-
- public String getUrl()
- {
- return mUrl;
- }
-
- public void setUrl(String url)
- {
- mUrl = url;
- }
-
- public String getBaseUrl()
- {
- return mBaseUrl;
- }
-
- public void setBaseUrl(String url)
- {
- mBaseUrl = url;
- }
-
- public Source getSource()
- {
- return mSource;
- }
-
- public String getContentType()
- {
- String ret = "text/html";
- URLConnection connection = getConnection();
- if(null != connection)
- {
- String content = connection.getHeaderField("Content-Type");
- if(null != content)
- ret = content;
- }
- return ret;
- }
-
- public char getCharacter(Cursor cursor)
- throws ParserException
- {
- int i = cursor.getPosition();
- int offset = mSource.offset();
- char ret;
- if(offset == i)
- try
- {
- i = mSource.read();
- if(-1 == i)
- {
- ret = '\uFFFF';
- } else
- {
- ret = (char)i;
- cursor.advance();
- }
- }
- catch(IOException ioe)
- {
- throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
- }
- else
- if(offset > i)
- {
- try
- {
- ret = mSource.getCharacter(i);
- }
- catch(IOException ioe)
- {
- throw new ParserException("can't read a character at position " + i, ioe);
- }
- cursor.advance();
- } else
- {
- throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());
- }
- if('\r' == ret)
- {
- ret = '\n';
- if(mSource.offset() == cursor.getPosition())
- try
- {
- i = mSource.read();
- if(-1 != i)
- if('\n' == (char)i)
- cursor.advance();
- else
- try
- {
- mSource.unread();
- }
- catch(IOException ioe)
- {
- throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);
- }
- }
- catch(IOException ioe)
- {
- throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);
- }
- else
- try
- {
- if('\n' == mSource.getCharacter(cursor.getPosition()))
- cursor.advance();
- }
- catch(IOException ioe)
- {
- throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
- }
- }
- if('\n' == ret)
- mIndex.add(cursor);
- return ret;
- }
-
- public void ungetCharacter(Cursor cursor)
- throws ParserException
- {
- cursor.retreat();
- int i = cursor.getPosition();
- try
- {
- char ch = mSource.getCharacter(i);
- if('\n' == ch && 0 != i)
- {
- ch = mSource.getCharacter(i - 1);
- if('\r' == ch)
- cursor.retreat();
- }
- }
- catch(IOException ioe)
- {
- throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);
- }
- }
-
- public String getEncoding()
- {
- return getSource().getEncoding();
- }
-
- public void setEncoding(String character_set)
- throws ParserException
- {
- this.QICHAODEFAULT_CHARSET = character_set;
- getSource().setEncoding(character_set);
- }
-
- public URL constructUrl(String link, String base)
- throws MalformedURLException
- {
- return constructUrl(link, base, false);
- }
-
- public URL constructUrl(String link, String base, boolean strict)
- throws MalformedURLException
- {
- int index;
- URL url;
- if(!strict && '?' == link.charAt(0))
- {
- if(-1 != (index = base.lastIndexOf('?')))
- base = base.substring(0, index);
- url = new URL(base + link);
- } else
- {
- url = new URL(new URL(base), link);
- }
- String path = url.getFile();
- boolean modified = false;
- boolean absolute = link.startsWith("/");
- if(!absolute)
- do
- {
- if(!path.startsWith("/."))
- break;
- if(path.startsWith("/../"))
- {
- path = path.substring(3);
- modified = true;
- continue;
- }
- if(!path.startsWith("/./") && !path.startsWith("/."))
- break;
- path = path.substring(2);
- modified = true;
- } while(true);
- while(-1 != (index = path.indexOf("/\\")))
- {
- path = path.substring(0, index + 1) + path.substring(index + 2);
- modified = true;
- }
- if(modified)
- url = new URL(url, path);
- return url;
- }
-
- public String getAbsoluteURL(String link)
- {
- return getAbsoluteURL(link, false);
- }
-
- public String getAbsoluteURL(String link, boolean strict)
- {
- String ret;
- if(null == link || "".equals(link))
- ret = "";
- else
- try
- {
- String base = getBaseUrl();
- if(null == base)
- base = getUrl();
- if(null == base)
- {
- ret = link;
- } else
- {
- URL url = constructUrl(link, base, strict);
- ret = url.toExternalForm();
- }
- }
- catch(MalformedURLException murle)
- {
- ret = link;
- }
- return ret;
- }
-
- public int row(Cursor cursor)
- {
- return mIndex.row(cursor);
- }
-
- public int row(int position)
- {
- return mIndex.row(position);
- }
-
- public int column(Cursor cursor)
- {
- return mIndex.column(cursor);
- }
-
- public int column(int position)
- {
- return mIndex.column(position);
- }
-
- public String getText(int start, int end)
- throws IllegalArgumentException
- {
- String ret;
- try
- {
- ret = mSource.getString(start, end - start);
- }
- catch(IOException ioe)
- {
- throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
- }
- return ret;
- }
-
- public void getText(StringBuffer buffer, int start, int end)
- throws IllegalArgumentException
- {
- if(mSource.offset() < start || mSource.offset() < end)
- throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());
- int length;
- if(end < start)
- {
- length = end;
- end = start;
- start = length;
- }
- length = end - start;
- try
- {
- mSource.getCharacters(buffer, start, length);
- }
- catch(IOException ioe)
- {
- throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
- }
- }
-
- public String getText()
- {
- return getText(0, mSource.offset());
- }
-
- public void getText(StringBuffer buffer)
- {
- getText(buffer, 0, mSource.offset());
- }
-
- public void getText(char array[], int offset, int start, int end)
- throws IllegalArgumentException
- {
- if(mSource.offset() < start || mSource.offset() < end)
- throw new IllegalArgumentException("attempt to extract future characters from source");
- int length;
- if(end < start)
- {
- length = end;
- end = start;
- start = length;
- }
- length = end - start;
- try
- {
- mSource.getCharacters(array, offset, start, end);
- }
- catch(IOException ioe)
- {
- throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());
- }
- }
-
- public String getLine(Cursor cursor)
- {
- int line = row(cursor);
- int size = mIndex.size();
- int start;
- int end;
- if(line < size)
- {
- start = mIndex.elementAt(line);
- if(++line <= size)
- end = mIndex.elementAt(line);
- else
- end = mSource.offset();
- } else
- {
- start = mIndex.elementAt(line - 1);
- end = mSource.offset();
- }
- return getText(start, end);
- }
-
- public String getLine(int position)
- {
- return getLine(new Cursor(this, position));
- }
-
- public String toString()
- {
- String ret;
- if(mSource.offset() > 0)
- {
- StringBuffer buffer = new StringBuffer(43);
- int start = mSource.offset() - 40;
- if(0 > start)
- start = 0;
- else
- buffer.append("...");
- getText(buffer, start, mSource.offset());
- ret = buffer.toString();
- } else
- {
- ret = super.toString();
- }
- return ret;
- }
-
- public static final String DEFAULT_CHARSET = "ISO-8859-1";
- public static String QICHAODEFAULT_CHARSET = "gb2312";
- public static final String DEFAULT_CONTENT_TYPE = "text/html";
- public static final char EOF = 65535;
- protected String mUrl;
- protected String mBaseUrl;
- protected Source mSource;
- protected PageIndex mIndex;
- protected transient URLConnection mConnection;
- protected static ConnectionManager mConnectionManager = new ConnectionManager();
- public static String getQICHAODEFAULT_CHARSET() {
- return QICHAODEFAULT_CHARSET;
- }
-
- }
// Decompiled by Jad v1.5.8f. Copyright 2001 Pavel Kouznetsov.// Jad home page: http://www.kpdus.com/jad.html// Decompiler options: packimports(3)// Source File Name: Page.javapackage org.htmlparser.lexer;import java.io.*;import java.lang.reflect.InvocationTargetException;import java.lang.reflect.Method;import java.net.*;import java.util.zip.*;import org.htmlparser.http.ConnectionManager;import org.htmlparser.util.ParserException;// Referenced classes of package org.htmlparser.lexer:// InputStreamSource, PageIndex, StringSource, Cursor,// Stream, Sourcepublic class Pageimplements Serializable{public Page(){this("");}public Page(URLConnection connection)throws ParserException{if(null == connection){throw new IllegalArgumentException("connection cannot be null");} else{setConnection(connection);mBaseUrl = null;return;}}public Page(InputStream stream, String charset)throws UnsupportedEncodingException{if(null == stream)throw new IllegalArgumentException("stream cannot be null");if(null == charset)charset = "ISO-8859-1";mSource = new InputStreamSource(stream, charset);mIndex = new PageIndex(this);mConnection = null;mUrl = null;mBaseUrl = null;}public Page(String text, String charset){if(null == text)throw new IllegalArgumentException("text cannot be null");if(null == charset)charset = "ISO-8859-1";mSource = new StringSource(text, charset);mIndex = new PageIndex(this);mConnection = null;mUrl = null;mBaseUrl = null;}public Page(String text){this(text, null);}public Page(Source source){if(null == source){throw new IllegalArgumentException("source cannot be null");} else{mSource = source;mIndex = new PageIndex(this);mConnection = null;mUrl = null;mBaseUrl = null;return;}}public static ConnectionManager getConnectionManager(){return mConnectionManager;}public static void setConnectionManager(ConnectionManager manager){mConnectionManager = manager;}public String getCharset(String content){String CHARSET_STRING = "charset";String ret;if(null == mSource)ret = 
"ISO-8859-1";elseret = mSource.getEncoding();if(null != content){int index = content.indexOf("charset");if(index != -1){content = content.substring(index + "charset".length()).trim();if(content.startsWith("=")){content = content.substring(1).trim();index = content.indexOf(";");if(index != -1)content = content.substring(0, index);if(content.startsWith("\"") && content.endsWith("\"") && 1 < content.length())content = content.substring(1, content.length() - 1);if(content.startsWith("'") && content.endsWith("'") && 1 < content.length())content = content.substring(1, content.length() - 1);ret = findCharset(content, ret);}}}return ret;}public static String findCharset(String name, String fallback){String ret;try{Class cls = Class.forName("java.nio.charset.Charset");Method method = cls.getMethod("forName", new Class[] {java.lang.String.class});Object object = method.invoke(null, new Object[] {name});method = cls.getMethod("name", new Class[0]);object = method.invoke(object, new Object[0]);ret = (String)object;}catch(ClassNotFoundException cnfe){ret = name;}catch(NoSuchMethodException nsme){ret = name;}catch(IllegalAccessException ia){ret = name;}catch(InvocationTargetException ita){ret = fallback;System.out.println("unable to determine cannonical charset name for " + name + " - using " + fallback);}return ret;}private void writeObject(ObjectOutputStream out)throws IOException{if(null != getConnection()){out.writeBoolean(true);out.writeInt(mSource.offset());String href = getUrl();out.writeObject(href);setUrl(getConnection().getURL().toExternalForm());Source source = getSource();mSource = null;PageIndex index = mIndex;mIndex = null;out.defaultWriteObject();mSource = source;mIndex = index;} else{out.writeBoolean(false);String href = getUrl();out.writeObject(href);setUrl(null);out.defaultWriteObject();setUrl(href);}}private void readObject(ObjectInputStream in)throws IOException, ClassNotFoundException{boolean fromurl = in.readBoolean();if(fromurl){int offset = 
in.readInt();String href = (String)in.readObject();in.defaultReadObject();if(null != getUrl()){URL url = new URL(getUrl());try{setConnection(url.openConnection());}catch(ParserException pe){throw new IOException(pe.getMessage());}}Cursor cursor = new Cursor(this, 0);for(int i = 0; i < offset; i++)try{getCharacter(cursor);}catch(ParserException pe){throw new IOException(pe.getMessage());}setUrl(href);} else{String href = (String)in.readObject();in.defaultReadObject();setUrl(href);}}public void reset(){getSource().reset();mIndex = new PageIndex(this);}public void close()throws IOException{if(null != getSource())getSource().destroy();}protected void finalize()throws Throwable{close();}public URLConnection getConnection(){return mConnection;}public void setConnection(URLConnection connection)throws ParserException{mConnection = connection;mConnection.setConnectTimeout(6000);mConnection.setReadTimeout(6000);try{getConnection().connect();}catch(UnknownHostException uhe){throw new ParserException("Connect to " + mConnection.getURL().toExternalForm() + " failed.", uhe);}catch(IOException ioe){throw new ParserException("Exception connecting to " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);}String type = getContentType();String charset = getCharset(type);try{String contentEncoding = connection.getContentEncoding();System.out.println("contentEncoding="+contentEncoding);Stream stream;if(null != contentEncoding && -1 != contentEncoding.indexOf("gzip"))stream = new Stream(new GZIPInputStream(getConnection().getInputStream()));elseif(null != contentEncoding && -1 != contentEncoding.indexOf("deflate"))stream = new Stream(new InflaterInputStream(getConnection().getInputStream(), new Inflater(true)));else{stream = new Stream(getConnection().getInputStream());}try{/** 时间:2008年12月23日* 原因:当String charset = getCharset(type);返回来的是ISO-8859-1的时候,需要处理一下*/if(charset.indexOf("ISO-8859-1")!=-1){charset = getQICHAODEFAULT_CHARSET();} mSource = new 
InputStreamSource(stream, charset);}catch(UnsupportedEncodingException uee){charset = "ISO-8859-1";mSource = new InputStreamSource(stream, charset);}}catch(IOException ioe){throw new ParserException("Exception getting input stream from " + mConnection.getURL().toExternalForm() + " (" + ioe.getMessage() + ").", ioe);}mUrl = connection.getURL().toExternalForm();mIndex = new PageIndex(this);}public String getUrl(){return mUrl;}public void setUrl(String url){mUrl = url;}public String getBaseUrl(){return mBaseUrl;}public void setBaseUrl(String url){mBaseUrl = url;}public Source getSource(){return mSource;}public String getContentType(){String ret = "text/html";URLConnection connection = getConnection();if(null != connection){String content = connection.getHeaderField("Content-Type");if(null != content)ret = content;}return ret;}public char getCharacter(Cursor cursor)throws ParserException{int i = cursor.getPosition();int offset = mSource.offset();char ret;if(offset == i)try{i = mSource.read();if(-1 == i){ret = '\uFFFF';} else{ret = (char)i;cursor.advance();}}catch(IOException ioe){throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);}elseif(offset > i){try{ret = mSource.getCharacter(i);}catch(IOException ioe){throw new ParserException("can't read a character at position " + i, ioe);}cursor.advance();} else{throw new ParserException("attempt to read future characters from source " + i + " > " + mSource.offset());}if('\r' == ret){ret = '\n';if(mSource.offset() == cursor.getPosition())try{i = mSource.read();if(-1 != i)if('\n' == (char)i)cursor.advance();elsetry{mSource.unread();}catch(IOException ioe){throw new ParserException("can't unread a character at position " + cursor.getPosition(), ioe);}}catch(IOException ioe){throw new ParserException("problem reading a character at position " + cursor.getPosition(), ioe);}elsetry{if('\n' == mSource.getCharacter(cursor.getPosition()))cursor.advance();}catch(IOException ioe){throw new 
ParserException("can't read a character at position " + cursor.getPosition(), ioe);}}if('\n' == ret)mIndex.add(cursor);return ret;}public void ungetCharacter(Cursor cursor)throws ParserException{cursor.retreat();int i = cursor.getPosition();try{char ch = mSource.getCharacter(i);if('\n' == ch && 0 != i){ch = mSource.getCharacter(i - 1);if('\r' == ch)cursor.retreat();}}catch(IOException ioe){throw new ParserException("can't read a character at position " + cursor.getPosition(), ioe);}}public String getEncoding(){return getSource().getEncoding();}public void setEncoding(String character_set)throws ParserException{this.QICHAODEFAULT_CHARSET = character_set;getSource().setEncoding(character_set);}public URL constructUrl(String link, String base)throws MalformedURLException{return constructUrl(link, base, false);}public URL constructUrl(String link, String base, boolean strict)throws MalformedURLException{int index;URL url;if(!strict && '?' == link.charAt(0)){if(-1 != (index = base.lastIndexOf('?')))base = base.substring(0, index);url = new URL(base + link);} else{url = new URL(new URL(base), link);}String path = url.getFile();boolean modified = false;boolean absolute = link.startsWith("/");if(!absolute)do{if(!path.startsWith("/."))break;if(path.startsWith("/../")){path = path.substring(3);modified = true;continue;}if(!path.startsWith("/./") && !path.startsWith("/."))break;path = path.substring(2);modified = true;} while(true);while(-1 != (index = path.indexOf("/\\"))){path = path.substring(0, index + 1) + path.substring(index + 2);modified = true;}if(modified)url = new URL(url, path);return url;}public String getAbsoluteURL(String link){return getAbsoluteURL(link, false);}public String getAbsoluteURL(String link, boolean strict){String ret;if(null == link || "".equals(link))ret = "";elsetry{String base = getBaseUrl();if(null == base)base = getUrl();if(null == base){ret = link;} else{URL url = constructUrl(link, base, strict);ret = 
url.toExternalForm();}}catch(MalformedURLException murle){ret = link;}return ret;}public int row(Cursor cursor){return mIndex.row(cursor);}public int row(int position){return mIndex.row(position);}public int column(Cursor cursor){return mIndex.column(cursor);}public int column(int position){return mIndex.column(position);}public String getText(int start, int end)throws IllegalArgumentException{String ret;try{ret = mSource.getString(start, end - start);}catch(IOException ioe){throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());}return ret;}public void getText(StringBuffer buffer, int start, int end)throws IllegalArgumentException{if(mSource.offset() < start || mSource.offset() < end)throw new IllegalArgumentException("attempt to extract future characters from source" + start + "|" + end + " > " + mSource.offset());int length;if(end < start){length = end;end = start;start = length;}length = end - start;try{mSource.getCharacters(buffer, start, length);}catch(IOException ioe){throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());}}public String getText(){return getText(0, mSource.offset());}public void getText(StringBuffer buffer){getText(buffer, 0, mSource.offset());}public void getText(char array[], int offset, int start, int end)throws IllegalArgumentException{if(mSource.offset() < start || mSource.offset() < end)throw new IllegalArgumentException("attempt to extract future characters from source");int length;if(end < start){length = end;end = start;start = length;}length = end - start;try{mSource.getCharacters(array, offset, start, end);}catch(IOException ioe){throw new IllegalArgumentException("can't get the " + (end - start) + "characters at position " + start + " - " + ioe.getMessage());}}public String getLine(Cursor cursor){int line = row(cursor);int size = mIndex.size();int start;int end;if(line < 
size){start = mIndex.elementAt(line);if(++line <= size)end = mIndex.elementAt(line);elseend = mSource.offset();} else{start = mIndex.elementAt(line - 1);end = mSource.offset();}return getText(start, end);}public String getLine(int position){return getLine(new Cursor(this, position));}public String toString(){String ret;if(mSource.offset() > 0){StringBuffer buffer = new StringBuffer(43);int start = mSource.offset() - 40;if(0 > start)start = 0;elsebuffer.append("...");getText(buffer, start, mSource.offset());ret = buffer.toString();} else{ret = super.toString();}return ret;}public static final String DEFAULT_CHARSET = "ISO-8859-1";public static String QICHAODEFAULT_CHARSET = "gb2312";public static final String DEFAULT_CONTENT_TYPE = "text/html";public static final char EOF = 65535;protected String mUrl;protected String mBaseUrl;protected Source mSource;protected PageIndex mIndex;protected transient URLConnection mConnection;protected static ConnectionManager mConnectionManager = new ConnectionManager();public static String getQICHAODEFAULT_CHARSET() {return QICHAODEFAULT_CHARSET;}}
在调用的时候,代码如下:
- Parser parser = new Parser(url);
- parser.setEncoding(parser.getLexer().getPage().getQICHAODEFAULT_CHARSET());
Parser parser = new Parser(url);parser.setEncoding(parser.getLexer().getPage().getQICHAODEFAULT_CHARSET());
一般情况下,设置成这样应该是没问题的。但是,你有时候看到的编码方式并不一定是该网页实际的编码方式。比如说,肉眼看到页面中有<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />,htmlparser可以正常地获取到,代码如下:
- public void doSemanticAction()
- throws ParserException
- {
- String httpEquiv = getHttpEquiv();
- if("Content-Type".equalsIgnoreCase(httpEquiv))
- {
- String charset = getPage().getCharset(getAttribute("CONTENT"));
- getPage().setEncoding(charset);
- }
- }
public void doSemanticAction()throws ParserException{String httpEquiv = getHttpEquiv();if("Content-Type".equalsIgnoreCase(httpEquiv)){String charset = getPage().getCharset(getAttribute("CONTENT"));getPage().setEncoding(charset);}}
但是,不要认为这个就一定是它的编码方式。htmlparser中还进行了一次判断:在类Page中,有个方法是获取HTTP响应头字段Content-Type的,代码如下:
- public String getContentType()
- {
- String ret = "text/html";
- URLConnection connection = getConnection();
- if(null != connection)
- {
- String content = connection.getHeaderField("Content-Type");
- if(null != content)
- ret = content;
- }
- return ret;
- }
public String getContentType(){String ret = "text/html";URLConnection connection = getConnection();if(null != connection){String content = connection.getHeaderField("Content-Type");if(null != content)ret = content;}return ret;}
如果这两处得到的编码不一样,htmlparser在切换编码时会用新编码重新解码已经读过的字符,并与原来的字符逐个比较,一旦不一致,它就报了
- org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23
org.htmlparser.util.EncodingChangeException: character mismatch (new: 中 [0x4e2d] != old: [0xd6?]) for encoding change from ISO-8859-1 to GB2312 at character offset 23
所以,我在类InputStreamSource中又做了一次修改:
代码如下:
- public void setEncoding(String character_set)
- throws ParserException
- {
- String encoding = getEncoding();
-
-
-
- if(encoding!=null){
- character_set = encoding;
- }
- if(!encoding.equalsIgnoreCase(character_set))
- {
- InputStream stream = getStream();
- try
- {
- char buffer[] = mBuffer;
- int offset = mOffset;
- stream.reset();
- try
- {
- mEncoding = character_set;
- mReader = new InputStreamReader(stream, character_set);
- mBuffer = new char[mBuffer.length];
- mLevel = 0;
- mOffset = 0;
- mMark = -1;
- if(0 != offset)
- {
- char new_chars[] = new char[offset];
- if(offset != read(new_chars))
- throw new ParserException("reset stream failed");
- for(int i = 0; i < offset; i++)
- if(new_chars[i] != buffer[i])
- throw new EncodingChangeException("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString(new_chars[i], 16) + "] != old: " + " [0x" + Integer.toString(buffer[i], 16) + buffer[i] + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i);
-
- }
- }
- catch(IOException ioe)
- {
- throw new ParserException(ioe.getMessage(), ioe);
- }
- }
- catch(IOException ioe)
- {
- throw new ParserException("Stream reset failed (" + ioe.getMessage() + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe);
- }
- }
- }
public void setEncoding(String character_set)throws ParserException{String encoding = getEncoding();/*** time:2008年12月23日*/if(encoding!=null){character_set = encoding;}if(!encoding.equalsIgnoreCase(character_set)){InputStream stream = getStream();try{char buffer[] = mBuffer;int offset = mOffset;stream.reset();try{mEncoding = character_set;mReader = new InputStreamReader(stream, character_set);mBuffer = new char[mBuffer.length];mLevel = 0;mOffset = 0;mMark = -1;if(0 != offset){char new_chars[] = new char[offset];if(offset != read(new_chars))throw new ParserException("reset stream failed");for(int i = 0; i < offset; i++)if(new_chars[i] != buffer[i])throw new EncodingChangeException("character mismatch (new: " + new_chars[i] + " [0x" + Integer.toString(new_chars[i], 16) + "] != old: " + " [0x" + Integer.toString(buffer[i], 16) + buffer[i] + "]) for encoding change from " + encoding + " to " + character_set + " at character offset " + i);}}catch(IOException ioe){throw new ParserException(ioe.getMessage(), ioe);}}catch(IOException ioe){throw new ParserException("Stream reset failed (" + ioe.getMessage() + "), try wrapping it with a org.htmlparser.lexer.Stream", ioe);}}}
这样一来,不管网页用什么编码方式,应该都不会再抛出这个异常了。需要注意的是,这样修改之后,InputStreamSource的setEncoding实际上不会再切换编码,页面中meta标签声明的编码会被忽略。