jsoup, HTML Parser example
jsoup HTML Parser 사용 예
jsoup는 자바 라이브러리이며 웹페이지나 HTML 문자열을 받아서 DOM 오브젝트로 사용할 수 있도록 변환해주는 기능을 한다
HTML 파일이나 웹사이트 URL, 혹은 문자열을 읽어 들여서 DOM 객체를 이용하여 내용을 간편하게 가져올 수 있는 기능을 제공한다
특히 select() 메소드는 CSS Query 를 이용하여 특정 태그를 찾아내기 때문에 CSS, jQuery 등을 사용해본적이 있으면 새로 익혀야할 사용법은 거의 없는 것 같다.
Download http://jsoup.org/download
여기저기 사이트로부터 가져온 사용법 예
// Connect to the web site
Document document = Jsoup.connect(url).get();
// Get the html document title
title = document.title();
// Connect to the web site
Document document = Jsoup.connect(url).get();
// Using Elements to get the Meta data
Elements description = document.select("meta[name=description]");
// Locate the content attribute
desc = description.attr("content");
// Connect to the web site
Document document = Jsoup.connect(url).get();
// Select <img> elements with a src attribute inside <h1 class="image-logo">
Elements img = document.select("h1[class=image-logo] img[src]");
// Read src as an absolute URL ("abs:" prefix): java.net.URL rejects a
// relative path such as "/logo.png" with MalformedURLException
String imgSrc = img.attr("abs:src");
// Download the image and decode it; try-with-resources closes the stream
// even if decoding throws, so the connection is not leaked
try (InputStream input = new java.net.URL(imgSrc).openStream()) {
    bitmap = BitmapFactory.decodeStream(input);
}
// Fetch the page and select the <h2> tags whose class attribute is 'topic'
Elements topicList = Jsoup.connect(URL).get().select("h2.topic");
// Fetch the page and run three CSS selections against it
Document doc = Jsoup.connect(url).get();
Elements links = doc.select("a[href]");      // anchors with an href (selected but not printed below)
Elements media = doc.select("[src]");        // any element carrying a src attribute
Elements imports = doc.select("link[href]"); // <link> elements with an href

// Print one line per media element; <img> elements also show width, height and alt.
// Each format ends with %n so that every record lands on its own line.
for (Element src : media) {
    if (src.tagName().equals("img")) {
        // NOTE(review): trim(...) is a helper not shown in this snippet;
        // presumably it shortens the alt text to at most 20 characters - confirm
        System.out.printf(" * %s: <%s> %sx%s (%s)%n",
                src.tagName(), src.attr("abs:src"), src.attr("width"),
                src.attr("height"), trim(src.attr("alt"), 20));
    } else {
        System.out.printf(" * %s: <%s>%n", src.tagName(), src.attr("abs:src"));
    }
}

// Print one line per <link> element: tag name, absolute href and rel
for (Element link : imports) {
    System.out.printf(" * %s <%s> (%s)%n", link.tagName(), link.attr("abs:href"), link.attr("rel"));
}
다음은 위의 jsoup 사용법들을 참고하여 작성해본 테스트 예제입니다
특정 페이지에 접근하여 링크 URL을 가져와서 출력하는 jsoup 예
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches a blog page, follows every frame it embeds, and prints the
 * absolute URL of each link found inside those frames.
 */
public class JsoupTest
{
    /**
     * Loads the start page, selects its {@code <frame src>} elements, fetches
     * each frame's document and prints every {@code a[href]} target on its own line.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if the start page or any frame page cannot be fetched
     */
    public static void main(String[] args) throws IOException
    {
        Document doc = Jsoup.connect("http://blog.naver.com/maccamp").get();
        Elements frames = doc.select("frame[src]"); // CSS selector
        for (Element frame : frames)
        {
            String src = frame.attr("abs:src"); // src returned as an absolute URL
            if (src.isEmpty())
            {
                // The src could not be turned into an absolute URL;
                // Jsoup.connect rejects an empty URL, so skip this frame
                continue;
            }
            // Use a separate variable instead of overwriting the outer document
            Document frameDoc = Jsoup.connect(src).get();
            for (Element link : frameDoc.select("a[href]"))
            {
                System.out.println(link.attr("abs:href"));
            }
        }
    }
}
로컬 시스템에서 HTML, JSP를 사용하여 테스트한 예제
import java.io.IOException;
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
/**
 * Loads a local test page, dumps the documents behind links whose URL
 * contains "12345", then dumps every script element of the start page.
 */
public class JsoupTest
{
    /** Line printed after each of the three dumped representations. */
    private static final String SEPARATOR = "---------------------";

    /**
     * Runs the demo against the locally served index page.
     *
     * @param args command-line arguments (not used)
     * @throws IOException if a page cannot be fetched
     */
    public static void main(String[] args) throws IOException
    {
        Document doc = Jsoup.connect("http://localhost:8888/jsoup/index.html").get();

        // Follow only the anchors whose absolute href contains "12345"
        for (Element anchor : doc.select("a"))
        {
            String link = anchor.attr("abs:href");
            if (!link.contains("12345"))
            {
                continue;
            }
            Document linked = Jsoup.connect(link).get();
            dump(linked.text(), linked.html(), linked.toString());
        }

        // Show the same three representations for every script element
        for (Element script : doc.select("script"))
        {
            dump(script.text(), script.html(), script.toString());
        }
    }

    /**
     * Prints the three representations in order, each followed by a separator line.
     *
     * @param txt  result of text()
     * @param html result of html()
     * @param str  result of toString()
     */
    private static void dump(String txt, String html, String str)
    {
        for (String part : new String[] { txt, html, str })
        {
            System.out.println(part);
            System.out.println(SEPARATOR);
        }
    }
}
출처: http://micropilot.tistory.com/2427 []