최신 게시글(JAVA)
2017.07.10 / 17:14

jsoup, HTML Parser example

XMaLL관리자
추천 수 230

jsoup HTML Parser 사용 예



jsoup는 자바 라이브러리이며 웹페이지나 HTML 문자열을 받아서 DOM 오브젝트로 사용할 수 있도록 변환해주는 기능을 한다

HTML 파일이나 웹사이트 URL, 혹은 문자열을 읽어 들여서 DOM 객체를 이용하여 내용을 간편하게 가져올 수 있는 기능을 제공한다

특히 select() 메소드는 CSS Query 를 이용하여 특정 태그를 찾아내기 때문에 CSS, jQuery 등을 사용해본 적이 있으면 새로 익혀야 할 사용법은 거의 없는 것 같다.


Download http://jsoup.org/download


여기저기 사이트로부터 가져온 사용법 예

// Snippet 1: fetch a page and read the text of its <title>.
// NOTE(review): `url` and `title` are not declared in this fragment —
// presumably they are defined by the surrounding code; confirm before reuse.

// Connect to the web site

Document document = Jsoup.connect(url).get();

// Get the html document title

title = document.title();


// Snippet 2: fetch a page and read the content of <meta name="description">.
// NOTE(review): this re-declares `document`, so snippets 1 and 2 are
// alternatives and cannot be pasted into the same scope as-is. `desc` is
// likewise assigned here but declared elsewhere.

// Connect to the web site

Document document = Jsoup.connect(url).get();

// Using Elements to get the Meta data

Elements description = document.select("meta[name=description]");

// Locate the content attribute

desc = description.attr("content");


// Snippet: locate the logo image of a page, download it and decode it.
// NOTE(review): `url` and `bitmap` are not declared in this fragment —
// presumably they are defined by the surrounding (Android) code.

// Connect to the web site
Document document = Jsoup.connect(url).get();

// Select <img> elements carrying a src attribute inside <h1 class="image-logo">
Elements img = document.select("h1[class=image-logo] img[src]");

// Read the src attribute as an absolute URL. The "abs:" prefix resolves a
// relative path against the document's base URI; a bare relative path would
// make the java.net.URL constructor below throw MalformedURLException.
String imgSrc = img.attr("abs:src");

// Download image from URL
InputStream input = new java.net.URL(imgSrc).openStream();
try
{
    // Decode Bitmap
    bitmap = BitmapFactory.decodeStream(input);
}
finally
{
    // Always release the network stream, even when decoding fails
    input.close();
}


// Snippet: fetch the page at URL and select elements with a CSS class query.
Document doc  = Jsoup.connect(URL).get();

Elements topicList = doc.select("h2.topic"); // h2 tags whose class attribute is 'topic'


// Snippet: fetch a page and list its media elements and <link> imports,
// one entry per output line.
// NOTE(review): `url` and the helper `trim(String, int)` are not declared in
// this fragment — presumably they are defined by the surrounding code.
Document doc = Jsoup.connect(url).get();

Elements links = doc.select("a[href]");      // anchors that have an href
Elements media = doc.select("[src]");        // any element that has a src
Elements imports = doc.select("link[href]"); // <link> elements that have an href


for (Element src : media)
{
  if (src.tagName().equals("img"))
  {
     // Images additionally report their width/height and a shortened alt text.
     // %n terminates the entry so consecutive entries do not run together.
     System.out.printf(" * %s: <%s> %sx%s (%s)%n",
       src.tagName(), src.attr("abs:src"), src.attr("width"),
       src.attr("height"), trim(src.attr("alt"), 20));
  }
  else
  {
     System.out.printf(" * %s: <%s>%n", src.tagName(), src.attr("abs:src"));
  }
}


for (Element link : imports)
{
   // "abs:href" yields the link target resolved to an absolute URL
   System.out.printf(" * %s <%s> (%s)%n", link.tagName(), link.attr("abs:href"), link.attr("rel"));
}



다음은 위의 jsoup 사용법들을 참고하여 작성해본 테스트 예제입니다


특정 페이지에 접근하여 링크 URL을 가져와서 출력하는 jsoup 예

import java.io.IOException;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.jsoup.nodes.Element;

import org.jsoup.select.Elements;


public class JsoupTest 

{

    public static void main(String[] args) throws IOException

    {

        Document doc = Jsoup.connect("http://blog.naver.com/maccamp").get();

        //System.out.printf(doc.toString());

        

        Elements elems =  doc.select("frame[src]"); //CSS ¼¿·ºÅÍ

        for(Element elem : elems)

        {

            //String txt = elem.text();

            //String html = elem.html();

            String src = elem.attr("abs:src"); //Àý´ë°æ·Î·Î º¯°æÇÏ¿© ¸®ÅÏ

            doc = Jsoup.connect(src).get();

            Elements elems2 = doc.select("a[href]");

            for(Element e : elems2)

            {

                System.out.println(e.attr("abs:href"));

            }

            //System.out.printf("%s\n", doc.toString());

        }

        

    }

}



로컬 시스템에서 HTML, JSP를 사용하여 테스트한 예제


import java.io.IOException;

import org.jsoup.*;

import org.jsoup.nodes.*;

import org.jsoup.select.*;


public class JsoupTest 

{

    public static void main(String[] args) throws IOException 

    {

        Document doc = Jsoup.connect("http://localhost:8888/jsoup/index.html").get();

        

        Elements elems = doc.select("a");

        

        for(Element ele : elems)

        {

            String link = ele.attr("abs:href");

            if(link.indexOf("12345")!=-1)

            {

                Document doc2 = Jsoup.connect(link).get();

                

                String txt = doc2.text();

                String html = doc2.html();

                String str = doc2.toString();

                

                System.out.println(txt);

                System.out.println("---------------------");

                System.out.println(html);

                System.out.println("---------------------");

                System.out.println(str);

                System.out.println("---------------------");

            }

        }

        

        elems = doc.select("script");

        for(Element ele : elems)

        {

            String txt = ele.text();

            String html = ele.html();

            String str = ele.toString();


            System.out.println(txt);

            System.out.println("---------------------");

            System.out.println(html);

            System.out.println("---------------------");

            System.out.println(str);

            System.out.println("---------------------");

        }

    }

    

}



index.html
<!DOCTYPE html>
<HTML>
 <HEAD>
  <TITLE> jsoup Test </TITLE>
  <script type="javascript">
function greetings() 
{
alert('Hello everybody!');
}
  </script>
 </HEAD>
 <BODY>
  <a href="sample.jsp?num=12345"> Click </a>
 </BODY>
</HTML>


sample.jsp
<%@ page contentType="text/html;charset=euc-kr"%>
<%="You got the sample.jsp"%>


위의 jsoup 예제를 실행하면 다음과 같은 결과가 출력됩니다

You got the sample.jsp
---------------------
<html>
 <head></head>
 <body>
   You got the sample.jsp 
 </body>
</html>
---------------------
<html>
 <head></head>
 <body>
   You got the sample.jsp 
 </body>
</html>
---------------------

---------------------
function greetings() 
{
alert('Hello everybody!');
}
---------------------
<script type="javascript">
function greetings() 
{
alert('Hello everybody!');
}
  </script>
---------------------




출처: http://micropilot.tistory.com/2427 []