Pull out text within specific divs on a webpage











up vote
4
down vote

favorite
1












I am looking for some guidance and feedback on my first golang program. The language is very new to me and its been a long while since I have written anything close to C.



Below is a simple program that takes a url and looks for divs within the page containing a specific class. Once found I read the text within the div outputting it to the terminal.



I am looking for general feedback especially on my use of pointers and slices, the way I am getting the text out of the div (in a loop with a bool variable) which seems a bit hacky to me



package main

import (
"fmt"
"net/http"
"sort"

"golang.org/x/net/html"
)

// Helper function to identify if the token is an image div
func isImageDiv(token html.Token) (ok bool) {
// Iterate over all of the Token's attributes until we find "class"
for _, a := range token.Attr {
if a.Key == "class" {
// The image labels are surrounded by div with the class "project-title"
if a.Val == "project-title" {
ok = true
return
}
}
}

// "bare" return will return the variables (ok) as defined in
// the function definition
return
}

func getLabel(z *html.Tokenizer) (label string) {
return string(z.Text())
}

func crawl(url string, foundLabels *string) {
resp, err := http.Get(url)

if err != nil {
fmt.Println("ERROR: Failed to crawl "" + url + """)
return
}

b := resp.Body
defer b.Close() // close Body when the function returns

z := html.NewTokenizer(b)

readText := false
for {
tt := z.Next()

switch {
case tt == html.ErrorToken:
// End of the document, we're done
return
case tt == html.StartTagToken:
t := z.Token()

// Check if the token is an <div> tag
isDiv := t.Data == "div"
if !isDiv {
continue
}

// Verify that is has the class we are looking for, if there is one
ok := isImageDiv(t)
if !ok {
continue
}
readText = true
case tt == html.TextToken && readText == true:
*foundLabels = append(*foundLabels, getLabel(z))
readText = false
}
}
}

func main() {

url := ""

foundLabels := make(string, 0)

crawl(url, &foundLabels)

sort.Strings(foundLabels)

for _, element := range foundLabels {
// element is the element from someSlice for where we are
fmt.Println(element)
}
}









share|improve this question




























    up vote
    4
    down vote

    favorite
    1












    I am looking for some guidance and feedback on my first golang program. The language is very new to me and its been a long while since I have written anything close to C.



    Below is a simple program that takes a url and looks for divs within the page containing a specific class. Once found I read the text within the div outputting it to the terminal.



    I am looking for general feedback especially on my use of pointers and slices, the way I am getting the text out of the div (in a loop with a bool variable) which seems a bit hacky to me



    package main

    import (
    "fmt"
    "net/http"
    "sort"

    "golang.org/x/net/html"
    )

    // Helper function to identify if the token is an image div
    func isImageDiv(token html.Token) (ok bool) {
    // Iterate over all of the Token's attributes until we find "class"
    for _, a := range token.Attr {
    if a.Key == "class" {
    // The image labels are surrounded by div with the class "project-title"
    if a.Val == "project-title" {
    ok = true
    return
    }
    }
    }

    // "bare" return will return the variables (ok) as defined in
    // the function definition
    return
    }

    func getLabel(z *html.Tokenizer) (label string) {
    return string(z.Text())
    }

    func crawl(url string, foundLabels *string) {
    resp, err := http.Get(url)

    if err != nil {
    fmt.Println("ERROR: Failed to crawl "" + url + """)
    return
    }

    b := resp.Body
    defer b.Close() // close Body when the function returns

    z := html.NewTokenizer(b)

    readText := false
    for {
    tt := z.Next()

    switch {
    case tt == html.ErrorToken:
    // End of the document, we're done
    return
    case tt == html.StartTagToken:
    t := z.Token()

    // Check if the token is an <div> tag
    isDiv := t.Data == "div"
    if !isDiv {
    continue
    }

    // Verify that is has the class we are looking for, if there is one
    ok := isImageDiv(t)
    if !ok {
    continue
    }
    readText = true
    case tt == html.TextToken && readText == true:
    *foundLabels = append(*foundLabels, getLabel(z))
    readText = false
    }
    }
    }

    func main() {

    url := ""

    foundLabels := make(string, 0)

    crawl(url, &foundLabels)

    sort.Strings(foundLabels)

    for _, element := range foundLabels {
    // element is the element from someSlice for where we are
    fmt.Println(element)
    }
    }









    share|improve this question


























      up vote
      4
      down vote

      favorite
      1









      up vote
      4
      down vote

      favorite
      1






      1





      I am looking for some guidance and feedback on my first golang program. The language is very new to me and its been a long while since I have written anything close to C.



      Below is a simple program that takes a url and looks for divs within the page containing a specific class. Once found I read the text within the div outputting it to the terminal.



      I am looking for general feedback especially on my use of pointers and slices, the way I am getting the text out of the div (in a loop with a bool variable) which seems a bit hacky to me



      package main

      import (
      "fmt"
      "net/http"
      "sort"

      "golang.org/x/net/html"
      )

      // Helper function to identify if the token is an image div
      func isImageDiv(token html.Token) (ok bool) {
      // Iterate over all of the Token's attributes until we find "class"
      for _, a := range token.Attr {
      if a.Key == "class" {
      // The image labels are surrounded by div with the class "project-title"
      if a.Val == "project-title" {
      ok = true
      return
      }
      }
      }

      // "bare" return will return the variables (ok) as defined in
      // the function definition
      return
      }

      func getLabel(z *html.Tokenizer) (label string) {
      return string(z.Text())
      }

      func crawl(url string, foundLabels *string) {
      resp, err := http.Get(url)

      if err != nil {
      fmt.Println("ERROR: Failed to crawl "" + url + """)
      return
      }

      b := resp.Body
      defer b.Close() // close Body when the function returns

      z := html.NewTokenizer(b)

      readText := false
      for {
      tt := z.Next()

      switch {
      case tt == html.ErrorToken:
      // End of the document, we're done
      return
      case tt == html.StartTagToken:
      t := z.Token()

      // Check if the token is an <div> tag
      isDiv := t.Data == "div"
      if !isDiv {
      continue
      }

      // Verify that is has the class we are looking for, if there is one
      ok := isImageDiv(t)
      if !ok {
      continue
      }
      readText = true
      case tt == html.TextToken && readText == true:
      *foundLabels = append(*foundLabels, getLabel(z))
      readText = false
      }
      }
      }

      func main() {

      url := ""

      foundLabels := make(string, 0)

      crawl(url, &foundLabels)

      sort.Strings(foundLabels)

      for _, element := range foundLabels {
      // element is the element from someSlice for where we are
      fmt.Println(element)
      }
      }









      share|improve this question















      I am looking for some guidance and feedback on my first golang program. The language is very new to me and its been a long while since I have written anything close to C.



      Below is a simple program that takes a url and looks for divs within the page containing a specific class. Once found I read the text within the div outputting it to the terminal.



      I am looking for general feedback especially on my use of pointers and slices, the way I am getting the text out of the div (in a loop with a bool variable) which seems a bit hacky to me



      package main

      import (
      "fmt"
      "net/http"
      "sort"

      "golang.org/x/net/html"
      )

      // Helper function to identify if the token is an image div
      func isImageDiv(token html.Token) (ok bool) {
      // Iterate over all of the Token's attributes until we find "class"
      for _, a := range token.Attr {
      if a.Key == "class" {
      // The image labels are surrounded by div with the class "project-title"
      if a.Val == "project-title" {
      ok = true
      return
      }
      }
      }

      // "bare" return will return the variables (ok) as defined in
      // the function definition
      return
      }

      func getLabel(z *html.Tokenizer) (label string) {
      return string(z.Text())
      }

      func crawl(url string, foundLabels *string) {
      resp, err := http.Get(url)

      if err != nil {
      fmt.Println("ERROR: Failed to crawl "" + url + """)
      return
      }

      b := resp.Body
      defer b.Close() // close Body when the function returns

      z := html.NewTokenizer(b)

      readText := false
      for {
      tt := z.Next()

      switch {
      case tt == html.ErrorToken:
      // End of the document, we're done
      return
      case tt == html.StartTagToken:
      t := z.Token()

      // Check if the token is an <div> tag
      isDiv := t.Data == "div"
      if !isDiv {
      continue
      }

      // Verify that is has the class we are looking for, if there is one
      ok := isImageDiv(t)
      if !ok {
      continue
      }
      readText = true
      case tt == html.TextToken && readText == true:
      *foundLabels = append(*foundLabels, getLabel(z))
      readText = false
      }
      }
      }

      func main() {

      url := ""

      foundLabels := make(string, 0)

      crawl(url, &foundLabels)

      sort.Strings(foundLabels)

      for _, element := range foundLabels {
      // element is the element from someSlice for where we are
      fmt.Println(element)
      }
      }






      web-scraping go






      share|improve this question















      share|improve this question













      share|improve this question




      share|improve this question








      edited Sep 12 '17 at 17:01









      200_success

      128k15149412




      128k15149412










      asked Sep 12 '17 at 10:53









      SJC

      1563




      1563






















          1 Answer
          1






          active

          oldest

          votes

















          up vote
          0
          down vote













          First of all: how big is this document? If you're not expecting it to be big, you may want to use html.Parse instead of a tokenizer - that way, you'll get the whole document in a tree and would be able to iterate over children of your div when found, which should make the whole thing simpler.



          Your current approach is streaming, which conserves memory, but you are making the assumption that the div with the class project-title will always contain just text. What if the document looks like this:



          <div class="project-title"><div>not title</div>title</div>


          or this



          <div class="project-title"></div><div>wrong title</div>


          In the first case, "not title" would be added. In the second, "wrong title" would be added.





          Assuming you want to keep your logic as-is, let's move on the review!



          A few general tips:




          • Always put string literals on top as constants (e.g. const className = "project-title").

          • You don't need to name your return values unless you see a need.

          • While Go values brevity, z is (almost) never a good variable name.

          • Most of the comments don't add any value (e.g. "Check if the token is an <div> tag").


          Some specific things:





          • In isImageDiv, you don't need a separate if block. You also don't need to name the return value, which actually simplifies your code in this case. Finally, you should also check if the element is a div - that way, you needn't do it in crawl:



            func isImageDiv(token html.Token) bool {
            if token.Data != "div" {
            return false
            }
            for _, a := range token.Attr {
            if a.Key == "class" && a.Val == "project-title" {
            return true
            }
            }
            return false
            }


          • getLabel seems too simple to have a need to exist.


          • In crawl, you shouldn't use fmt.Println. Instead, return an error, the same way http.Get returns it. Let the caller handle it.



          • In crawl, you can simplify the switch a lot - continue is not really needed and we made our isImageDiv function do the div check as well:



            switch tt {
            case html.ErrorToken:
            return
            case html.StartTagToken:
            if isImageDiv(z.Token()) {
            readText = true
            }
            case html.TextToken:
            if readText {
            *foundLabels = append(*foundLabels, getLabel(z))
            readText = false
            }
            }


          • In main, you don't need to use make for a slice since the default value for a slice is an empty slice (you can just define it as var foundLabels string and it will just work). Also, you shouldn't pass the slice as a pointer - slices are passed by reference anyway (not really, but due to the nature of how they work, they're already pointers). Make sure you change your crawl function as well after that.







          share|improve this answer





















            Your Answer





            StackExchange.ifUsing("editor", function () {
            return StackExchange.using("mathjaxEditing", function () {
            StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
            });
            });
            }, "mathjax-editing");

            StackExchange.ifUsing("editor", function () {
            StackExchange.using("externalEditor", function () {
            StackExchange.using("snippets", function () {
            StackExchange.snippets.init();
            });
            });
            }, "code-snippets");

            StackExchange.ready(function() {
            var channelOptions = {
            tags: "".split(" "),
            id: "196"
            };
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function() {
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
            createEditor();
            });
            }
            else {
            createEditor();
            }
            });

            function createEditor() {
            StackExchange.prepareEditor({
            heartbeatType: 'answer',
            convertImagesToLinks: false,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
            brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
            contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
            allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            });


            }
            });














            draft saved

            draft discarded


















            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f175463%2fpull-out-text-within-specific-divs-on-a-webpage%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown

























            1 Answer
            1






            active

            oldest

            votes








            1 Answer
            1






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes








            up vote
            0
            down vote













            First of all: how big is this document? If you're not expecting it to be big, you may want to use html.Parse instead of a tokenizer - that way, you'll get the whole document in a tree and would be able to iterate over children of your div when found, which should make the whole thing simpler.



            Your current approach is streaming, which conserves memory, but you are making the assumption that the div with the class project-title will always contain just text. What if the document looks like this:



            <div class="project-title"><div>not title</div>title</div>


            or this



            <div class="project-title"></div><div>wrong title</div>


            In the first case, "not title" would be added. In the second, "wrong title" would be added.





            Assuming you want to keep your logic as-is, let's move on the review!



            A few general tips:




            • Always put string literals on top as constants (e.g. const className = "project-title").

            • You don't need to name your return values unless you see a need.

            • While Go values brevity, z is (almost) never a good variable name.

            • Most of the comments don't add any value (e.g. "Check if the token is an <div> tag").


            Some specific things:





            • In isImageDiv, you don't need a separate if block. You also don't need to name the return value, which actually simplifies your code in this case. Finally, you should also check if the element is a div - that way, you needn't do it in crawl:



              func isImageDiv(token html.Token) bool {
              if token.Data != "div" {
              return false
              }
              for _, a := range token.Attr {
              if a.Key == "class" && a.Val == "project-title" {
              return true
              }
              }
              return false
              }


            • getLabel seems too simple to have a need to exist.


            • In crawl, you shouldn't use fmt.Println. Instead, return an error, the same way http.Get returns it. Let the caller handle it.



            • In crawl, you can simplify the switch a lot - continue is not really needed and we made our isImageDiv function do the div check as well:



              switch tt {
              case html.ErrorToken:
              return
              case html.StartTagToken:
              if isImageDiv(z.Token()) {
              readText = true
              }
              case html.TextToken:
              if readText {
              *foundLabels = append(*foundLabels, getLabel(z))
              readText = false
              }
              }


            • In main, you don't need to use make for a slice since the default value for a slice is an empty slice (you can just define it as var foundLabels string and it will just work). Also, you shouldn't pass the slice as a pointer - slices are passed by reference anyway (not really, but due to the nature of how they work, they're already pointers). Make sure you change your crawl function as well after that.







            share|improve this answer

























              up vote
              0
              down vote













              First of all: how big is this document? If you're not expecting it to be big, you may want to use html.Parse instead of a tokenizer - that way, you'll get the whole document in a tree and would be able to iterate over children of your div when found, which should make the whole thing simpler.



              Your current approach is streaming, which conserves memory, but you are making the assumption that the div with the class project-title will always contain just text. What if the document looks like this:



              <div class="project-title"><div>not title</div>title</div>


              or this



              <div class="project-title"></div><div>wrong title</div>


              In the first case, "not title" would be added. In the second, "wrong title" would be added.





              Assuming you want to keep your logic as-is, let's move on the review!



              A few general tips:




              • Always put string literals on top as constants (e.g. const className = "project-title").

              • You don't need to name your return values unless you see a need.

              • While Go values brevity, z is (almost) never a good variable name.

              • Most of the comments don't add any value (e.g. "Check if the token is an <div> tag").


              Some specific things:





              • In isImageDiv, you don't need a separate if block. You also don't need to name the return value, which actually simplifies your code in this case. Finally, you should also check if the element is a div - that way, you needn't do it in crawl:



                func isImageDiv(token html.Token) bool {
                if token.Data != "div" {
                return false
                }
                for _, a := range token.Attr {
                if a.Key == "class" && a.Val == "project-title" {
                return true
                }
                }
                return false
                }


              • getLabel seems too simple to have a need to exist.


              • In crawl, you shouldn't use fmt.Println. Instead, return an error, the same way http.Get returns it. Let the caller handle it.



              • In crawl, you can simplify the switch a lot - continue is not really needed and we made our isImageDiv function do the div check as well:



                switch tt {
                case html.ErrorToken:
                return
                case html.StartTagToken:
                if isImageDiv(z.Token()) {
                readText = true
                }
                case html.TextToken:
                if readText {
                *foundLabels = append(*foundLabels, getLabel(z))
                readText = false
                }
                }


              • In main, you don't need to use make for a slice since the default value for a slice is an empty slice (you can just define it as var foundLabels string and it will just work). Also, you shouldn't pass the slice as a pointer - slices are passed by reference anyway (not really, but due to the nature of how they work, they're already pointers). Make sure you change your crawl function as well after that.







              share|improve this answer























                up vote
                0
                down vote










                up vote
                0
                down vote









                First of all: how big is this document? If you're not expecting it to be big, you may want to use html.Parse instead of a tokenizer - that way, you'll get the whole document in a tree and would be able to iterate over children of your div when found, which should make the whole thing simpler.



                Your current approach is streaming, which conserves memory, but you are making the assumption that the div with the class project-title will always contain just text. What if the document looks like this:



                <div class="project-title"><div>not title</div>title</div>


                or this



                <div class="project-title"></div><div>wrong title</div>


                In the first case, "not title" would be added. In the second, "wrong title" would be added.





                Assuming you want to keep your logic as-is, let's move on the review!



                A few general tips:




                • Always put string literals on top as constants (e.g. const className = "project-title").

                • You don't need to name your return values unless you see a need.

                • While Go values brevity, z is (almost) never a good variable name.

                • Most of the comments don't add any value (e.g. "Check if the token is an <div> tag").


                Some specific things:





                • In isImageDiv, you don't need a separate if block. You also don't need to name the return value, which actually simplifies your code in this case. Finally, you should also check if the element is a div - that way, you needn't do it in crawl:



                  func isImageDiv(token html.Token) bool {
                  if token.Data != "div" {
                  return false
                  }
                  for _, a := range token.Attr {
                  if a.Key == "class" && a.Val == "project-title" {
                  return true
                  }
                  }
                  return false
                  }


                • getLabel seems too simple to have a need to exist.


                • In crawl, you shouldn't use fmt.Println. Instead, return an error, the same way http.Get returns it. Let the caller handle it.



                • In crawl, you can simplify the switch a lot - continue is not really needed and we made our isImageDiv function do the div check as well:



                  switch tt {
                  case html.ErrorToken:
                  return
                  case html.StartTagToken:
                  if isImageDiv(z.Token()) {
                  readText = true
                  }
                  case html.TextToken:
                  if readText {
                  *foundLabels = append(*foundLabels, getLabel(z))
                  readText = false
                  }
                  }


                • In main, you don't need to use make for a slice since the default value for a slice is an empty slice (you can just define it as var foundLabels string and it will just work). Also, you shouldn't pass the slice as a pointer - slices are passed by reference anyway (not really, but due to the nature of how they work, they're already pointers). Make sure you change your crawl function as well after that.







                share|improve this answer












                First of all: how big is this document? If you're not expecting it to be big, you may want to use html.Parse instead of a tokenizer - that way, you'll get the whole document in a tree and would be able to iterate over children of your div when found, which should make the whole thing simpler.



                Your current approach is streaming, which conserves memory, but you are making the assumption that the div with the class project-title will always contain just text. What if the document looks like this:



                <div class="project-title"><div>not title</div>title</div>


                or this



                <div class="project-title"></div><div>wrong title</div>


                In the first case, "not title" would be added. In the second, "wrong title" would be added.





                Assuming you want to keep your logic as-is, let's move on the review!



                A few general tips:




                • Always put string literals on top as constants (e.g. const className = "project-title").

                • You don't need to name your return values unless you see a need.

                • While Go values brevity, z is (almost) never a good variable name.

                • Most of the comments don't add any value (e.g. "Check if the token is an <div> tag").


                Some specific things:





                • In isImageDiv, you don't need a separate if block. You also don't need to name the return value, which actually simplifies your code in this case. Finally, you should also check if the element is a div - that way, you needn't do it in crawl:



                  func isImageDiv(token html.Token) bool {
                  if token.Data != "div" {
                  return false
                  }
                  for _, a := range token.Attr {
                  if a.Key == "class" && a.Val == "project-title" {
                  return true
                  }
                  }
                  return false
                  }


                • getLabel seems too simple to have a need to exist.


                • In crawl, you shouldn't use fmt.Println. Instead, return an error, the same way http.Get returns it. Let the caller handle it.



                • In crawl, you can simplify the switch a lot - continue is not really needed and we made our isImageDiv function do the div check as well:



                  switch tt {
                  case html.ErrorToken:
                  return
                  case html.StartTagToken:
                  if isImageDiv(z.Token()) {
                  readText = true
                  }
                  case html.TextToken:
                  if readText {
                  *foundLabels = append(*foundLabels, getLabel(z))
                  readText = false
                  }
                  }


                • In main, you don't need to use make for a slice since the default value for a slice is an empty slice (you can just define it as var foundLabels string and it will just work). Also, you shouldn't pass the slice as a pointer - slices are passed by reference anyway (not really, but due to the nature of how they work, they're already pointers). Make sure you change your crawl function as well after that.








                share|improve this answer












                share|improve this answer



                share|improve this answer










                answered Sep 16 '17 at 9:41









                fstanis

                31519




                31519






























                    draft saved

                    draft discarded




















































                    Thanks for contributing an answer to Code Review Stack Exchange!


                    • Please be sure to answer the question. Provide details and share your research!

                    But avoid



                    • Asking for help, clarification, or responding to other answers.

                    • Making statements based on opinion; back them up with references or personal experience.


                    Use MathJax to format equations. MathJax reference.


                    To learn more, see our tips on writing great answers.





                    Some of your past answers have not been well-received, and you're in danger of being blocked from answering.


                    Please pay close attention to the following guidance:


                    • Please be sure to answer the question. Provide details and share your research!

                    But avoid



                    • Asking for help, clarification, or responding to other answers.

                    • Making statements based on opinion; back them up with references or personal experience.


                    To learn more, see our tips on writing great answers.




                    draft saved


                    draft discarded














                    StackExchange.ready(
                    function () {
                    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f175463%2fpull-out-text-within-specific-divs-on-a-webpage%23new-answer', 'question_page');
                    }
                    );

                    Post as a guest















                    Required, but never shown





















































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown

































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown







                    Popular posts from this blog

                    Terni

                    A new problem with tex4ht and tikz

                    Sun Ra