Pull out text within specific divs on a webpage

up vote
4
down vote

favorite

I am looking for some guidance and feedback on my first golang program. The language is very new to me and its been a long while since I have written anything close to C.

Below is a simple program that takes a url and looks for divs within the page containing a specific class. Once found I read the text within the div outputting it to the terminal.

I am looking for general feedback especially on my use of pointers and slices, the way I am getting the text out of the div (in a loop with a bool variable) which seems a bit hacky to me

package main



import (

    "fmt"

    "net/http"

    "sort"



    "golang.org/x/net/html"

)



// Helper function to identify if the token is an image div

func isImageDiv(token html.Token) (ok bool) {

    // Iterate over all of the Token's attributes until we find "class"

    for _, a := range token.Attr {

        if a.Key == "class" {

            // The image labels are surrounded by div with the class "project-title"

            if a.Val == "project-title" {

                ok = true

                return

            }

        }

    }



    // "bare" return will return the variables (ok) as defined in

    // the function definition

    return

}



func getLabel(z *html.Tokenizer) (label string) {

    return string(z.Text())

}



func crawl(url string, foundLabels *string) {

    resp, err := http.Get(url)



    if err != nil {

        fmt.Println("ERROR: Failed to crawl "" + url + """)

        return

    }



    b := resp.Body

    defer b.Close() // close Body when the function returns



    z := html.NewTokenizer(b)



    readText := false

    for {

        tt := z.Next()



        switch {

        case tt == html.ErrorToken:

            // End of the document, we're done

            return

        case tt == html.StartTagToken:

            t := z.Token()



            // Check if the token is an <div> tag

            isDiv := t.Data == "div"

            if !isDiv {

                continue

            }



            // Verify that is has the class we are looking for, if there is one

            ok := isImageDiv(t)

            if !ok {

                continue

            }

            readText = true

        case tt == html.TextToken && readText == true:

            *foundLabels = append(*foundLabels, getLabel(z))

            readText = false

        }

    }

}



func main() {



    url := ""



    foundLabels := make(string, 0)



    crawl(url, &foundLabels)



    sort.Strings(foundLabels)



    for _, element := range foundLabels {

        // element is the element from someSlice for where we are

        fmt.Println(element)

    }

}

edited Sep 12 '17 at 17:01

200_success

128k15149412

asked Sep 12 '17 at 10:53

SJC

1563

add a comment |

up vote
4
down vote

favorite

I am looking for some guidance and feedback on my first golang program. The language is very new to me and its been a long while since I have written anything close to C.

Below is a simple program that takes a url and looks for divs within the page containing a specific class. Once found I read the text within the div outputting it to the terminal.

I am looking for general feedback especially on my use of pointers and slices, the way I am getting the text out of the div (in a loop with a bool variable) which seems a bit hacky to me

package main



import (

    "fmt"

    "net/http"

    "sort"



    "golang.org/x/net/html"

)



// Helper function to identify if the token is an image div

func isImageDiv(token html.Token) (ok bool) {

    // Iterate over all of the Token's attributes until we find "class"

    for _, a := range token.Attr {

        if a.Key == "class" {

            // The image labels are surrounded by div with the class "project-title"

            if a.Val == "project-title" {

                ok = true

                return

            }

        }

    }



    // "bare" return will return the variables (ok) as defined in

    // the function definition

    return

}



func getLabel(z *html.Tokenizer) (label string) {

    return string(z.Text())

}



func crawl(url string, foundLabels *string) {

    resp, err := http.Get(url)



    if err != nil {

        fmt.Println("ERROR: Failed to crawl "" + url + """)

        return

    }



    b := resp.Body

    defer b.Close() // close Body when the function returns



    z := html.NewTokenizer(b)



    readText := false

    for {

        tt := z.Next()



        switch {

        case tt == html.ErrorToken:

            // End of the document, we're done

            return

        case tt == html.StartTagToken:

            t := z.Token()



            // Check if the token is an <div> tag

            isDiv := t.Data == "div"

            if !isDiv {

                continue

            }



            // Verify that is has the class we are looking for, if there is one

            ok := isImageDiv(t)

            if !ok {

                continue

            }

            readText = true

        case tt == html.TextToken && readText == true:

            *foundLabels = append(*foundLabels, getLabel(z))

            readText = false

        }

    }

}



func main() {



    url := ""



    foundLabels := make(string, 0)



    crawl(url, &foundLabels)



    sort.Strings(foundLabels)



    for _, element := range foundLabels {

        // element is the element from someSlice for where we are

        fmt.Println(element)

    }

}

edited Sep 12 '17 at 17:01

200_success

128k15149412

asked Sep 12 '17 at 10:53

SJC

1563

add a comment |

up vote
4
down vote

favorite

I am looking for some guidance and feedback on my first golang program. The language is very new to me and its been a long while since I have written anything close to C.

Below is a simple program that takes a url and looks for divs within the page containing a specific class. Once found I read the text within the div outputting it to the terminal.

I am looking for general feedback especially on my use of pointers and slices, the way I am getting the text out of the div (in a loop with a bool variable) which seems a bit hacky to me

package main



import (

    "fmt"

    "net/http"

    "sort"



    "golang.org/x/net/html"

)



// Helper function to identify if the token is an image div

func isImageDiv(token html.Token) (ok bool) {

    // Iterate over all of the Token's attributes until we find "class"

    for _, a := range token.Attr {

        if a.Key == "class" {

            // The image labels are surrounded by div with the class "project-title"

            if a.Val == "project-title" {

                ok = true

                return

            }

        }

    }



    // "bare" return will return the variables (ok) as defined in

    // the function definition

    return

}



func getLabel(z *html.Tokenizer) (label string) {

    return string(z.Text())

}



func crawl(url string, foundLabels *string) {

    resp, err := http.Get(url)



    if err != nil {

        fmt.Println("ERROR: Failed to crawl "" + url + """)

        return

    }



    b := resp.Body

    defer b.Close() // close Body when the function returns



    z := html.NewTokenizer(b)



    readText := false

    for {

        tt := z.Next()



        switch {

        case tt == html.ErrorToken:

            // End of the document, we're done

            return

        case tt == html.StartTagToken:

            t := z.Token()



            // Check if the token is an <div> tag

            isDiv := t.Data == "div"

            if !isDiv {

                continue

            }



            // Verify that is has the class we are looking for, if there is one

            ok := isImageDiv(t)

            if !ok {

                continue

            }

            readText = true

        case tt == html.TextToken && readText == true:

            *foundLabels = append(*foundLabels, getLabel(z))

            readText = false

        }

    }

}



func main() {



    url := ""



    foundLabels := make(string, 0)



    crawl(url, &foundLabels)



    sort.Strings(foundLabels)



    for _, element := range foundLabels {

        // element is the element from someSlice for where we are

        fmt.Println(element)

    }

}

edited Sep 12 '17 at 17:01

200_success

128k15149412

asked Sep 12 '17 at 10:53

SJC

1563

I am looking for some guidance and feedback on my first golang program. The language is very new to me and its been a long while since I have written anything close to C.

Below is a simple program that takes a url and looks for divs within the page containing a specific class. Once found I read the text within the div outputting it to the terminal.

I am looking for general feedback especially on my use of pointers and slices, the way I am getting the text out of the div (in a loop with a bool variable) which seems a bit hacky to me

package main



import (

    "fmt"

    "net/http"

    "sort"



    "golang.org/x/net/html"

)



// Helper function to identify if the token is an image div

func isImageDiv(token html.Token) (ok bool) {

    // Iterate over all of the Token's attributes until we find "class"

    for _, a := range token.Attr {

        if a.Key == "class" {

            // The image labels are surrounded by div with the class "project-title"

            if a.Val == "project-title" {

                ok = true

                return

            }

        }

    }



    // "bare" return will return the variables (ok) as defined in

    // the function definition

    return

}



func getLabel(z *html.Tokenizer) (label string) {

    return string(z.Text())

}



func crawl(url string, foundLabels *string) {

    resp, err := http.Get(url)



    if err != nil {

        fmt.Println("ERROR: Failed to crawl "" + url + """)

        return

    }



    b := resp.Body

    defer b.Close() // close Body when the function returns



    z := html.NewTokenizer(b)



    readText := false

    for {

        tt := z.Next()



        switch {

        case tt == html.ErrorToken:

            // End of the document, we're done

            return

        case tt == html.StartTagToken:

            t := z.Token()



            // Check if the token is an <div> tag

            isDiv := t.Data == "div"

            if !isDiv {

                continue

            }



            // Verify that is has the class we are looking for, if there is one

            ok := isImageDiv(t)

            if !ok {

                continue

            }

            readText = true

        case tt == html.TextToken && readText == true:

            *foundLabels = append(*foundLabels, getLabel(z))

            readText = false

        }

    }

}



func main() {



    url := ""



    foundLabels := make(string, 0)



    crawl(url, &foundLabels)



    sort.Strings(foundLabels)



    for _, element := range foundLabels {

        // element is the element from someSlice for where we are

        fmt.Println(element)

    }

}

web-scraping go

edited Sep 12 '17 at 17:01

200_success

128k15149412

asked Sep 12 '17 at 10:53

SJC

1563

edited Sep 12 '17 at 17:01

200_success

128k15149412

asked Sep 12 '17 at 10:53

SJC

1563

edited Sep 12 '17 at 17:01

200_success

128k15149412

edited Sep 12 '17 at 17:01

200_success

128k15149412

edited Sep 12 '17 at 17:01

200_success

128k15149412

asked Sep 12 '17 at 10:53

SJC

1563

asked Sep 12 '17 at 10:53

SJC

1563

asked Sep 12 '17 at 10:53

SJC

1563

add a comment |

1 Answer
1

active

oldest

votes

up vote
0
down vote

First of all: how big is this document? If you're not expecting it to be big, you may want to use html.Parse instead of a tokenizer - that way, you'll get the whole document in a tree and would be able to iterate over children of your div when found, which should make the whole thing simpler.

Your current approach is streaming, which conserves memory, but you are making the assumption that the div with the class project-title will always contain just text. What if the document looks like this:

<div class="project-title"><div>not title</div>title</div>

or this

<div class="project-title"></div><div>wrong title</div>

In the first case, "not title" would be added. In the second, "wrong title" would be added.

Assuming you want to keep your logic as-is, let's move on the review!

A few general tips:

Always put string literals on top as constants (e.g. const className = "project-title").

You don't need to name your return values unless you see a need.

While Go values brevity, z is (almost) never a good variable name.

Most of the comments don't add any value (e.g. "Check if the token is an <div> tag").

Some specific things:

In isImageDiv, you don't need a separate if block. You also don't need to name the return value, which actually simplifies your code in this case. Finally, you should also check if the element is a div - that way, you needn't do it in crawl:

func isImageDiv(token html.Token) bool {

    if token.Data != "div" {

        return false

    }

    for _, a := range token.Attr {

        if a.Key == "class" && a.Val == "project-title" {

            return true

        }

    }

    return false

}

getLabel seems too simple to have a need to exist.

In crawl, you shouldn't use fmt.Println. Instead, return an error, the same way http.Get returns it. Let the caller handle it.

In crawl, you can simplify the switch a lot - continue is not really needed and we made our isImageDiv function do the div check as well:

switch tt {

case html.ErrorToken:

    return

case html.StartTagToken:

    if isImageDiv(z.Token()) {

        readText = true

    }

case html.TextToken:

    if readText {

        *foundLabels = append(*foundLabels, getLabel(z))

        readText = false

    }

}

In main, you don't need to use make for a slice since the default value for a slice is an empty slice (you can just define it as var foundLabels string and it will just work). Also, you shouldn't pass the slice as a pointer - slices are passed by reference anyway (not really, but due to the nature of how they work, they're already pointers). Make sure you change your crawl function as well after that.

answered Sep 16 '17 at 9:41

fstanis

31519

add a comment |

Your Answer

StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f175463%2fpull-out-text-within-specific-divs-on-a-webpage%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

up vote
0
down vote

<div class="project-title"><div>not title</div>title</div>

or this

<div class="project-title"></div><div>wrong title</div>

In the first case, "not title" would be added. In the second, "wrong title" would be added.

Assuming you want to keep your logic as-is, let's move on the review!

A few general tips:

Always put string literals on top as constants (e.g. const className = "project-title").

You don't need to name your return values unless you see a need.

While Go values brevity, z is (almost) never a good variable name.

Most of the comments don't add any value (e.g. "Check if the token is an <div> tag").

Some specific things:

func isImageDiv(token html.Token) bool {

    if token.Data != "div" {

        return false

    }

    for _, a := range token.Attr {

        if a.Key == "class" && a.Val == "project-title" {

            return true

        }

    }

    return false

}

getLabel seems too simple to have a need to exist.

In crawl, you shouldn't use fmt.Println. Instead, return an error, the same way http.Get returns it. Let the caller handle it.

In crawl, you can simplify the switch a lot - continue is not really needed and we made our isImageDiv function do the div check as well:

switch tt {

case html.ErrorToken:

    return

case html.StartTagToken:

    if isImageDiv(z.Token()) {

        readText = true

    }

case html.TextToken:

    if readText {

        *foundLabels = append(*foundLabels, getLabel(z))

        readText = false

    }

}

In main, you don't need to use make for a slice since the default value for a slice is an empty slice (you can just define it as var foundLabels string and it will just work). Also, you shouldn't pass the slice as a pointer - slices are passed by reference anyway (not really, but due to the nature of how they work, they're already pointers). Make sure you change your crawl function as well after that.

answered Sep 16 '17 at 9:41

fstanis

31519

add a comment |

up vote
0
down vote

<div class="project-title"><div>not title</div>title</div>

or this

<div class="project-title"></div><div>wrong title</div>

In the first case, "not title" would be added. In the second, "wrong title" would be added.

Assuming you want to keep your logic as-is, let's move on the review!

A few general tips:

Always put string literals on top as constants (e.g. const className = "project-title").

You don't need to name your return values unless you see a need.

While Go values brevity, z is (almost) never a good variable name.

Most of the comments don't add any value (e.g. "Check if the token is an <div> tag").

Some specific things:

func isImageDiv(token html.Token) bool {

    if token.Data != "div" {

        return false

    }

    for _, a := range token.Attr {

        if a.Key == "class" && a.Val == "project-title" {

            return true

        }

    }

    return false

}

getLabel seems too simple to have a need to exist.

In crawl, you shouldn't use fmt.Println. Instead, return an error, the same way http.Get returns it. Let the caller handle it.

In crawl, you can simplify the switch a lot - continue is not really needed and we made our isImageDiv function do the div check as well:

switch tt {

case html.ErrorToken:

    return

case html.StartTagToken:

    if isImageDiv(z.Token()) {

        readText = true

    }

case html.TextToken:

    if readText {

        *foundLabels = append(*foundLabels, getLabel(z))

        readText = false

    }

}

In main, you don't need to use make for a slice since the default value for a slice is an empty slice (you can just define it as var foundLabels string and it will just work). Also, you shouldn't pass the slice as a pointer - slices are passed by reference anyway (not really, but due to the nature of how they work, they're already pointers). Make sure you change your crawl function as well after that.

answered Sep 16 '17 at 9:41

fstanis

31519

add a comment |

up vote
0
down vote

<div class="project-title"><div>not title</div>title</div>

or this

<div class="project-title"></div><div>wrong title</div>

In the first case, "not title" would be added. In the second, "wrong title" would be added.

Assuming you want to keep your logic as-is, let's move on the review!

A few general tips:

Always put string literals on top as constants (e.g. const className = "project-title").

You don't need to name your return values unless you see a need.

While Go values brevity, z is (almost) never a good variable name.

Most of the comments don't add any value (e.g. "Check if the token is an <div> tag").

Some specific things:

func isImageDiv(token html.Token) bool {

    if token.Data != "div" {

        return false

    }

    for _, a := range token.Attr {

        if a.Key == "class" && a.Val == "project-title" {

            return true

        }

    }

    return false

}

getLabel seems too simple to have a need to exist.

In crawl, you shouldn't use fmt.Println. Instead, return an error, the same way http.Get returns it. Let the caller handle it.

In crawl, you can simplify the switch a lot - continue is not really needed and we made our isImageDiv function do the div check as well:

switch tt {

case html.ErrorToken:

    return

case html.StartTagToken:

    if isImageDiv(z.Token()) {

        readText = true

    }

case html.TextToken:

    if readText {

        *foundLabels = append(*foundLabels, getLabel(z))

        readText = false

    }

}

In main, you don't need to use make for a slice since the default value for a slice is an empty slice (you can just define it as var foundLabels string and it will just work). Also, you shouldn't pass the slice as a pointer - slices are passed by reference anyway (not really, but due to the nature of how they work, they're already pointers). Make sure you change your crawl function as well after that.

answered Sep 16 '17 at 9:41

fstanis

31519

<div class="project-title"><div>not title</div>title</div>

or this

<div class="project-title"></div><div>wrong title</div>

In the first case, "not title" would be added. In the second, "wrong title" would be added.

Assuming you want to keep your logic as-is, let's move on the review!

A few general tips:

Always put string literals on top as constants (e.g. const className = "project-title").

You don't need to name your return values unless you see a need.

While Go values brevity, z is (almost) never a good variable name.

Most of the comments don't add any value (e.g. "Check if the token is an <div> tag").

Some specific things:

func isImageDiv(token html.Token) bool {

    if token.Data != "div" {

        return false

    }

    for _, a := range token.Attr {

        if a.Key == "class" && a.Val == "project-title" {

            return true

        }

    }

    return false

}

getLabel seems too simple to have a need to exist.

In crawl, you shouldn't use fmt.Println. Instead, return an error, the same way http.Get returns it. Let the caller handle it.

In crawl, you can simplify the switch a lot - continue is not really needed and we made our isImageDiv function do the div check as well:

switch tt {

case html.ErrorToken:

    return

case html.StartTagToken:

    if isImageDiv(z.Token()) {

        readText = true

    }

case html.TextToken:

    if readText {

        *foundLabels = append(*foundLabels, getLabel(z))

        readText = false

    }

}

In main, you don't need to use make for a slice since the default value for a slice is an empty slice (you can just define it as var foundLabels string and it will just work). Also, you shouldn't pass the slice as a pointer - slices are passed by reference anyway (not really, but due to the nature of how they work, they're already pointers). Make sure you change your crawl function as well after that.

answered Sep 16 '17 at 9:41

fstanis

31519

answered Sep 16 '17 at 9:41

fstanis

31519

answered Sep 16 '17 at 9:41

fstanis

31519

answered Sep 16 '17 at 9:41

fstanis

31519

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Code Review Stack Exchange!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

Use MathJax to format equations. MathJax reference.

To learn more, see our tips on writing great answers.

Some of your past answers have not been well-received, and you're in danger of being blocked from answering.

Please pay close attention to the following guidance:

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Gfrktyl