mirror of
https://github.com/Rogiel/httpchannel
synced 2025-12-06 07:32:50 +00:00
Implements a new, more clean and robust HTML parser
This commit is contained in:
@@ -22,10 +22,9 @@ import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.rogiel.httpchannel.captcha.ImageCaptcha;
|
||||
import com.rogiel.httpchannel.http.HttpContext;
|
||||
import com.rogiel.httpchannel.util.PatternUtils;
|
||||
import com.rogiel.httpchannel.util.htmlparser.HTMLPage;
|
||||
import com.rogiel.httpchannel.util.html.Page;
|
||||
|
||||
/**
|
||||
* This class provides utility methods to extract an {@link ImageCaptcha} from
|
||||
@@ -57,8 +56,8 @@ public class ReCaptchaExtractor {
|
||||
* the {@link HttpContext}
|
||||
* @return the {@link ImageCaptcha} embedded at the given <code>page</code>
|
||||
*/
|
||||
public static ImageCaptcha extractCaptcha(HTMLPage page, HttpContext ctx) {
|
||||
final String uri = page.findScriptSrc(CAPTCHA_URI_PATTERN);
|
||||
public static ImageCaptcha extractCaptcha(Page page, HttpContext ctx) {
|
||||
final String uri = page.scriptBySource(CAPTCHA_URI_PATTERN).asString();
|
||||
if (uri == null)
|
||||
return null;
|
||||
try {
|
||||
@@ -77,8 +76,8 @@ public class ReCaptchaExtractor {
|
||||
* the {@link HttpContext}
|
||||
* @return the {@link ImageCaptcha} contained at the given <code>page</code>
|
||||
*/
|
||||
public static ImageCaptcha extractAjaxCaptcha(HTMLPage page, HttpContext ctx) {
|
||||
final String siteID = page.findScript(CAPTCHA_ID_PATTERN, 1);
|
||||
public static ImageCaptcha extractAjaxCaptcha(Page page, HttpContext ctx) {
|
||||
final String siteID = page.script(CAPTCHA_ID_PATTERN).asString(1);
|
||||
try {
|
||||
return doExtract(ctx.get(CHALLENGE_BASE_URI + siteID).asString());
|
||||
} catch (IOException e) {
|
||||
|
||||
@@ -33,7 +33,7 @@ import org.json.simple.parser.JSONParser;
|
||||
import org.json.simple.parser.ParseException;
|
||||
|
||||
import com.rogiel.httpchannel.util.HttpClientUtils;
|
||||
import com.rogiel.httpchannel.util.htmlparser.HTMLPage;
|
||||
import com.rogiel.httpchannel.util.html.Page;
|
||||
|
||||
public abstract class Request {
|
||||
private static final JSONParser jsonParser = new JSONParser();
|
||||
@@ -90,14 +90,14 @@ public abstract class Request {
|
||||
});
|
||||
}
|
||||
|
||||
public HTMLPage asPage() throws ClientProtocolException, IOException {
|
||||
return HTMLPage.parse(asString());
|
||||
public Page asPage() throws ClientProtocolException, IOException {
|
||||
return Page.parse(asString());
|
||||
}
|
||||
|
||||
public Future<HTMLPage> asPageAsync() throws IOException {
|
||||
return ctx.threadPool.submit(new Callable<HTMLPage>() {
|
||||
public Future<Page> asPageAsync() throws IOException {
|
||||
return ctx.threadPool.submit(new Callable<Page>() {
|
||||
@Override
|
||||
public HTMLPage call() throws Exception {
|
||||
public Page call() throws Exception {
|
||||
return asPage();
|
||||
}
|
||||
});
|
||||
|
||||
@@ -31,7 +31,7 @@ import org.apache.http.client.HttpClient;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.client.methods.HttpUriRequest;
|
||||
|
||||
import com.rogiel.httpchannel.util.htmlparser.HTMLPage;
|
||||
import com.rogiel.httpchannel.util.html.Page;
|
||||
|
||||
public class HttpClientUtils {
|
||||
private static final ExecutorService threadPool = Executors
|
||||
@@ -82,7 +82,7 @@ public class HttpClientUtils {
|
||||
}
|
||||
}
|
||||
|
||||
public static HTMLPage toPage(HttpResponse response) throws IOException {
|
||||
return HTMLPage.parse(toString(response));
|
||||
public static Page toPage(HttpResponse response) throws IOException {
|
||||
return Page.parse(toString(response));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,174 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.html;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Tag;
|
||||
|
||||
/**
|
||||
* An {@link PageElement} that has an matched string attached to it
|
||||
*
|
||||
* @author <a href="http://www.rogiel.com">Rogiel</a>
|
||||
*/
|
||||
public class MatchedElement<T extends Tag> extends PageElement<T> {
|
||||
/**
|
||||
* The regular expression {@link Matcher} that retains the matched strings
|
||||
* to it
|
||||
*/
|
||||
private final Matcher matcher;
|
||||
|
||||
/**
|
||||
* @param tag
|
||||
* the tag
|
||||
* @param matcher
|
||||
* the matcher
|
||||
*/
|
||||
public MatchedElement(T tag, Matcher matcher) {
|
||||
super(tag);
|
||||
this.matcher = matcher;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tag
|
||||
* the tag
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @param content
|
||||
* the content
|
||||
*/
|
||||
public MatchedElement(T tag, Pattern pattern, String content) {
|
||||
super(tag);
|
||||
this.matcher = pattern.matcher(content);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tag
|
||||
* the tag
|
||||
* @param content
|
||||
* the content
|
||||
*/
|
||||
public MatchedElement(T tag, String content) {
|
||||
this(tag, Pattern.compile(Pattern.quote(content)), content);
|
||||
this.matcher.matches();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return <code>true</code> if the element has an matched element
|
||||
*/
|
||||
public boolean matches() {
|
||||
matcher.reset();
|
||||
return matcher.matches();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return <code>true</code> if the element has an matched element (the
|
||||
* entire value matches the pattern)
|
||||
*/
|
||||
public boolean matchesEntirelly() {
|
||||
return matcher.lookingAt();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return <code>true</code> if the pattern has found something on the
|
||||
* element that matches it
|
||||
*/
|
||||
public boolean find() {
|
||||
matcher.reset();
|
||||
return matcher.find();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param n
|
||||
* the group number
|
||||
* @return <code>true</code> if the group exists
|
||||
*/
|
||||
public boolean hasGroup(int n) {
|
||||
return n <= matcher.groupCount();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the entire matched value as a string
|
||||
*/
|
||||
public String asString() {
|
||||
return asString(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the group value as a string
|
||||
*/
|
||||
public String asString(int n) {
|
||||
return matcher.group(n);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the entire matched value as a integer
|
||||
*/
|
||||
public int asInteger() {
|
||||
return asInteger(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the group value as a integer
|
||||
*/
|
||||
public int asInteger(int n) {
|
||||
return Integer.parseInt(asString(n));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the entire matched value as a long
|
||||
*/
|
||||
public long asLong() {
|
||||
return asLong(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the group value as a long
|
||||
*/
|
||||
public long asLong(int n) {
|
||||
return Long.parseLong(asString(n));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the entire matched value as a double
|
||||
*/
|
||||
public double asDouble() {
|
||||
return asDouble(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the group value as a double
|
||||
*/
|
||||
public double asDouble(int n) {
|
||||
return Double.parseDouble(asString(n));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the pattern matched against the element
|
||||
*/
|
||||
public Pattern getPattern() {
|
||||
return matcher.pattern();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "MatchedElement [tag=" + tag + ", pattern=" + getPattern() + "]";
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,841 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.html;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.Parser;
|
||||
import org.htmlparser.Tag;
|
||||
import org.htmlparser.filters.AndFilter;
|
||||
import org.htmlparser.tags.FormTag;
|
||||
import org.htmlparser.tags.FrameTag;
|
||||
import org.htmlparser.tags.ImageTag;
|
||||
import org.htmlparser.tags.InputTag;
|
||||
import org.htmlparser.tags.LinkTag;
|
||||
import org.htmlparser.tags.ScriptTag;
|
||||
import org.htmlparser.tags.TextareaTag;
|
||||
import org.htmlparser.util.NodeIterator;
|
||||
import org.htmlparser.util.NodeList;
|
||||
import org.htmlparser.util.ParserException;
|
||||
|
||||
import com.rogiel.httpchannel.util.html.PageElement.TagMatcher;
|
||||
import com.rogiel.httpchannel.util.html.filter.TypeTagFilter;
|
||||
import com.rogiel.httpchannel.util.html.matcher.IDTagMatcher;
|
||||
import com.rogiel.httpchannel.util.html.matcher.NameTagMatcher;
|
||||
|
||||
/**
|
||||
* This class handles all HTML parsing and searching. With this class it is easy
* to search for links matching a {@link Pattern}, for images, frames, forms,
* inputs and many more HTML widgets.
|
||||
*
|
||||
* @author <a href="http://www.rogiel.com">Rogiel</a>
|
||||
*/
|
||||
public class Page {
|
||||
/**
|
||||
* The list of nodes on the HTML DOM model
|
||||
*/
|
||||
private final NodeList nodes;
|
||||
|
||||
/**
|
||||
* This interface provides a mean to transform an list of objects into
|
||||
* another type
|
||||
*
|
||||
* @author <a href="http://www.rogiel.com">Rogiel</a>
|
||||
*
|
||||
* @param <I>
|
||||
* the input object type
|
||||
* @param <O>
|
||||
* the output object type
|
||||
*/
|
||||
private interface ListProcessor<I extends Tag, O> {
|
||||
O process(I tag);
|
||||
}
|
||||
|
||||
/**
|
||||
* An default {@link ListProcessor} that converts all tags to an
|
||||
* {@link PageElement}
|
||||
*
|
||||
* @author <a href="http://www.rogiel.com">Rogiel</a>
|
||||
*
|
||||
* @param <I>
|
||||
* the input type
|
||||
*/
|
||||
private class DefaultListProcessor<I extends Tag> implements
|
||||
ListProcessor<I, PageElement<I>> {
|
||||
@Override
|
||||
public PageElement<I> process(I tag) {
|
||||
return new PageElement<I>(tag);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new page instance
|
||||
*
|
||||
* @param parser
|
||||
* the HTML parser
|
||||
* @throws ParserException
|
||||
* an parsing exception
|
||||
*/
|
||||
public Page(Parser parser) throws ParserException {
|
||||
this.nodes = parser.parse(null);
|
||||
}
|
||||
|
||||
/*
|
||||
* ************************************************************************
|
||||
* ***** INTERNAL
|
||||
* ************************************************************************
|
||||
*/
|
||||
/**
|
||||
* Filters all the tags within this page to those matching the filter
|
||||
*
|
||||
* @param processor
|
||||
* the list processor
|
||||
* @param filters
|
||||
* the filters to be applied
|
||||
* @return an list of matching tags
|
||||
*/
|
||||
private <T extends Tag, O> List<O> filter(ListProcessor<T, O> processor,
|
||||
NodeFilter... filters) {
|
||||
final NodeFilter filter;
|
||||
if (filters.length == 1)
|
||||
filter = filters[0];
|
||||
else
|
||||
filter = new AndFilter(filters);
|
||||
try {
|
||||
return list(nodes.extractAllNodesThatMatch(filter, true), processor);
|
||||
} catch (ParserException e) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a list of converted objects
|
||||
*
|
||||
* @param list
|
||||
* the input list
|
||||
* @param processor
|
||||
* the processor that converts the object types
|
||||
* @return the processed and converted list
|
||||
* @throws ParserException
|
||||
* if any exception occur
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
private <T extends Tag, O> List<O> list(final NodeList list,
|
||||
ListProcessor<T, O> processor) throws ParserException {
|
||||
final List<O> filtered = new ArrayList<>();
|
||||
final NodeIterator iterator = list.elements();
|
||||
while (iterator.hasMoreNodes()) {
|
||||
filtered.add(processor.process((T) iterator.nextNode()));
|
||||
}
|
||||
return filtered;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to search for a tag value that matches exactly (the entire string)
|
||||
* with the pattern.
|
||||
*
|
||||
* @param list
|
||||
* the list of elements
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @param tagMatcher
|
||||
* the tag matcher (which will be matched against the pattern)
|
||||
* @param realMatcher
|
||||
* the real matcher (which will be returned on the
|
||||
* {@link MatchedElement})
|
||||
* @return an list of {@link MatchedElement}
|
||||
*/
|
||||
private <T extends Tag, E extends PageElement<T>> List<MatchedElement<T>> match(
|
||||
List<E> list, Pattern pattern, TagMatcher<T> tagMatcher,
|
||||
TagMatcher<T> realMatcher) {
|
||||
final List<MatchedElement<T>> matchList = new ArrayList<>();
|
||||
for (final E tag : list) {
|
||||
final MatchedElement<T> matched = tag.match(pattern, tagMatcher);
|
||||
if (matched == null)
|
||||
continue;
|
||||
if (matched.matches()) {
|
||||
if (tagMatcher == realMatcher) {
|
||||
matchList.add(matched);
|
||||
} else {
|
||||
matchList.add(tag.match(realMatcher));
|
||||
}
|
||||
}
|
||||
}
|
||||
return matchList;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to search for a tag value that matches exactly (the entire string)
|
||||
* with the pattern.
|
||||
*
|
||||
* @param list
|
||||
* the list of elements
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @param tagMatcher
|
||||
* the tag matcher (which will be matched against the pattern and
|
||||
* used on {@link MatchedElement})
|
||||
* @return an list of {@link MatchedElement}
|
||||
*/
|
||||
private <T extends Tag, E extends PageElement<T>> List<MatchedElement<T>> match(
|
||||
List<E> list, Pattern pattern, TagMatcher<T> tagMatcher) {
|
||||
return match(list, pattern, tagMatcher, tagMatcher);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to search for a tag value that contains the content within the
|
||||
* pattern.
|
||||
*
|
||||
* @param list
|
||||
* the list of elements
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @param tagMatcher
|
||||
* the tag matcher (which will be matched against the pattern and
|
||||
* used on {@link MatchedElement})
|
||||
* @return an list of {@link MatchedElement}
|
||||
*/
|
||||
|
||||
private <T extends Tag, E extends PageElement<T>> List<MatchedElement<T>> find(
|
||||
List<E> list, Pattern pattern, TagMatcher<T> tagMatcher) {
|
||||
final List<MatchedElement<T>> matchList = new ArrayList<>();
|
||||
for (final E tag : list) {
|
||||
final MatchedElement<T> matched = tag.match(pattern, tagMatcher);
|
||||
if (matched.find())
|
||||
matchList.add(matched);
|
||||
}
|
||||
return matchList;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a single element from the list
|
||||
*
|
||||
* @param list
|
||||
* the list
|
||||
* @return the first element at the list
|
||||
*/
|
||||
private <O> O single(List<O> list) {
|
||||
if (list.size() == 0)
|
||||
return null;
|
||||
return list.get(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the HTML page to a plain string. This is similar to the
|
||||
* "SEO preview" systems
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public String asPlainString() {
|
||||
String string = nodes.asString().replaceAll(" ", "");
|
||||
final String[] lines = string.split("\n");
|
||||
|
||||
final StringBuilder builder = new StringBuilder();
|
||||
for (final String line : lines) {
|
||||
String procLine = line.replaceAll("\t", " ").trim();
|
||||
if (procLine.length() == 0)
|
||||
continue;
|
||||
builder.append(line.replaceAll("\t", " ").trim()).append(" ");
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
/*
|
||||
* ************************************************************************
|
||||
* ***** TEXT SEARCH
|
||||
* ************************************************************************
|
||||
*/
|
||||
/**
|
||||
* Searches for the given pattern at the entire page
|
||||
*
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @return the search results
|
||||
*/
|
||||
public SearchResults search(Pattern pattern) {
|
||||
return new SearchResults(pattern, asPlainString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Searches for the given text at the entire page
|
||||
*
|
||||
* @param text
|
||||
* the text
|
||||
* @return the search results
|
||||
*/
|
||||
public SearchResults searchFirst(String text) {
|
||||
return search(Pattern.compile(Pattern.quote(text)));
|
||||
}
|
||||
|
||||
/*
|
||||
* ************************************************************************
|
||||
* ***** LINKS
|
||||
* ************************************************************************
|
||||
*/
|
||||
/**
|
||||
* An {@link TagMatcher} that returns the link href
|
||||
*/
|
||||
private static final TagMatcher<LinkTag> LINK_TAG_MATCHER = new TagMatcher<LinkTag>() {
|
||||
@Override
|
||||
public String content(LinkTag tag) {
|
||||
return tag.getLink();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @return a list of all links contained at the page
|
||||
*/
|
||||
public List<PageElement<LinkTag>> links() {
|
||||
return filter(new DefaultListProcessor<LinkTag>(), new TypeTagFilter(
|
||||
LinkTag.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return all links whose URL matches the given pattern
|
||||
*
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @return the list of links matching the pattern
|
||||
*/
|
||||
public List<MatchedElement<LinkTag>> links(Pattern pattern) {
|
||||
return match(links(), pattern, LINK_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the first link whose URL matches the given pattern
|
||||
*
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @return the first link matching the pattern
|
||||
*/
|
||||
public MatchedElement<LinkTag> link(Pattern pattern) {
|
||||
return single(links(pattern));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @return the links whose IDs matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<LinkTag>> linksByID(Pattern pattern) {
|
||||
return match(links(), pattern, new IDTagMatcher<LinkTag>(),
|
||||
LINK_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param id
|
||||
* the link ID
|
||||
* @return the link with the given ID
|
||||
*/
|
||||
public MatchedElement<LinkTag> linkByID(String id) {
|
||||
return single(linksByID(Pattern.compile(Pattern.quote(id))));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the name pattern
|
||||
* @return the links whose name matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<LinkTag>> linksByName(Pattern pattern) {
|
||||
return match(links(), pattern, new NameTagMatcher<LinkTag>(),
|
||||
LINK_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* the name
|
||||
* @return the link with the given name
|
||||
*/
|
||||
public MatchedElement<LinkTag> linkByName(String name) {
|
||||
return single(linksByName(Pattern.compile(Pattern.quote(name))));
|
||||
}
|
||||
|
||||
/*
|
||||
* ************************************************************************
|
||||
* ***** IMAGES
|
||||
* ************************************************************************
|
||||
*/
|
||||
/**
|
||||
* An {@link TagMatcher} that returns the image source url
|
||||
*/
|
||||
private static final TagMatcher<ImageTag> IMAGE_TAG_MATCHER = new TagMatcher<ImageTag>() {
|
||||
@Override
|
||||
public String content(ImageTag tag) {
|
||||
return tag.getImageURL();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @return the list of all images at the page
|
||||
*/
|
||||
public List<PageElement<ImageTag>> images() {
|
||||
return filter(new DefaultListProcessor<ImageTag>(), new TypeTagFilter(
|
||||
ImageTag.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the image url pattern
|
||||
* @return the list of images matching the url pattern
|
||||
*/
|
||||
public List<MatchedElement<ImageTag>> images(Pattern pattern) {
|
||||
return match(images(), pattern, IMAGE_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the image url pattern
|
||||
* @return the first image whose url matches the pattern
|
||||
*/
|
||||
public MatchedElement<ImageTag> image(Pattern pattern) {
|
||||
return single(images(pattern));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the pattern id
|
||||
* @return the list of images that match the given id
|
||||
*/
|
||||
public List<MatchedElement<ImageTag>> imagesByID(Pattern pattern) {
|
||||
return match(images(), pattern, new IDTagMatcher<ImageTag>(),
|
||||
IMAGE_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param id
|
||||
* the image ID
|
||||
* @return the image that matches with the given id
|
||||
*/
|
||||
public MatchedElement<ImageTag> imageByID(String id) {
|
||||
return single(imagesByID(Pattern.compile(Pattern.quote(id))));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the image name pattern
|
||||
* @return the list of images whose names match the pattern
|
||||
*/
|
||||
public List<MatchedElement<ImageTag>> imagesByName(Pattern pattern) {
|
||||
return match(images(), pattern, new NameTagMatcher<ImageTag>(),
|
||||
IMAGE_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* the image name
|
||||
* @return the image whose name matches the given
|
||||
*/
|
||||
public MatchedElement<ImageTag> imageByName(String name) {
|
||||
return single(imagesByName(Pattern.compile(Pattern.quote(name))));
|
||||
}
|
||||
|
||||
/*
|
||||
* ************************************************************************
|
||||
* ***** FORM
|
||||
* ************************************************************************
|
||||
*/
|
||||
/**
|
||||
* An {@link TagMatcher} that returns the form action (or submit) url
|
||||
*/
|
||||
private static final TagMatcher<FormTag> FORM_TAG_MATCHER = new TagMatcher<FormTag>() {
|
||||
@Override
|
||||
public String content(FormTag tag) {
|
||||
return tag.getFormLocation();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @return the list of all forms on the page
|
||||
*/
|
||||
public List<PageElement<FormTag>> forms() {
|
||||
return filter(new DefaultListProcessor<FormTag>(), new TypeTagFilter(
|
||||
FormTag.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the action url pattern
|
||||
* @return the forms whose urls matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<FormTag>> forms(Pattern pattern) {
|
||||
return match(forms(), pattern, FORM_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the action url pattern
|
||||
* @return the first form whose action url matches the pattern
|
||||
*/
|
||||
public MatchedElement<FormTag> form(Pattern pattern) {
|
||||
return single(forms(pattern));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the form id pattern
|
||||
* @return the forms whose ids matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<FormTag>> formsByID(Pattern pattern) {
|
||||
return match(forms(), pattern, new IDTagMatcher<FormTag>(),
|
||||
FORM_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param id
|
||||
* the form id
|
||||
* @return the form whose id matches the given
|
||||
*/
|
||||
public MatchedElement<FormTag> formByID(String id) {
|
||||
return single(formsByID(Pattern.compile(Pattern.quote(id))));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the form name pattern
|
||||
* @return the forms whose names matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<FormTag>> formsByName(Pattern pattern) {
|
||||
return match(forms(), pattern, new NameTagMatcher<FormTag>(),
|
||||
FORM_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* the form name
|
||||
* @return the form whose name matches the given
|
||||
*/
|
||||
public MatchedElement<FormTag> formByName(String name) {
|
||||
return single(formsByName(Pattern.compile(Pattern.quote(name))));
|
||||
}
|
||||
|
||||
/*
|
||||
* ************************************************************************
|
||||
* ***** INPUT
|
||||
* ************************************************************************
|
||||
*/
|
||||
/**
|
||||
* An {@link TagMatcher} that returns the input value
|
||||
*/
|
||||
private static final TagMatcher<InputTag> INPUT_TAG_MATCHER = new TagMatcher<InputTag>() {
|
||||
@Override
|
||||
public String content(InputTag tag) {
|
||||
return tag.getAttribute("value");
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @return the list of all inputs on the page
|
||||
*/
|
||||
public List<PageElement<InputTag>> inputs() {
|
||||
return filter(new DefaultListProcessor<InputTag>(), new TypeTagFilter(
|
||||
InputTag.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the input value pattern
|
||||
* @return the inputs whose values matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<InputTag>> inputs(Pattern pattern) {
|
||||
return find(inputs(), pattern, INPUT_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the action url pattern
|
||||
* @return the first input whose value matches the pattern
|
||||
*/
|
||||
public MatchedElement<InputTag> input(Pattern pattern) {
|
||||
return single(inputs(pattern));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the input id pattern
|
||||
* @return the inputs whose ids matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<InputTag>> inputsByID(Pattern pattern) {
|
||||
return match(inputs(), pattern, new IDTagMatcher<InputTag>(),
|
||||
INPUT_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* the input id
|
||||
* @return the input whose id matches the given
|
||||
*/
|
||||
public MatchedElement<InputTag> inputByID(String id) {
|
||||
return single(inputsByID(Pattern.compile(Pattern.quote(id))));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the input name pattern
|
||||
* @return the inputs whose name matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<InputTag>> inputsByName(Pattern pattern) {
|
||||
return match(inputs(), pattern, new NameTagMatcher<InputTag>(),
|
||||
INPUT_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* the input name
|
||||
* @return the input whose name matches the given
|
||||
*/
|
||||
public MatchedElement<InputTag> inputByName(String name) {
|
||||
return single(inputsByName(Pattern.compile(Pattern.quote(name))));
|
||||
}
|
||||
|
||||
/*
|
||||
* ************************************************************************
|
||||
* ***** TEXTAREA
|
||||
* ************************************************************************
|
||||
*/
|
||||
/**
|
||||
* An {@link TagMatcher} that returns the textarea value
|
||||
*/
|
||||
private static final TagMatcher<TextareaTag> TEXTAREA_TAG_MATCHER = new TagMatcher<TextareaTag>() {
|
||||
@Override
|
||||
public String content(TextareaTag tag) {
|
||||
return tag.getStringText();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @return the list of all textareas on the page
|
||||
*/
|
||||
public List<PageElement<TextareaTag>> textareas() {
|
||||
return filter(new DefaultListProcessor<TextareaTag>(),
|
||||
new TypeTagFilter(TextareaTag.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the textarea value pattern
|
||||
* @return the textareas whose values matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<TextareaTag>> textareas(Pattern pattern) {
|
||||
return match(textareas(), pattern, TEXTAREA_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the textarea value pattern
|
||||
* @return the first textarea whose value matches the pattern
|
||||
*/
|
||||
public MatchedElement<TextareaTag> textarea(Pattern pattern) {
|
||||
return single(textareas(pattern));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the textarea id pattern
|
||||
* @return the textareas whose ids matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<TextareaTag>> textareasByID(Pattern pattern) {
|
||||
return match(textareas(), pattern, new IDTagMatcher<TextareaTag>(),
|
||||
TEXTAREA_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* the textarea id
|
||||
* @return the textarea whose id matches the given
|
||||
*/
|
||||
public MatchedElement<TextareaTag> textareaByID(String id) {
|
||||
return single(textareasByID(Pattern.compile(Pattern.quote(id))));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the textarea name pattern
|
||||
* @return the textareas whose name matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<TextareaTag>> textareasByName(Pattern pattern) {
|
||||
return match(textareas(), pattern, new NameTagMatcher<TextareaTag>(),
|
||||
TEXTAREA_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* the textarea name
|
||||
* @return the textarea whose name matches the given
|
||||
*/
|
||||
public MatchedElement<TextareaTag> textareaByName(String name) {
|
||||
return single(textareasByName(Pattern.compile(Pattern.quote(name))));
|
||||
}
|
||||
|
||||
/*
|
||||
* ************************************************************************
|
||||
* ***** JAVASCRIPT
|
||||
* ************************************************************************
|
||||
*/
|
||||
/**
|
||||
* An {@link TagMatcher} that returns the script code
|
||||
*/
|
||||
public List<PageElement<ScriptTag>> scripts() {
|
||||
return filter(new DefaultListProcessor<ScriptTag>(), new TypeTagFilter(
|
||||
ScriptTag.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the list of all scripts on the page
|
||||
*/
|
||||
public List<MatchedElement<ScriptTag>> scripts(Pattern pattern) {
|
||||
return find(scripts(), pattern, new TagMatcher<ScriptTag>() {
|
||||
@Override
|
||||
public String content(ScriptTag tag) {
|
||||
return tag.getScriptCode();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the script code pattern
|
||||
* @return the first script whose code matches the pattern
|
||||
*/
|
||||
public MatchedElement<ScriptTag> script(Pattern pattern) {
|
||||
return single(scripts(pattern));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the script url pattern
|
||||
* @return the scripts whose urls matches the pattern
|
||||
*/
|
||||
public MatchedElement<ScriptTag> scriptBySource(Pattern pattern) {
|
||||
return single(match(scripts(), pattern, new TagMatcher<ScriptTag>() {
|
||||
@Override
|
||||
public String content(ScriptTag tag) {
|
||||
return tag.getAttribute("src");
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
/*
|
||||
* ************************************************************************
|
||||
* ***** FRAME
|
||||
* ************************************************************************
|
||||
*/
|
||||
/**
|
||||
* An {@link TagMatcher} that returns the frame url
|
||||
*/
|
||||
private static final TagMatcher<FrameTag> FRAME_TAG_MATCHER = new TagMatcher<FrameTag>() {
|
||||
@Override
|
||||
public String content(FrameTag tag) {
|
||||
return tag.getFrameLocation();
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @return the list of all frames on the page
|
||||
*/
|
||||
public List<PageElement<FrameTag>> frames() {
|
||||
return filter(new DefaultListProcessor<FrameTag>(), new TypeTagFilter(
|
||||
FrameTag.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the frame url pattern
|
||||
* @return the frames whose urls matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<FrameTag>> frames(Pattern pattern) {
|
||||
return match(frames(), pattern, FRAME_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the frame url pattern
|
||||
* @return the first frame whose url matches the pattern
|
||||
*/
|
||||
public MatchedElement<FrameTag> frame(Pattern pattern) {
|
||||
return single(frames(pattern));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the frame id pattern
|
||||
* @return the frames whose id matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<FrameTag>> framesByID(Pattern pattern) {
|
||||
return match(frames(), pattern, new IDTagMatcher<FrameTag>(),
|
||||
FRAME_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* the frame id
|
||||
* @return the frame whose id matches the given
|
||||
*/
|
||||
public MatchedElement<FrameTag> frameByID(String id) {
|
||||
return single(framesByID(Pattern.compile(Pattern.quote(id))));
|
||||
}
|
||||
|
||||
/**
|
||||
* @param pattern
|
||||
* the frame name pattern
|
||||
* @return the frames whose name matches the pattern
|
||||
*/
|
||||
public List<MatchedElement<FrameTag>> framesByName(Pattern pattern) {
|
||||
return match(frames(), pattern, new NameTagMatcher<FrameTag>(),
|
||||
FRAME_TAG_MATCHER);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* the frame name
|
||||
* @return the frame whose name matches the given
|
||||
*/
|
||||
public MatchedElement<FrameTag> frameByName(String name) {
|
||||
return single(framesByName(Pattern.compile(Pattern.quote(name))));
|
||||
}
|
||||
|
||||
/*
|
||||
* ************************************************************************
|
||||
* ***** INITIALIZERS
|
||||
* ************************************************************************
|
||||
*/
|
||||
/**
|
||||
* Creates a new page parsing the HTML input
|
||||
*
|
||||
* @param html
|
||||
* the html code
|
||||
* @return the newly created {@link Page} object
|
||||
*/
|
||||
public static Page parse(String html) {
|
||||
try {
|
||||
return new Page(Parser.createParser(html, null));
|
||||
} catch (ParserException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return nodes.toHtml(false);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.html;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Tag;
|
||||
|
||||
/**
|
||||
* An element that represents an tag on the page
|
||||
*
|
||||
* @author <a href="http://www.rogiel.com">Rogiel</a>
|
||||
*/
|
||||
public class PageElement<T extends Tag> {
|
||||
/**
|
||||
* The tag represented by this element
|
||||
*/
|
||||
protected final T tag;
|
||||
|
||||
/**
|
||||
* Creates a new instance
|
||||
*
|
||||
* @param tag
|
||||
* the tag
|
||||
*/
|
||||
public PageElement(T tag) {
|
||||
this.tag = tag;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to match the element with a given pattern
|
||||
*
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @return the matched element
|
||||
*/
|
||||
public MatchedElement<T> match(Pattern pattern) {
|
||||
return match(pattern, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to match the element with a given pattern using an alternative
|
||||
* {@link TagMatcher}
|
||||
*
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @param tagMatcher
|
||||
* the tag matcher
|
||||
* @return the matched element
|
||||
*/
|
||||
public MatchedElement<T> match(Pattern pattern, TagMatcher<T> tagMatcher) {
|
||||
if (tagMatcher == null) {
|
||||
tagMatcher = new TagMatcher<T>() {
|
||||
@Override
|
||||
public String content(T tag) {
|
||||
return tag.toHtml();
|
||||
}
|
||||
};
|
||||
}
|
||||
final String content = tagMatcher.content(tag);
|
||||
if (content == null)
|
||||
return null;
|
||||
return new MatchedElement<T>(tag, pattern, tagMatcher.content(tag));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to match the element with itself (return a {@link MatchedElement}
|
||||
* that always matched it self)
|
||||
*
|
||||
* @param tagMatcher
|
||||
* the tag matcher
|
||||
* @return always an {@link MatchedElement} whose group 0 matches it self
|
||||
*/
|
||||
public MatchedElement<T> match(TagMatcher<T> tagMatcher) {
|
||||
if (tagMatcher == null) {
|
||||
tagMatcher = new TagMatcher<T>() {
|
||||
@Override
|
||||
public String content(T tag) {
|
||||
return tag.toHtml();
|
||||
}
|
||||
};
|
||||
}
|
||||
final String content = tagMatcher.content(tag);
|
||||
if (content == null)
|
||||
return null;
|
||||
return new MatchedElement<T>(tag, tagMatcher.content(tag));
|
||||
}
|
||||
|
||||
/**
|
||||
* An tag matcher is an helper class that can return an value that the
|
||||
* matcher should use to test the pattern against it.
|
||||
*
|
||||
* @author <a href="http://www.rogiel.com">Rogiel</a>
|
||||
*
|
||||
* @param <T>
|
||||
* the tag type
|
||||
*/
|
||||
public interface TagMatcher<T extends Tag> {
|
||||
String content(T tag);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the tag object
|
||||
*/
|
||||
public T tag() {
|
||||
return tag;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "PageElement [tag=" + tag + "]";
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,137 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.html;
|
||||
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
 * Represents a search done against a page string
 *
 * @author <a href="http://www.rogiel.com">Rogiel</a>
 */
public class SearchResults {
	/**
	 * The matcher
	 */
	private final Matcher matcher;

	/**
	 * Creates a new instance around an existing matcher. The matcher is used
	 * as given: no search is started here.
	 *
	 * @param matcher
	 *            the matcher
	 */
	public SearchResults(Matcher matcher) {
		this.matcher = matcher;
	}

	/**
	 * Creates a new instance and immediately searches for the first
	 * occurrence of the pattern.
	 *
	 * @param pattern
	 *            the pattern
	 * @param content
	 *            the content
	 */
	public SearchResults(Pattern pattern, String content) {
		this.matcher = pattern.matcher(content);
		this.matcher.find();
	}

	/**
	 * Resets the matcher and searches again from the start of the content.
	 *
	 * @return <code>true</code> if the matcher has found any results
	 */
	public boolean hasResults() {
		matcher.reset();
		return matcher.find();
	}

	/**
	 * @param n
	 *            the group number
	 * @return <code>true</code> if the pattern declares a group with this
	 *         number (group 0 is the entire match)
	 */
	public boolean hasGroup(int n) {
		// Negative numbers are never valid groups: Matcher.group rejects them
		return n >= 0 && n <= matcher.groupCount();
	}

	/**
	 * @return the entire matched value as a string
	 * @throws IllegalStateException
	 *             if the matcher currently holds no successful match
	 */
	public String asString() {
		return asString(0);
	}

	/**
	 * @param n
	 *            the group number
	 * @return the group value as a string
	 * @throws IllegalStateException
	 *             if the matcher currently holds no successful match
	 * @throws IndexOutOfBoundsException
	 *             if the pattern has no group with the given number
	 */
	public String asString(int n) {
		return matcher.group(n);
	}

	/**
	 * @return the entire matched value as an integer
	 * @throws NumberFormatException
	 *             if the value is not a parsable integer
	 */
	public int asInteger() {
		return asInteger(0);
	}

	/**
	 * @param n
	 *            the group number
	 * @return the group value as an integer
	 * @throws NumberFormatException
	 *             if the value is not a parsable integer
	 */
	public int asInteger(int n) {
		return Integer.parseInt(asString(n));
	}

	/**
	 * @return the entire matched value as a long
	 * @throws NumberFormatException
	 *             if the value is not a parsable long
	 */
	public long asLong() {
		return asLong(0);
	}

	/**
	 * @param n
	 *            the group number
	 * @return the group value as a long
	 * @throws NumberFormatException
	 *             if the value is not a parsable long
	 */
	public long asLong(int n) {
		return Long.parseLong(asString(n));
	}

	/**
	 * @return the entire matched value as a double
	 * @throws NumberFormatException
	 *             if the value is not a parsable double
	 */
	public double asDouble() {
		return asDouble(0);
	}

	/**
	 * @param n
	 *            the group number
	 * @return the group value as a double
	 * @throws NumberFormatException
	 *             if the value is not a parsable double
	 */
	public double asDouble(int n) {
		return Double.parseDouble(asString(n));
	}

	/**
	 * @return the pattern matched against the element
	 */
	public Pattern getPattern() {
		return matcher.pattern();
	}
}
|
||||
@@ -16,29 +16,36 @@
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
package com.rogiel.httpchannel.util.html.filter;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.Tag;
|
||||
|
||||
public class IDFilter implements NodeFilter {
|
||||
/**
|
||||
* An filter that selects all tags matching an given type
|
||||
*
|
||||
* @author <a href="http://www.rogiel.com">Rogiel</a>
|
||||
*/
|
||||
public class TypeTagFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final String id;
|
||||
/**
|
||||
* The tag type
|
||||
*/
|
||||
private final Class<? extends Tag> type;
|
||||
|
||||
public IDFilter(String id) {
|
||||
this.id = id;
|
||||
/**
|
||||
* Creates a new instance
|
||||
*
|
||||
* @param type
|
||||
* the tag type
|
||||
*/
|
||||
public TypeTagFilter(Class<? extends Tag> type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof Tag))
|
||||
return false;
|
||||
final Tag tag = (Tag) node;
|
||||
if (tag.getAttribute("id") == null)
|
||||
return false;
|
||||
if (!tag.getAttribute("id").equals(id))
|
||||
return false;
|
||||
return true;
|
||||
return type.isAssignableFrom(node.getClass());
|
||||
}
|
||||
}
|
||||
@@ -1,38 +1,36 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
|
||||
public class ContainsFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final Pattern content;
|
||||
|
||||
public ContainsFilter(Pattern content) {
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
return content.matcher(node.getText()).find();
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.html.matcher;
|
||||
|
||||
import org.htmlparser.Tag;
|
||||
|
||||
import com.rogiel.httpchannel.util.html.PageElement.TagMatcher;
|
||||
|
||||
/**
|
||||
* An {@link TagMatcher} that always returns the tag ID
|
||||
*
|
||||
* @author <a href="http://www.rogiel.com">Rogiel</a>
|
||||
*/
|
||||
public class IDTagMatcher<T extends Tag> implements TagMatcher<T> {
|
||||
@Override
|
||||
public String content(T tag) {
|
||||
return tag.getAttribute("id");
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,38 +1,36 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
|
||||
public class ContainsInLowerCaseFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final Pattern content;
|
||||
|
||||
public ContainsInLowerCaseFilter(Pattern content) {
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
return content.matcher(node.getText().toLowerCase()).find();
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.html.matcher;
|
||||
|
||||
import org.htmlparser.Tag;
|
||||
|
||||
import com.rogiel.httpchannel.util.html.PageElement.TagMatcher;
|
||||
|
||||
/**
|
||||
* An {@link TagMatcher} that always returns the tag name
|
||||
*
|
||||
* @author <a href="http://www.rogiel.com">Rogiel</a>
|
||||
*/
|
||||
public class NameTagMatcher<T extends Tag> implements TagMatcher<T> {
|
||||
@Override
|
||||
public String content(T tag) {
|
||||
return tag.getAttribute("name");
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1,42 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.tags.FormTag;
|
||||
|
||||
public class FormActionPatternFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final Pattern pattern;
|
||||
|
||||
public FormActionPatternFilter(Pattern pattern) {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof FormTag))
|
||||
return false;
|
||||
final FormTag form = (FormTag) node;
|
||||
return pattern.matcher(form.getFormLocation()).matches();
|
||||
}
|
||||
}
|
||||
@@ -1,44 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.nodes.TagNode;
|
||||
|
||||
public class FramePatternFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final Pattern pattern;
|
||||
|
||||
public FramePatternFilter(Pattern pattern) {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof TagNode))
|
||||
return false;
|
||||
final TagNode frame = (TagNode) node;
|
||||
if (frame.getAttribute("src") == null)
|
||||
return false;
|
||||
return pattern.matcher(frame.getAttribute("src")).matches();
|
||||
}
|
||||
}
|
||||
@@ -1,304 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.Parser;
|
||||
import org.htmlparser.Tag;
|
||||
import org.htmlparser.filters.AndFilter;
|
||||
import org.htmlparser.nodes.TagNode;
|
||||
import org.htmlparser.tags.FormTag;
|
||||
import org.htmlparser.tags.ImageTag;
|
||||
import org.htmlparser.tags.InputTag;
|
||||
import org.htmlparser.tags.LinkTag;
|
||||
import org.htmlparser.tags.ScriptTag;
|
||||
import org.htmlparser.tags.TextareaTag;
|
||||
import org.htmlparser.util.NodeIterator;
|
||||
import org.htmlparser.util.NodeList;
|
||||
import org.htmlparser.util.ParserException;
|
||||
|
||||
/**
|
||||
* @author <a href="http://www.rogiel.com">Rogiel</a>
|
||||
*/
|
||||
public class HTMLPage {
|
||||
private final NodeList nodes;
|
||||
|
||||
private HTMLPage(Parser parser) throws ParserException {
|
||||
this.nodes = parser.parse(null);
|
||||
}
|
||||
|
||||
private <T extends Node> List<T> filter(final Class<T> nodeType,
|
||||
NodeFilter... filters) {
|
||||
final NodeFilter filter;
|
||||
if (filters.length == 1)
|
||||
filter = filters[0];
|
||||
else
|
||||
filter = new AndFilter(filters);
|
||||
try {
|
||||
return list(nodes.extractAllNodesThatMatch(filter, true));
|
||||
} catch (ParserException e) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private <T extends Node> List<T> list(final NodeList list)
|
||||
throws ParserException {
|
||||
final List<T> filtered = new ArrayList<>();
|
||||
final NodeIterator iterator = list.elements();
|
||||
while (iterator.hasMoreNodes()) {
|
||||
filtered.add((T) iterator.nextNode());
|
||||
}
|
||||
return filtered;
|
||||
}
|
||||
|
||||
public boolean containsPlain(Pattern pattern) {
|
||||
return pattern.matcher(asString()).find();
|
||||
}
|
||||
|
||||
public boolean contains(final Pattern pattern) {
|
||||
return !filter(Node.class, new ContainsFilter(pattern)).isEmpty();
|
||||
}
|
||||
|
||||
public boolean contains(final String text) {
|
||||
return contains(Pattern.compile(Pattern.quote(text)));
|
||||
}
|
||||
|
||||
public boolean containsIgnoreCase(final String text) {
|
||||
return !filter(
|
||||
Node.class,
|
||||
new ContainsInLowerCaseFilter(Pattern.compile(Pattern
|
||||
.quote(text.toLowerCase())))).isEmpty();
|
||||
}
|
||||
|
||||
public String findPlain(final Pattern pattern, int n) {
|
||||
final Matcher matcher = pattern.matcher(asString());
|
||||
if (matcher.find())
|
||||
return matcher.group(n);
|
||||
return null;
|
||||
}
|
||||
|
||||
public int findIntPlain(final Pattern pattern, int n) {
|
||||
return Integer.parseInt(findPlain(pattern, n));
|
||||
}
|
||||
|
||||
public double findDoublePlain(final Pattern pattern, int n) {
|
||||
return Double.parseDouble(findPlain(pattern, n));
|
||||
}
|
||||
|
||||
public String find(final Pattern pattern, int n) {
|
||||
for (final Node tag : filter(Tag.class, new ContainsFilter(pattern))) {
|
||||
final Matcher matcher = pattern.matcher(tag.getText());
|
||||
if (matcher.find())
|
||||
return matcher.group(n);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public int findAsInt(final Pattern pattern, int n) {
|
||||
String found = find(pattern, n);
|
||||
if (found == null)
|
||||
return 0;
|
||||
return Integer.parseInt(findScript(pattern, n));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to find a link that has an URI following the given pattern
|
||||
*
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @return the link content, if found. <code>null</code> otherwise
|
||||
*/
|
||||
public String findLink(final Pattern pattern) {
|
||||
for (final LinkTag tag : filter(LinkTag.class, new LinkPatternFilter(
|
||||
pattern))) {
|
||||
return tag.getLink();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to find a frame that has an URI following the given pattern
|
||||
*
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @return the iframe uri, if found. <code>null</code> otherwise
|
||||
*/
|
||||
public String findFrame(final Pattern pattern) {
|
||||
for (final TagNode tag : filter(TagNode.class, new FramePatternFilter(
|
||||
pattern))) {
|
||||
return tag.getAttribute("src");
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to find a image that has an URI following the given pattern
|
||||
*
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @return the iframe uri, if found. <code>null</code> otherwise
|
||||
*/
|
||||
public String findImage(final Pattern pattern) {
|
||||
for (final ImageTag tag : filter(ImageTag.class,
|
||||
new ImagePatternFilter(pattern))) {
|
||||
return tag.getImageURL();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Tries to find a form which has an location that respects the given
|
||||
* pattern
|
||||
*
|
||||
* @param pattern
|
||||
* the pattern
|
||||
* @return the URI found, if any. <code>null</code> otherwise
|
||||
*/
|
||||
public String findFormAction(final Pattern pattern) {
|
||||
for (final FormTag tag : filter(FormTag.class,
|
||||
new FormActionPatternFilter(pattern))) {
|
||||
return tag.getFormLocation();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private String inputValue(List<InputTag> tags) {
|
||||
for (final InputTag tag : tags) {
|
||||
return tag.getAttribute("value");
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public String getInputValue(final String inputName) {
|
||||
return inputValue(filter(InputTag.class, new InputNameFilter(inputName)));
|
||||
}
|
||||
|
||||
public int getInputValueAsInt(final String inputName) {
|
||||
return Integer.parseInt(getInputValue(inputName));
|
||||
}
|
||||
|
||||
public String getInputValueById(final String id) {
|
||||
return inputValue(filter(InputTag.class, new InputIDFilter(id)));
|
||||
}
|
||||
|
||||
public int getInputValueByIdInt(final String id) {
|
||||
return Integer.parseInt(inputValue(filter(InputTag.class,
|
||||
new InputIDFilter(id))));
|
||||
}
|
||||
|
||||
public String getInputValue(final Pattern pattern) {
|
||||
return inputValue(filter(InputTag.class, new InputValuePatternFilter(
|
||||
pattern)));
|
||||
}
|
||||
|
||||
public String getTextareaValueById(String id) {
|
||||
return ((TextareaTag) getTagByID(id)).getStringText();
|
||||
}
|
||||
|
||||
public String getTextareaValueByName(String name) {
|
||||
return ((TextareaTag) getTagByName(name)).getStringText();
|
||||
}
|
||||
|
||||
public Tag getTagByID(final String id) {
|
||||
for (final Tag tag : filter(Tag.class, new IDFilter(id))) {
|
||||
return tag;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public Tag getTagByName(final String name) {
|
||||
for (final Tag tag : filter(Tag.class, new NameFilter(name))) {
|
||||
return tag;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public String findScript(final Pattern pattern, int n) {
|
||||
for (final ScriptTag tag : filter(ScriptTag.class,
|
||||
new ScriptContainsFilter(pattern))) {
|
||||
final Matcher matcher = pattern.matcher(tag.getScriptCode());
|
||||
if (matcher.find())
|
||||
return matcher.group(n);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public String findScriptSrc(final Pattern pattern) {
|
||||
for (final ScriptTag tag : filter(ScriptTag.class, new ScriptSrcFilter(
|
||||
pattern))) {
|
||||
final Matcher matcher = pattern.matcher(tag.getAttribute("src"));
|
||||
if (matcher.matches())
|
||||
return matcher.group();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public int findScriptAsInt(final Pattern pattern, int n) {
|
||||
String found = findScript(pattern, n);
|
||||
if (found == null)
|
||||
return 0;
|
||||
return Integer.parseInt(found);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
// try {
|
||||
// return parser.parse(null).toHtml(false);
|
||||
// } catch (ParserException e1) {
|
||||
// return null;
|
||||
// }
|
||||
return nodes.toHtml(false);
|
||||
}
|
||||
|
||||
public static HTMLPage parse(String html) {
|
||||
try {
|
||||
return new HTMLPage(Parser.createParser(html, null));
|
||||
} catch (ParserException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public String asString() {
|
||||
StringBuffer buff = new StringBuffer();
|
||||
for (int i = 0; i < nodes.size(); i++) {
|
||||
// final String content = nodes.elementAt(i).toPlainTextString()
|
||||
// .replaceAll("\n", "").replaceAll("\\t", "").trim();
|
||||
// if (content.length() > 0) {
|
||||
// buff.append(" ").append(content);
|
||||
// }
|
||||
final String[] lines = nodes.elementAt(i).toPlainTextString()
|
||||
.split("\n");
|
||||
for (final String line : lines) {
|
||||
final String processed = line.trim();
|
||||
if (processed.length() > 0) {
|
||||
buff.append(line.trim()).append(" ");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return buff.toString();
|
||||
}
|
||||
}
|
||||
@@ -1,42 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.tags.ImageTag;
|
||||
|
||||
public class ImagePatternFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final Pattern pattern;
|
||||
|
||||
public ImagePatternFilter(Pattern pattern) {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof ImageTag))
|
||||
return false;
|
||||
final ImageTag frame = (ImageTag) node;
|
||||
return pattern.matcher(frame.getImageURL()).matches();
|
||||
}
|
||||
}
|
||||
@@ -1,44 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.tags.InputTag;
|
||||
|
||||
public class InputIDFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final String id;
|
||||
|
||||
public InputIDFilter(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof InputTag))
|
||||
return false;
|
||||
final InputTag input = (InputTag) node;
|
||||
if (input.getAttribute("id") == null)
|
||||
return false;
|
||||
if (!input.getAttribute("id").equals(id))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -1,44 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.tags.InputTag;
|
||||
|
||||
public class InputNameFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final String name;
|
||||
|
||||
public InputNameFilter(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof InputTag))
|
||||
return false;
|
||||
final InputTag input = (InputTag) node;
|
||||
if (input.getAttribute("name") == null)
|
||||
return false;
|
||||
if (!input.getAttribute("name").equals(name))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -1,46 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.tags.InputTag;
|
||||
|
||||
public class InputValuePatternFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final Pattern pattern;
|
||||
|
||||
public InputValuePatternFilter(Pattern pattern) {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof InputTag))
|
||||
return false;
|
||||
final InputTag input = (InputTag) node;
|
||||
if (input.getAttribute("value") == null)
|
||||
return false;
|
||||
if (!pattern.matcher(input.getAttribute("value")).matches())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -1,42 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.tags.LinkTag;
|
||||
|
||||
public class LinkPatternFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final Pattern pattern;
|
||||
|
||||
public LinkPatternFilter(Pattern pattern) {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof LinkTag))
|
||||
return false;
|
||||
final LinkTag link = (LinkTag) node;
|
||||
return pattern.matcher(link.getLink()).matches();
|
||||
}
|
||||
}
|
||||
@@ -1,44 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.Tag;
|
||||
|
||||
public class NameFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final String name;
|
||||
|
||||
public NameFilter(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof Tag))
|
||||
return false;
|
||||
final Tag tag = (Tag) node;
|
||||
if (tag.getAttribute("name") == null)
|
||||
return false;
|
||||
if (!tag.getAttribute("name").equals(name))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -1,42 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.tags.ScriptTag;
|
||||
|
||||
public class ScriptContainsFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final Pattern pattern;
|
||||
|
||||
public ScriptContainsFilter(Pattern pattern) {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof ScriptTag))
|
||||
return false;
|
||||
final ScriptTag script = (ScriptTag) node;
|
||||
return pattern.matcher(script.getScriptCode()).find();
|
||||
}
|
||||
}
|
||||
@@ -1,44 +0,0 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package com.rogiel.httpchannel.util.htmlparser;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlparser.Node;
|
||||
import org.htmlparser.NodeFilter;
|
||||
import org.htmlparser.tags.ScriptTag;
|
||||
|
||||
public class ScriptSrcFilter implements NodeFilter {
|
||||
private static final long serialVersionUID = 1L;
|
||||
private final Pattern pattern;
|
||||
|
||||
public ScriptSrcFilter(Pattern pattern) {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean accept(Node node) {
|
||||
if (!(node instanceof ScriptTag))
|
||||
return false;
|
||||
final ScriptTag script = (ScriptTag) node;
|
||||
if (script.getAttribute("src") == null)
|
||||
return false;
|
||||
return pattern.matcher(script.getAttribute("src")).matches();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user