|
Today, on a whim, I wanted to build a small word-lookup tool. I went to the official Youdao Dictionary website and saw that the word being queried is simply embedded in the address of the result page, and that the returned page contains the interpretation of the word we need. So the only technical knowledge required is:
Regular Expressions
All we have to do is extract the interpretation of the word from the source of the fetched web page, so this post only covers how to use regular expressions to extract that interpretation.
Analyzing the page source, we can see that the interpretation of the word sits inside a div tag.
The first goal is to capture this part of the page; the regular expression can be written as:
(?s)<div class=\"trans-container\">.*?<ul>.*?</div>
// (?s) lets '.' match newline characters as well; by default it does not
// .*? matches any number of characters in non-greedy mode. Once this section has been captured, what we need next is the interpretation of the word inside it, so we can write:
(?m)<li>(.*?)</li>
// (?m) turns on multiline mode, so matching is treated line by line; by default the input is not split into lines and is matched as one unit
// .*? is wrapped in parentheses here so that the meaning of the word can be captured directly, leaving out the surrounding tags. The concrete code follows:
1. Java code:
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Test {
public static void main (String [] args) throws IOException {
CloseableHttpClient httpClient = HttpClients.createDefault ();
System.out.print ( "Please enter the word you want to check:");
Scanner s = new Scanner (System.in);
String word = s.nextLine ();
word = word.replaceAll ( "", "+");
// Find the address lookup words based on configuration
HttpGet getWordMean = new HttpGet ( "http://dict.youdao.com/search?q=" + word + "& keyfrom = dict.index");
CloseableHttpResponse response = httpClient.execute (getWordMean); // Get reverse page source
String result = EntityUtils.toString (response.getEntity ());
response.close ();
// Note (? S), which means let '' matches a newline does not match the default
Pattern searchMeanPattern = Pattern.compile ( "? (S) < div class = \" trans-container \ "> * < ul> * < / div>.?.?");
Matcher m1 = searchMeanPattern.matcher (result); // m1 is the translation of the entire Gets the < div>
if (m1.find ()) {
String means = m1.group (); // all the explanations, including the page tags
Pattern getChinese = Pattern.compile ( "(m) < li> (*) < / li>?.?"); // (? M) on behalf of row match
Matcher m2 = getChinese.matcher (means);
System.out.println ( "Interpretation:");
while (m2.find ()) {
// In Java (. *?) Is Group 1, so with group (1)
System.out.println ( "\ t" + m2.group (1));
}
} Else {
System.out.println ( "not find the interpretation.");
System.exit (0);
}
}
}
2. Python code:
#! / Usr / bin / python
#coding: utf-8
import urllib
import sys
import re
if len (sys.argv) == 1: # there is no word on Usage Tips
print "Usage: ./ Dict.py want to find the word"
sys.exit ()
word = ""
for x in range (len (sys.argv) - 1): # find may be the phrase, with a space, such as "join in", the word here splicing
word + = "" + sys.argv [x + 1]
print "word:" + word
searchUrl = "http://dict.youdao.com/search?q=" + word + "& keyfrom = dict.index" # Find Address
response = urllib.urlopen (searchUrl) .read () # get the page to find the source code
# Source code from a Web page to extract the word interpretation of that part of the
searchSuccess = re.search (r "(? s) < div class = \" trans-container \ ">. *? < ul>. *? < / div>", response)
if searchSuccess:
# Get the word we want to extract the core of the interpretation in the case of only one packet, findall returns a list of the sub-group of strings
means = re.findall (r "(? m) < li> (. *?) < / li>", searchSuccess.group ())
print "Interpretation:"
for mean in means:
print "\ t" + mean # output interpretation
else: |
|
|
|