// Listing metadata left over from the repository viewer: 225 lines, 5.0 KiB, JavaScript.
const fs = require('fs')
const path = require('path')

const cheerio = require('cheerio')
const iconv = require('iconv-lite')
const md5 = require('md5')
const request = require('request-promise')
const retry = require('async-retry')

const special = require('./special')

/**
 * Pause for a given number of milliseconds.
 *
 * Logs `sleep: <time>` immediately, then resolves once the timer fires.
 *
 * @param {number} time - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Resolves with no value after the delay.
 */
function sleep(time) {
  return new Promise(resolve => {
    console.log(`sleep: ${time}`)
    // setTimeout calls `resolve` with no arguments, so the promise resolves to undefined.
    setTimeout(resolve, time)
  })
}

/**
 * Return the text of a page, using an on-disk cache under `./temp`.
 *
 * The cache file name is the md5 of the URL. On a cache miss the page is
 * downloaded as raw bytes, decoded as gb2312, and the decoded text is stored
 * in the cache for later runs.
 *
 * @param {string} url - Page to fetch.
 * @param {boolean} [returnMeta=false] - When true, return `{ isCache, content }`
 *   instead of the bare content string.
 * @returns {Promise<string|{isCache: boolean, content: string}>}
 * @throws Rejects with the last download error once `retry` gives up.
 */
async function getPage(url, returnMeta = false) {
  const cacheDir = './temp'
  const file = `${cacheDir}/${md5(url)}.html`
  let content = ''
  let isCache = false

  if (fs.existsSync(file)) {
    content = fs.readFileSync(file, 'utf-8')
    isCache = true
  } else {
    // The callback must reject for `retry` to try again. Swallowing the error
    // here would make the first failure look like success and yield ''.
    // NOTE(review): async-retry backs off exponentially by default, so 50
    // retries starting at 5 s can wait a very long time - confirm intended.
    content = await retry(async () => {
      const rs = await request(url, {
        resolveWithFullResponse: true,
        // encoding: null keeps the body as raw bytes so iconv can decode it.
        encoding: null,
      })
      return iconv.decode(rs.body, 'gb2312')
    }, {
      retries: 50,
      minTimeout: 5000,
      onRetry: function () {
        console.log('retry')
      }
    })

    // The cache is only an optimisation: a failed write must not lose the
    // page that was just downloaded.
    try {
      fs.mkdirSync(cacheDir, { recursive: true })
      fs.writeFileSync(file, content)
    } catch (e) {
      console.error(`cache write failed for ${file}:`, e)
    }
  }

  return returnMeta ? { isCache, content } : content
}

;(async () => {
  // Crawl province -> city -> county pages and write the collected mapping.
  // `rs` maps a parent id to an object of { childId: name }; the root key is '86'.
  try {
    const baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'
    const cityIds = []
    const rs = {
      '86': {}
    }

    // Provinces: one link per province on the index page.
    const level0 = await getPage(`${baseUrl}index.html`)
    const $ = cheerio.load(level0)
    const provinceUrls = []
    $('.provincetr td a').each((index, el) => {
      const href = $(el).attr('href')
      rs['86'][href.split('.')[0] + '0000'] = $(el).text()
      provinceUrls.push(`${baseUrl}${href}`)
    })

    // Entries added by hand on top of what the index page yields.
    const specialProvinces = {
      "710000": "台湾省",
      "810000": "香港特别行政区",
      "820000": "澳门特别行政区"
    }
    Object.assign(rs['86'], specialProvinces)

    // Cities
    for (const provinceUrl of provinceUrls) {
      const $city = cheerio.load(await getPage(provinceUrl))
      // A Set keeps first-seen order while dropping the duplicate URL that
      // appears when both links of a row point at the same page.
      const countyUrls = new Set()
      $city('.citytr td a').each((index, el) => {
        const href = $city(el).attr('href')
        const name = $city(el).text()
        // Links whose text contains digits are code links; only the name
        // link is recorded.
        if (!/\d+/.test(name)) {
          const id = href.split('.')[0].slice(3) + '00'
          const parentId = id.slice(0, 2) + '0000'
          if (!rs[parentId]) {
            rs[parentId] = {}
          }
          cityIds.push(id)
          rs[parentId][id] = name
        }
        countyUrls.add(`${baseUrl}${href}`)
      })

      // Counties (or towns, for pages that list `.towntr` rows directly)
      for (const countyUrl of countyUrls) {
        const $county = cheerio.load(await getPage(countyUrl))
        const cells = $county('.countytr td, .towntr td')
        // Path segment 7 of the URL is the directory that the links on this
        // page are relative to.
        const linkPrefix = countyUrl.split('/')[7] + '/'

        for (let index = 0; index < cells.length; index++) {
          const { data } = processLinks($county, cells, index, linkPrefix)
          if (!data.id) {
            continue
          }
          console.log(data)
          // Linked cells carry the id in the href ("01/130102.html");
          // unlinked cells carry the full code text of the previous cell.
          const id = data.isLink
            ? data.id.split('.')[0].split('/')[1]
            : normalizeDivisionCode(data.id)
          const parentId = id.slice(0, 4) + '00'
          if (!rs[parentId]) {
            rs[parentId] = {}
          }
          rs[parentId][id] = data.name
        }

        // TODO: town-level pages are not crawled. The earlier sketch fetched
        // each de-duplicated town URL with getPage(town, true), slept 2000 ms
        // when the page was not cached, and parsed '.towntr td' cells with
        // processLinks.
      }
    }

    // Write once after the crawl has finished, so the process can exit.
    const output = JSON.stringify(Object.assign({}, rs, special), null, 2)
    fs.writeFileSync('../data.json', output)
    fs.writeFileSync('./data.json', output)

    // Cities that ended up with no children in `rs`.
    const _2levelCityIds = cityIds.filter(c => !Object.keys(rs[c] || {}).length)
    fs.writeFileSync('./only_2_level_city_id.json', JSON.stringify(
      _2levelCityIds, null, 2))
    console.log('done')
  } catch (e) {
    console.log(e)
  }
})()

/**
 * Shorten a division code by dropping its trailing zero padding without
 * cutting into the id itself.
 *
 * Plain trailing-zero stripping turns "130110000000" into "13011"; this pads
 * the stripped value back to the next id width (6, 9 or 12 characters).
 * Results that were already 6 or 9 characters long are unchanged.
 * Assumes codes are 12 digits grouped 6 + 3 + 3 - TODO confirm against the
 * source pages.
 *
 * @param {string} code - Code text taken from a table cell.
 * @returns {string} The code at 6, 9 or 12 characters.
 */
function normalizeDivisionCode (code) {
  const stripped = code.replace(/0+$/, '')
  const width = stripped.length <= 6 ? 6 : stripped.length <= 9 ? 9 : 12
  return stripped.padEnd(width, '0')
}

/**
 * Describe one table cell: the entry it names (if any) and the page it links to.
 *
 * A cell whose text contains digits yields an empty `data` object. Otherwise
 * `data` holds the cell text as `name` and, as `id`, either the href of the
 * cell's first link (`isLink: true`) or the text of the previous cell in
 * `list` (`isLink: false`).
 *
 * @param {Function} $ - Cheerio-style wrapper used to query elements.
 * @param {ArrayLike<*>} list - Cells to index into.
 * @param {number} index - Position of the cell to process.
 * @param {string} [linkPrefix=''] - Path inserted between the base URL and the href.
 * @returns {{data: Object, url: (string|undefined)}} `url` is undefined when
 *   the cell has no link with an href.
 */
function processLinks ($, list, index, linkPrefix = '') {
  const el = list[index]
  const text = $(el).text()
  const link = $(el).find('a')
  const href = $(link[0]).attr('href')

  const url = href
    ? `http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/${linkPrefix}${href}`
    : undefined

  // Cells with digits in their text carry no name, so they produce no entry.
  if (/\d+/.test(text)) {
    return { data: {}, url }
  }

  const data = link.length
    ? { isLink: true, id: href, name: text }
    // Unlinked name cell: the id is the text of the cell just before it.
    : { isLink: false, id: $(list[index - 1]).text(), name: text }

  return { data, url }
}